Diffstat (limited to 'fs')
-rw-r--r--  fs/Kconfig | 2
-rw-r--r--  fs/Kconfig.binfmt | 13
-rw-r--r--  fs/afs/write.c | 9
-rw-r--r--  fs/binfmt_elf.c | 178
-rw-r--r--  fs/binfmt_elf_fdpic.c | 20
-rw-r--r--  fs/binfmt_elf_test.c | 64
-rw-r--r--  fs/binfmt_flat.c | 7
-rw-r--r--  fs/btrfs/check-integrity.c | 1
-rw-r--r--  fs/btrfs/ctree.h | 12
-rw-r--r--  fs/btrfs/disk-io.c | 20
-rw-r--r--  fs/btrfs/extent-tree.c | 10
-rw-r--r--  fs/btrfs/extent_io.c | 22
-rw-r--r--  fs/btrfs/extent_map.c | 2
-rw-r--r--  fs/btrfs/extent_map.h | 8
-rw-r--r--  fs/btrfs/file.c | 97
-rw-r--r--  fs/btrfs/inode.c | 32
-rw-r--r--  fs/btrfs/ioctl.c | 256
-rw-r--r--  fs/btrfs/lzo.c | 11
-rw-r--r--  fs/btrfs/qgroup.c | 9
-rw-r--r--  fs/btrfs/relocation.c | 13
-rw-r--r--  fs/btrfs/root-tree.c | 15
-rw-r--r--  fs/btrfs/subpage.c | 2
-rw-r--r--  fs/btrfs/transaction.c | 65
-rw-r--r--  fs/btrfs/transaction.h | 1
-rw-r--r--  fs/btrfs/tree-checker.c | 18
-rw-r--r--  fs/btrfs/tree-log.c | 61
-rw-r--r--  fs/buffer.c | 14
-rw-r--r--  fs/cachefiles/interface.c | 2
-rw-r--r--  fs/cachefiles/xattr.c | 23
-rw-r--r--  fs/cifs/cifs_swn.c | 6
-rw-r--r--  fs/cifs/cifsfs.c | 14
-rw-r--r--  fs/cifs/cifsproto.h | 3
-rw-r--r--  fs/cifs/connect.c | 45
-rw-r--r--  fs/cifs/dfs_cache.c | 2
-rw-r--r--  fs/cifs/file.c | 10
-rw-r--r--  fs/cifs/smb1ops.c | 2
-rw-r--r--  fs/cifs/smb2ops.c | 18
-rw-r--r--  fs/cifs/transport.c | 2
-rw-r--r--  fs/compat_binfmt_elf.c | 2
-rw-r--r--  fs/configfs/dir.c | 14
-rw-r--r--  fs/coredump.c | 86
-rw-r--r--  fs/crypto/bio.c | 13
-rw-r--r--  fs/crypto/crypto.c | 8
-rw-r--r--  fs/crypto/inline_crypt.c | 93
-rw-r--r--  fs/dax.c | 1
-rw-r--r--  fs/direct-io.c | 5
-rw-r--r--  fs/erofs/data.c | 12
-rw-r--r--  fs/erofs/dir.c | 21
-rw-r--r--  fs/erofs/erofs_fs.h | 5
-rw-r--r--  fs/erofs/inode.c | 4
-rw-r--r--  fs/erofs/internal.h | 4
-rw-r--r--  fs/erofs/namei.c | 54
-rw-r--r--  fs/erofs/super.c | 21
-rw-r--r--  fs/erofs/sysfs.c | 8
-rw-r--r--  fs/erofs/zdata.c | 189
-rw-r--r--  fs/erofs/zmap.c | 71
-rw-r--r--  fs/exec.c | 32
-rw-r--r--  fs/ext4/file.c | 10
-rw-r--r--  fs/ext4/inode.c | 7
-rw-r--r--  fs/ext4/page-io.c | 13
-rw-r--r--  fs/ext4/readpage.c | 8
-rw-r--r--  fs/f2fs/Kconfig | 7
-rw-r--r--  fs/f2fs/acl.c | 21
-rw-r--r--  fs/f2fs/checkpoint.c | 58
-rw-r--r--  fs/f2fs/compress.c | 11
-rw-r--r--  fs/f2fs/data.c | 175
-rw-r--r--  fs/f2fs/debug.c | 25
-rw-r--r--  fs/f2fs/dir.c | 12
-rw-r--r--  fs/f2fs/f2fs.h | 162
-rw-r--r--  fs/f2fs/file.c | 175
-rw-r--r--  fs/f2fs/gc.c | 53
-rw-r--r--  fs/f2fs/inline.c | 4
-rw-r--r--  fs/f2fs/inode.c | 7
-rw-r--r--  fs/f2fs/namei.c | 78
-rw-r--r--  fs/f2fs/node.c | 92
-rw-r--r--  fs/f2fs/node.h | 3
-rw-r--r--  fs/f2fs/recovery.c | 35
-rw-r--r--  fs/f2fs/segment.c | 73
-rw-r--r--  fs/f2fs/segment.h | 5
-rw-r--r--  fs/f2fs/super.c | 91
-rw-r--r--  fs/f2fs/sysfs.c | 40
-rw-r--r--  fs/f2fs/verity.c | 4
-rw-r--r--  fs/f2fs/xattr.c | 12
-rw-r--r--  fs/fs-writeback.c | 6
-rw-r--r--  fs/fuse/dev.c | 12
-rw-r--r--  fs/fuse/file.c | 1
-rw-r--r--  fs/fuse/fuse_i.h | 1
-rw-r--r--  fs/fuse/inode.c | 3
-rw-r--r--  fs/fuse/ioctl.c | 9
-rw-r--r--  fs/gfs2/lops.c | 8
-rw-r--r--  fs/gfs2/meta_io.c | 4
-rw-r--r--  fs/gfs2/ops_fstype.c | 4
-rw-r--r--  fs/gfs2/sys.c | 2
-rw-r--r--  fs/hfs/mdb.c | 2
-rw-r--r--  fs/hfsplus/wrapper.c | 5
-rw-r--r--  fs/internal.h | 4
-rw-r--r--  fs/io-wq.c | 114
-rw-r--r--  fs/io_uring.c | 1295
-rw-r--r--  fs/iomap/buffered-io.c | 26
-rw-r--r--  fs/iomap/direct-io.c | 14
-rw-r--r--  fs/jfs/jfs_logmgr.c | 11
-rw-r--r--  fs/jfs/jfs_metapage.c | 9
-rw-r--r--  fs/ksmbd/vfs.c | 1
-rw-r--r--  fs/lockd/svc.c | 24
-rw-r--r--  fs/mpage.c | 80
-rw-r--r--  fs/nfs/blocklayout/blocklayout.c | 26
-rw-r--r--  fs/nfs/blocklayout/rpc_pipefs.c | 1
-rw-r--r--  fs/nfs/callback.c | 66
-rw-r--r--  fs/nfs/nfs4state.c | 1
-rw-r--r--  fs/nfsd/Kconfig | 12
-rw-r--r--  fs/nfsd/Makefile | 3
-rw-r--r--  fs/nfsd/blocklayout.c | 1
-rw-r--r--  fs/nfsd/filecache.c | 6
-rw-r--r--  fs/nfsd/flexfilelayout.c | 2
-rw-r--r--  fs/nfsd/nfs4layouts.c | 2
-rw-r--r--  fs/nfsd/nfs4state.c | 20
-rw-r--r--  fs/nfsd/nfs4xdr.c | 10
-rw-r--r--  fs/nfsd/nfscache.c | 33
-rw-r--r--  fs/nfsd/nfsctl.c | 10
-rw-r--r--  fs/nfsd/nfsd.h | 2
-rw-r--r--  fs/nfsd/nfsfh.c | 4
-rw-r--r--  fs/nfsd/nfsfh.h | 20
-rw-r--r--  fs/nfsd/nfsproc.c | 2
-rw-r--r--  fs/nfsd/nfssvc.c | 25
-rw-r--r--  fs/nfsd/trace.h | 107
-rw-r--r--  fs/nfsd/vfs.c | 9
-rw-r--r--  fs/nfsd/vfs.h | 2
-rw-r--r--  fs/nfsd/xdr.h | 2
-rw-r--r--  fs/nilfs2/segbuf.c | 47
-rw-r--r--  fs/ntfs3/fsntfs.c | 36
-rw-r--r--  fs/ocfs2/cluster/heartbeat.c | 4
-rw-r--r--  fs/ocfs2/super.c | 22
-rw-r--r--  fs/pipe.c | 11
-rw-r--r--  fs/proc/task_mmu.c | 9
-rw-r--r--  fs/pstore/platform.c | 38
-rw-r--r--  fs/pstore/ram_core.c | 4
-rw-r--r--  fs/squashfs/block.c | 11
-rw-r--r--  fs/stat.c | 49
-rw-r--r--  fs/tracefs/inode.c | 5
-rw-r--r--  fs/userfaultfd.c | 6
-rw-r--r--  fs/xfs/xfs_bio_io.c | 14
-rw-r--r--  fs/xfs/xfs_buf.c | 4
-rw-r--r--  fs/xfs/xfs_log.c | 14
-rw-r--r--  fs/xfs/xfs_super.c | 7
-rw-r--r--  fs/zonefs/super.c | 9
145 files changed, 3394 insertions, 1810 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index 6c7dc1387beb..90fdb62545e0 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -344,7 +344,7 @@ config LOCKD
config LOCKD_V4
bool
- depends on NFSD_V3 || NFS_V3
+ depends on NFSD || NFS_V3
depends on FILE_LOCKING
default y
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 4d5ae61580aa..21c6332fa785 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -28,6 +28,16 @@ config BINFMT_ELF
ld.so (check the file <file:Documentation/Changes> for location and
latest version).
+config BINFMT_ELF_KUNIT_TEST
+ bool "Build KUnit tests for ELF binary support" if !KUNIT_ALL_TESTS
+ depends on KUNIT=y && BINFMT_ELF=y
+ default KUNIT_ALL_TESTS
+ help
+ This builds the ELF loader KUnit tests, which try to gather
+ prior bug fixes into a regression test collection. This is really
+ only needed for debugging. Note that with CONFIG_COMPAT=y, the
+ compat_binfmt_elf KUnit test is also created.
+
config COMPAT_BINFMT_ELF
def_bool y
depends on COMPAT && BINFMT_ELF
@@ -36,6 +46,9 @@ config COMPAT_BINFMT_ELF
config ARCH_BINFMT_ELF_STATE
bool
+config ARCH_BINFMT_ELF_EXTRA_PHDRS
+ bool
+
config ARCH_HAVE_ELF_PROT
bool
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 5e9157d0da29..f447c902318d 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -703,7 +703,7 @@ static int afs_writepages_region(struct address_space *mapping,
struct folio *folio;
struct page *head_page;
ssize_t ret;
- int n;
+ int n, skips = 0;
_enter("%llx,%llx,", start, end);
@@ -754,8 +754,15 @@ static int afs_writepages_region(struct address_space *mapping,
#ifdef CONFIG_AFS_FSCACHE
folio_wait_fscache(folio);
#endif
+ } else {
+ start += folio_size(folio);
}
folio_put(folio);
+ if (wbc->sync_mode == WB_SYNC_NONE) {
+ if (skips >= 5 || need_resched())
+ break;
+ skips++;
+ }
continue;
}
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 9e11e6f13e83..6556e13ed95f 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -93,7 +93,7 @@ static int elf_core_dump(struct coredump_params *cprm);
#define ELF_CORE_EFLAGS 0
#endif
-#define ELF_PAGESTART(_v) ((_v) & ~(unsigned long)(ELF_MIN_ALIGN-1))
+#define ELF_PAGESTART(_v) ((_v) & ~(int)(ELF_MIN_ALIGN-1))
#define ELF_PAGEOFFSET(_v) ((_v) & (ELF_MIN_ALIGN-1))
#define ELF_PAGEALIGN(_v) (((_v) + ELF_MIN_ALIGN - 1) & ~(ELF_MIN_ALIGN - 1))
@@ -101,8 +101,10 @@ static struct linux_binfmt elf_format = {
.module = THIS_MODULE,
.load_binary = load_elf_binary,
.load_shlib = load_elf_library,
+#ifdef CONFIG_COREDUMP
.core_dump = elf_core_dump,
.min_coredump = ELF_EXEC_PAGESIZE,
+#endif
};
#define BAD_ADDR(x) (unlikely((unsigned long)(x) >= TASK_SIZE))
@@ -170,8 +172,8 @@ static int padzero(unsigned long elf_bss)
static int
create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
- unsigned long load_addr, unsigned long interp_load_addr,
- unsigned long e_entry)
+ unsigned long interp_load_addr,
+ unsigned long e_entry, unsigned long phdr_addr)
{
struct mm_struct *mm = current->mm;
unsigned long p = bprm->p;
@@ -257,7 +259,7 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
NEW_AUX_ENT(AT_HWCAP, ELF_HWCAP);
NEW_AUX_ENT(AT_PAGESZ, ELF_EXEC_PAGESIZE);
NEW_AUX_ENT(AT_CLKTCK, CLOCKS_PER_SEC);
- NEW_AUX_ENT(AT_PHDR, load_addr + exec->e_phoff);
+ NEW_AUX_ENT(AT_PHDR, phdr_addr);
NEW_AUX_ENT(AT_PHENT, sizeof(struct elf_phdr));
NEW_AUX_ENT(AT_PHNUM, exec->e_phnum);
NEW_AUX_ENT(AT_BASE, interp_load_addr);
@@ -399,22 +401,21 @@ static unsigned long elf_map(struct file *filep, unsigned long addr,
return(map_addr);
}
-static unsigned long total_mapping_size(const struct elf_phdr *cmds, int nr)
+static unsigned long total_mapping_size(const struct elf_phdr *phdr, int nr)
{
- int i, first_idx = -1, last_idx = -1;
+ elf_addr_t min_addr = -1;
+ elf_addr_t max_addr = 0;
+ bool pt_load = false;
+ int i;
for (i = 0; i < nr; i++) {
- if (cmds[i].p_type == PT_LOAD) {
- last_idx = i;
- if (first_idx == -1)
- first_idx = i;
+ if (phdr[i].p_type == PT_LOAD) {
+ min_addr = min(min_addr, ELF_PAGESTART(phdr[i].p_vaddr));
+ max_addr = max(max_addr, phdr[i].p_vaddr + phdr[i].p_memsz);
+ pt_load = true;
}
}
- if (first_idx == -1)
- return 0;
-
- return cmds[last_idx].p_vaddr + cmds[last_idx].p_memsz -
- ELF_PAGESTART(cmds[first_idx].p_vaddr);
+ return pt_load ? (max_addr - min_addr) : 0;
}
static int elf_read(struct file *file, void *buf, size_t len, loff_t pos)
@@ -823,8 +824,8 @@ static int parse_elf_properties(struct file *f, const struct elf_phdr *phdr,
static int load_elf_binary(struct linux_binprm *bprm)
{
struct file *interpreter = NULL; /* to shut gcc up */
- unsigned long load_addr = 0, load_bias = 0;
- int load_addr_set = 0;
+ unsigned long load_bias = 0, phdr_addr = 0;
+ int first_pt_load = 1;
unsigned long error;
struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL;
struct elf_phdr *elf_property_phdata = NULL;
@@ -1074,12 +1075,12 @@ out_free_interp:
vaddr = elf_ppnt->p_vaddr;
/*
- * The first time through the loop, load_addr_set is false:
+ * The first time through the loop, first_pt_load is true:
* layout will be calculated. Once set, use MAP_FIXED since
* we know we've already safely mapped the entire region with
* MAP_FIXED_NOREPLACE in the once-per-binary logic following.
*/
- if (load_addr_set) {
+ if (!first_pt_load) {
elf_flags |= MAP_FIXED;
} else if (elf_ex->e_type == ET_EXEC) {
/*
@@ -1135,14 +1136,25 @@ out_free_interp:
* is then page aligned.
*/
load_bias = ELF_PAGESTART(load_bias - vaddr);
- }
- /*
- * Calculate the entire size of the ELF mapping (total_size).
- * (Note that load_addr_set is set to true later once the
- * initial mapping is performed.)
- */
- if (!load_addr_set) {
+ /*
+ * Calculate the entire size of the ELF mapping
+ * (total_size), used for the initial mapping,
+ * due to first_pt_load which is set to false later
+ * once the initial mapping is performed.
+ *
+ * Note that this is only sensible when the LOAD
+ * segments are contiguous (or overlapping). If
+ * used for LOADs that are far apart, this would
+ * cause the holes between LOADs to be mapped,
+ * running the risk of having the mapping fail,
+ * as it would be larger than the ELF file itself.
+ *
+ * As a result, only ET_DYN does this, since
+ * some ET_EXEC (e.g. ia64) may have large virtual
+ * memory holes between LOADs.
+ *
+ */
total_size = total_mapping_size(elf_phdata,
elf_ex->e_phnum);
if (!total_size) {
@@ -1159,16 +1171,25 @@ out_free_interp:
goto out_free_dentry;
}
- if (!load_addr_set) {
- load_addr_set = 1;
- load_addr = (elf_ppnt->p_vaddr - elf_ppnt->p_offset);
+ if (first_pt_load) {
+ first_pt_load = 0;
if (elf_ex->e_type == ET_DYN) {
load_bias += error -
ELF_PAGESTART(load_bias + vaddr);
- load_addr += load_bias;
reloc_func_desc = load_bias;
}
}
+
+ /*
+ * Figure out which segment in the file contains the Program
+ * Header table, and map to the associated memory address.
+ */
+ if (elf_ppnt->p_offset <= elf_ex->e_phoff &&
+ elf_ex->e_phoff < elf_ppnt->p_offset + elf_ppnt->p_filesz) {
+ phdr_addr = elf_ex->e_phoff - elf_ppnt->p_offset +
+ elf_ppnt->p_vaddr;
+ }
+
k = elf_ppnt->p_vaddr;
if ((elf_ppnt->p_flags & PF_X) && k < start_code)
start_code = k;
@@ -1204,6 +1225,7 @@ out_free_interp:
}
e_entry = elf_ex->e_entry + load_bias;
+ phdr_addr += load_bias;
elf_bss += load_bias;
elf_brk += load_bias;
start_code += load_bias;
@@ -1267,8 +1289,8 @@ out_free_interp:
goto out;
#endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */
- retval = create_elf_tables(bprm, elf_ex,
- load_addr, interp_load_addr, e_entry);
+ retval = create_elf_tables(bprm, elf_ex, interp_load_addr,
+ e_entry, phdr_addr);
if (retval < 0)
goto out;
@@ -1619,17 +1641,16 @@ static void fill_siginfo_note(struct memelfnote *note, user_siginfo_t *csigdata,
* long file_ofs
* followed by COUNT filenames in ASCII: "FILE1" NUL "FILE2" NUL...
*/
-static int fill_files_note(struct memelfnote *note)
+static int fill_files_note(struct memelfnote *note, struct coredump_params *cprm)
{
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
unsigned count, size, names_ofs, remaining, n;
user_long_t *data;
user_long_t *start_end_ofs;
char *name_base, *name_curpos;
+ int i;
/* *Estimated* file count and total data size needed */
- count = mm->map_count;
+ count = cprm->vma_count;
if (count > UINT_MAX / 64)
return -EINVAL;
size = count * 64;
@@ -1651,11 +1672,12 @@ static int fill_files_note(struct memelfnote *note)
name_base = name_curpos = ((char *)data) + names_ofs;
remaining = size - names_ofs;
count = 0;
- for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
+ for (i = 0; i < cprm->vma_count; i++) {
+ struct core_vma_metadata *m = &cprm->vma_meta[i];
struct file *file;
const char *filename;
- file = vma->vm_file;
+ file = m->file;
if (!file)
continue;
filename = file_path(file, name_curpos, remaining);
@@ -1675,9 +1697,9 @@ static int fill_files_note(struct memelfnote *note)
memmove(name_curpos, filename, n);
name_curpos += n;
- *start_end_ofs++ = vma->vm_start;
- *start_end_ofs++ = vma->vm_end;
- *start_end_ofs++ = vma->vm_pgoff;
+ *start_end_ofs++ = m->start;
+ *start_end_ofs++ = m->end;
+ *start_end_ofs++ = m->pgoff;
count++;
}
@@ -1688,7 +1710,7 @@ static int fill_files_note(struct memelfnote *note)
* Count usually is less than mm->map_count,
* we need to move filenames down.
*/
- n = mm->map_count - count;
+ n = cprm->vma_count - count;
if (n != 0) {
unsigned shift_bytes = n * 3 * sizeof(data[0]);
memmove(name_base - shift_bytes, name_base,
@@ -1744,9 +1766,9 @@ static void do_thread_regset_writeback(struct task_struct *task,
static int fill_thread_core_info(struct elf_thread_core_info *t,
const struct user_regset_view *view,
- long signr, size_t *total)
+ long signr, struct elf_note_info *info)
{
- unsigned int i;
+ unsigned int note_iter, view_iter;
/*
* NT_PRSTATUS is the one special case, because the regset data
@@ -1760,17 +1782,17 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
fill_note(&t->notes[0], "CORE", NT_PRSTATUS,
PRSTATUS_SIZE, &t->prstatus);
- *total += notesize(&t->notes[0]);
+ info->size += notesize(&t->notes[0]);
do_thread_regset_writeback(t->task, &view->regsets[0]);
/*
* Each other regset might generate a note too. For each regset
- * that has no core_note_type or is inactive, we leave t->notes[i]
- * all zero and we'll know to skip writing it later.
+ * that has no core_note_type or is inactive, skip it.
*/
- for (i = 1; i < view->n; ++i) {
- const struct user_regset *regset = &view->regsets[i];
+ note_iter = 1;
+ for (view_iter = 1; view_iter < view->n; ++view_iter) {
+ const struct user_regset *regset = &view->regsets[view_iter];
int note_type = regset->core_note_type;
bool is_fpreg = note_type == NT_PRFPREG;
void *data;
@@ -1786,13 +1808,17 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
if (ret < 0)
continue;
+ if (WARN_ON_ONCE(note_iter >= info->thread_notes))
+ break;
+
if (is_fpreg)
SET_PR_FPVALID(&t->prstatus);
- fill_note(&t->notes[i], is_fpreg ? "CORE" : "LINUX",
+ fill_note(&t->notes[note_iter], is_fpreg ? "CORE" : "LINUX",
note_type, ret, data);
- *total += notesize(&t->notes[i]);
+ info->size += notesize(&t->notes[note_iter]);
+ note_iter++;
}
return 1;
@@ -1800,7 +1826,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
static int fill_note_info(struct elfhdr *elf, int phdrs,
struct elf_note_info *info,
- const kernel_siginfo_t *siginfo, struct pt_regs *regs)
+ struct coredump_params *cprm)
{
struct task_struct *dump_task = current;
const struct user_regset_view *view = task_user_regset_view(dump_task);
@@ -1872,7 +1898,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
* Now fill in each thread's information.
*/
for (t = info->thread; t != NULL; t = t->next)
- if (!fill_thread_core_info(t, view, siginfo->si_signo, &info->size))
+ if (!fill_thread_core_info(t, view, cprm->siginfo->si_signo, info))
return 0;
/*
@@ -1881,13 +1907,13 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
fill_psinfo(psinfo, dump_task->group_leader, dump_task->mm);
info->size += notesize(&info->psinfo);
- fill_siginfo_note(&info->signote, &info->csigdata, siginfo);
+ fill_siginfo_note(&info->signote, &info->csigdata, cprm->siginfo);
info->size += notesize(&info->signote);
fill_auxv_note(&info->auxv, current->mm);
info->size += notesize(&info->auxv);
- if (fill_files_note(&info->files) == 0)
+ if (fill_files_note(&info->files, cprm) == 0)
info->size += notesize(&info->files);
return 1;
@@ -2029,7 +2055,7 @@ static int elf_note_info_init(struct elf_note_info *info)
static int fill_note_info(struct elfhdr *elf, int phdrs,
struct elf_note_info *info,
- const kernel_siginfo_t *siginfo, struct pt_regs *regs)
+ struct coredump_params *cprm)
{
struct core_thread *ct;
struct elf_thread_status *ets;
@@ -2050,13 +2076,13 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
list_for_each_entry(ets, &info->thread_list, list) {
int sz;
- sz = elf_dump_thread_status(siginfo->si_signo, ets);
+ sz = elf_dump_thread_status(cprm->siginfo->si_signo, ets);
info->thread_status_size += sz;
}
/* now collect the dump for the current */
memset(info->prstatus, 0, sizeof(*info->prstatus));
- fill_prstatus(&info->prstatus->common, current, siginfo->si_signo);
- elf_core_copy_regs(&info->prstatus->pr_reg, regs);
+ fill_prstatus(&info->prstatus->common, current, cprm->siginfo->si_signo);
+ elf_core_copy_regs(&info->prstatus->pr_reg, cprm->regs);
/* Set up header */
fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS);
@@ -2072,18 +2098,18 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
fill_note(info->notes + 1, "CORE", NT_PRPSINFO,
sizeof(*info->psinfo), info->psinfo);
- fill_siginfo_note(info->notes + 2, &info->csigdata, siginfo);
+ fill_siginfo_note(info->notes + 2, &info->csigdata, cprm->siginfo);
fill_auxv_note(info->notes + 3, current->mm);
info->numnote = 4;
- if (fill_files_note(info->notes + info->numnote) == 0) {
+ if (fill_files_note(info->notes + info->numnote, cprm) == 0) {
info->notes_files = info->notes + info->numnote;
info->numnote++;
}
/* Try to dump the FPU. */
- info->prstatus->pr_fpvalid = elf_core_copy_task_fpregs(current, regs,
- info->fpu);
+ info->prstatus->pr_fpvalid =
+ elf_core_copy_task_fpregs(current, cprm->regs, info->fpu);
if (info->prstatus->pr_fpvalid)
fill_note(info->notes + info->numnote++,
"CORE", NT_PRFPREG, sizeof(*info->fpu), info->fpu);
@@ -2169,8 +2195,7 @@ static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum,
static int elf_core_dump(struct coredump_params *cprm)
{
int has_dumped = 0;
- int vma_count, segs, i;
- size_t vma_data_size;
+ int segs, i;
struct elfhdr elf;
loff_t offset = 0, dataoff;
struct elf_note_info info = { };
@@ -2178,16 +2203,12 @@ static int elf_core_dump(struct coredump_params *cprm)
struct elf_shdr *shdr4extnum = NULL;
Elf_Half e_phnum;
elf_addr_t e_shoff;
- struct core_vma_metadata *vma_meta;
-
- if (dump_vma_snapshot(cprm, &vma_count, &vma_meta, &vma_data_size))
- return 0;
/*
* The number of segs is recorded in the ELF header as a 16-bit value.
* Please check DEFAULT_MAX_MAP_COUNT definition when you modify here.
*/
- segs = vma_count + elf_core_extra_phdrs();
+ segs = cprm->vma_count + elf_core_extra_phdrs();
/* for notes section */
segs++;
@@ -2201,7 +2222,7 @@ static int elf_core_dump(struct coredump_params *cprm)
* Collect all the non-memory information about the process for the
* notes. This also sets up the file header.
*/
- if (!fill_note_info(&elf, e_phnum, &info, cprm->siginfo, cprm->regs))
+ if (!fill_note_info(&elf, e_phnum, &info, cprm))
goto end_coredump;
has_dumped = 1;
@@ -2226,7 +2247,7 @@ static int elf_core_dump(struct coredump_params *cprm)
dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE);
- offset += vma_data_size;
+ offset += cprm->vma_data_size;
offset += elf_core_extra_data_size();
e_shoff = offset;
@@ -2246,8 +2267,8 @@ static int elf_core_dump(struct coredump_params *cprm)
goto end_coredump;
/* Write program headers for segments dump */
- for (i = 0; i < vma_count; i++) {
- struct core_vma_metadata *meta = vma_meta + i;
+ for (i = 0; i < cprm->vma_count; i++) {
+ struct core_vma_metadata *meta = cprm->vma_meta + i;
struct elf_phdr phdr;
phdr.p_type = PT_LOAD;
@@ -2284,8 +2305,8 @@ static int elf_core_dump(struct coredump_params *cprm)
/* Align to page */
dump_skip_to(cprm, dataoff);
- for (i = 0; i < vma_count; i++) {
- struct core_vma_metadata *meta = vma_meta + i;
+ for (i = 0; i < cprm->vma_count; i++) {
+ struct core_vma_metadata *meta = cprm->vma_meta + i;
if (!dump_user_range(cprm, meta->start, meta->dump_size))
goto end_coredump;
@@ -2302,7 +2323,6 @@ static int elf_core_dump(struct coredump_params *cprm)
end_coredump:
free_note_info(&info);
kfree(shdr4extnum);
- kvfree(vma_meta);
kfree(phdr4note);
return has_dumped;
}
@@ -2324,3 +2344,7 @@ static void __exit exit_elf_binfmt(void)
core_initcall(init_elf_binfmt);
module_exit(exit_elf_binfmt);
MODULE_LICENSE("GPL");
+
+#ifdef CONFIG_BINFMT_ELF_KUNIT_TEST
+#include "binfmt_elf_test.c"
+#endif
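
The AT_PHDR change above stops assuming the ELF header itself is mapped and instead derives the program-header address from whichever PT_LOAD segment covers e_phoff in the file. A minimal standalone sketch of that calculation (the function name is hypothetical, not from the patch):

	/*
	 * Sketch only: translate the file offset of the program headers
	 * (e_phoff) into a virtual address via the PT_LOAD that maps it.
	 */
	static unsigned long sketch_phdr_addr(const struct elf_phdr *phdr, int nr,
					      unsigned long e_phoff,
					      unsigned long load_bias)
	{
		int i;

		for (i = 0; i < nr; i++) {
			if (phdr[i].p_type != PT_LOAD)
				continue;
			if (phdr[i].p_offset <= e_phoff &&
			    e_phoff < phdr[i].p_offset + phdr[i].p_filesz)
				return e_phoff - phdr[i].p_offset +
				       phdr[i].p_vaddr + load_bias;
		}
		return 0;	/* no PT_LOAD maps the program headers */
	}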
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index c6f588dc4a9d..08d0c8797828 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -83,8 +83,8 @@ static struct linux_binfmt elf_fdpic_format = {
.load_binary = load_elf_fdpic_binary,
#ifdef CONFIG_ELF_CORE
.core_dump = elf_fdpic_core_dump,
-#endif
.min_coredump = ELF_EXEC_PAGESIZE,
+#endif
};
static int __init init_elf_fdpic_binfmt(void)
@@ -1465,7 +1465,7 @@ static bool elf_fdpic_dump_segments(struct coredump_params *cprm,
static int elf_fdpic_core_dump(struct coredump_params *cprm)
{
int has_dumped = 0;
- int vma_count, segs;
+ int segs;
int i;
struct elfhdr *elf = NULL;
loff_t offset = 0, dataoff;
@@ -1480,8 +1480,6 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
elf_addr_t e_shoff;
struct core_thread *ct;
struct elf_thread_status *tmp;
- struct core_vma_metadata *vma_meta = NULL;
- size_t vma_data_size;
/* alloc memory for large data structures: too large to be on stack */
elf = kmalloc(sizeof(*elf), GFP_KERNEL);
@@ -1491,9 +1489,6 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
if (!psinfo)
goto end_coredump;
- if (dump_vma_snapshot(cprm, &vma_count, &vma_meta, &vma_data_size))
- goto end_coredump;
-
for (ct = current->signal->core_state->dumper.next;
ct; ct = ct->next) {
tmp = elf_dump_thread_status(cprm->siginfo->si_signo,
@@ -1513,7 +1508,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
tmp->next = thread_list;
thread_list = tmp;
- segs = vma_count + elf_core_extra_phdrs();
+ segs = cprm->vma_count + elf_core_extra_phdrs();
/* for notes section */
segs++;
@@ -1558,7 +1553,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
/* Page-align dumped data */
dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE);
- offset += vma_data_size;
+ offset += cprm->vma_data_size;
offset += elf_core_extra_data_size();
e_shoff = offset;
@@ -1578,8 +1573,8 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
goto end_coredump;
/* write program headers for segments dump */
- for (i = 0; i < vma_count; i++) {
- struct core_vma_metadata *meta = vma_meta + i;
+ for (i = 0; i < cprm->vma_count; i++) {
+ struct core_vma_metadata *meta = cprm->vma_meta + i;
struct elf_phdr phdr;
size_t sz;
@@ -1628,7 +1623,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
dump_skip_to(cprm, dataoff);
- if (!elf_fdpic_dump_segments(cprm, vma_meta, vma_count))
+ if (!elf_fdpic_dump_segments(cprm, cprm->vma_meta, cprm->vma_count))
goto end_coredump;
if (!elf_core_write_extra_data(cprm))
@@ -1652,7 +1647,6 @@ end_coredump:
thread_list = thread_list->next;
kfree(tmp);
}
- kvfree(vma_meta);
kfree(phdr4note);
kfree(elf);
kfree(psinfo);
diff --git a/fs/binfmt_elf_test.c b/fs/binfmt_elf_test.c
new file mode 100644
index 000000000000..11d734fec366
--- /dev/null
+++ b/fs/binfmt_elf_test.c
@@ -0,0 +1,64 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <kunit/test.h>
+
+static void total_mapping_size_test(struct kunit *test)
+{
+ struct elf_phdr empty[] = {
+ { .p_type = PT_LOAD, .p_vaddr = 0, .p_memsz = 0, },
+ { .p_type = PT_INTERP, .p_vaddr = 10, .p_memsz = 999999, },
+ };
+ /*
+ * readelf -lW /bin/mount | grep '^ .*0x0' | awk '{print "\t\t{ .p_type = PT_" \
+ * $1 ", .p_vaddr = " $3 ", .p_memsz = " $6 ", },"}'
+ */
+ struct elf_phdr mount[] = {
+ { .p_type = PT_PHDR, .p_vaddr = 0x00000040, .p_memsz = 0x0002d8, },
+ { .p_type = PT_INTERP, .p_vaddr = 0x00000318, .p_memsz = 0x00001c, },
+ { .p_type = PT_LOAD, .p_vaddr = 0x00000000, .p_memsz = 0x0033a8, },
+ { .p_type = PT_LOAD, .p_vaddr = 0x00004000, .p_memsz = 0x005c91, },
+ { .p_type = PT_LOAD, .p_vaddr = 0x0000a000, .p_memsz = 0x0022f8, },
+ { .p_type = PT_LOAD, .p_vaddr = 0x0000d330, .p_memsz = 0x000d40, },
+ { .p_type = PT_DYNAMIC, .p_vaddr = 0x0000d928, .p_memsz = 0x000200, },
+ { .p_type = PT_NOTE, .p_vaddr = 0x00000338, .p_memsz = 0x000030, },
+ { .p_type = PT_NOTE, .p_vaddr = 0x00000368, .p_memsz = 0x000044, },
+ { .p_type = PT_GNU_PROPERTY, .p_vaddr = 0x00000338, .p_memsz = 0x000030, },
+ { .p_type = PT_GNU_EH_FRAME, .p_vaddr = 0x0000b490, .p_memsz = 0x0001ec, },
+ { .p_type = PT_GNU_STACK, .p_vaddr = 0x00000000, .p_memsz = 0x000000, },
+ { .p_type = PT_GNU_RELRO, .p_vaddr = 0x0000d330, .p_memsz = 0x000cd0, },
+ };
+ size_t mount_size = 0xE070;
+ /* https://lore.kernel.org/linux-fsdevel/YfF18Dy85mCntXrx@fractal.localdomain */
+ struct elf_phdr unordered[] = {
+ { .p_type = PT_LOAD, .p_vaddr = 0x00000000, .p_memsz = 0x0033a8, },
+ { .p_type = PT_LOAD, .p_vaddr = 0x0000d330, .p_memsz = 0x000d40, },
+ { .p_type = PT_LOAD, .p_vaddr = 0x00004000, .p_memsz = 0x005c91, },
+ { .p_type = PT_LOAD, .p_vaddr = 0x0000a000, .p_memsz = 0x0022f8, },
+ };
+
+ /* No headers, no size. */
+ KUNIT_EXPECT_EQ(test, total_mapping_size(NULL, 0), 0);
+ KUNIT_EXPECT_EQ(test, total_mapping_size(empty, 0), 0);
+ /* Empty headers, no size. */
+ KUNIT_EXPECT_EQ(test, total_mapping_size(empty, 1), 0);
+ /* No PT_LOAD headers, no size. */
+ KUNIT_EXPECT_EQ(test, total_mapping_size(&empty[1], 1), 0);
+ /* Empty PT_LOAD and non-PT_LOAD headers, no size. */
+ KUNIT_EXPECT_EQ(test, total_mapping_size(empty, 2), 0);
+
+ /* Normal set of PT_LOADS, and expected size. */
+ KUNIT_EXPECT_EQ(test, total_mapping_size(mount, ARRAY_SIZE(mount)), mount_size);
+ /* Unordered PT_LOADs result in same size. */
+ KUNIT_EXPECT_EQ(test, total_mapping_size(unordered, ARRAY_SIZE(unordered)), mount_size);
+}
+
+static struct kunit_case binfmt_elf_test_cases[] = {
+ KUNIT_CASE(total_mapping_size_test),
+ {},
+};
+
+static struct kunit_suite binfmt_elf_test_suite = {
+ .name = KBUILD_MODNAME,
+ .test_cases = binfmt_elf_test_cases,
+};
+
+kunit_test_suite(binfmt_elf_test_suite);
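
A quick sanity check of the expected size used above, derived from the mount[] table itself: the lowest PT_LOAD page start is ELF_PAGESTART(0x00000000) = 0x0, the highest PT_LOAD end is 0x0000d330 + 0x000d40 = 0x0000e070, so

	total_mapping_size = 0xE070 - 0x0 = 0xE070

Because only the minimum start and maximum end matter, the unordered[] variant with the same four PT_LOADs must yield the same size, which is exactly what the last two expectations assert.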
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 5d776f80ee50..626898150011 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -37,6 +37,7 @@
#include <linux/flat.h>
#include <linux/uaccess.h>
#include <linux/vmalloc.h>
+#include <linux/coredump.h>
#include <asm/byteorder.h>
#include <asm/unaligned.h>
@@ -97,13 +98,17 @@ static int load_flat_shared_library(int id, struct lib_info *p);
#endif
static int load_flat_binary(struct linux_binprm *);
+#ifdef CONFIG_COREDUMP
static int flat_core_dump(struct coredump_params *cprm);
+#endif
static struct linux_binfmt flat_format = {
.module = THIS_MODULE,
.load_binary = load_flat_binary,
+#ifdef CONFIG_COREDUMP
.core_dump = flat_core_dump,
.min_coredump = PAGE_SIZE
+#endif
};
/****************************************************************************/
@@ -112,12 +117,14 @@ static struct linux_binfmt flat_format = {
* Currently only a stub-function.
*/
+#ifdef CONFIG_COREDUMP
static int flat_core_dump(struct coredump_params *cprm)
{
pr_warn("Process %s:%d received signr %d and should have core dumped\n",
current->comm, current->pid, cprm->siginfo->si_signo);
return 1;
}
+#endif
/****************************************************************************/
/*
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 7e9f90fa0388..abac86a75840 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -78,7 +78,6 @@
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/mutex.h>
-#include <linux/genhd.h>
#include <linux/blkdev.h>
#include <linux/mm.h>
#include <linux/string.h>
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8992e0096163..ebb2d109e8bb 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -602,6 +602,9 @@ enum {
/* Indicate that we want the transaction kthread to commit right now. */
BTRFS_FS_COMMIT_TRANS,
+ /* Indicate we have half completed snapshot deletions pending. */
+ BTRFS_FS_UNFINISHED_DROPS,
+
#if BITS_PER_LONG == 32
/* Indicate if we have error/warn message printed on 32bit systems */
BTRFS_FS_32BIT_ERROR,
@@ -1106,8 +1109,15 @@ enum {
BTRFS_ROOT_QGROUP_FLUSHING,
/* We started the orphan cleanup for this root. */
BTRFS_ROOT_ORPHAN_CLEANUP,
+ /* This root has a drop operation that was started previously. */
+ BTRFS_ROOT_UNFINISHED_DROP,
};
+static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info)
+{
+ clear_and_wake_up_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags);
+}
+
/*
* Record swapped tree blocks of a subvolume tree for delayed subtree trace
* code. For detail check comment in fs/btrfs/qgroup.c.
@@ -3291,7 +3301,7 @@ void btrfs_exclop_balance(struct btrfs_fs_info *fs_info,
int __init btrfs_auto_defrag_init(void);
void __cold btrfs_auto_defrag_exit(void);
int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
- struct btrfs_inode *inode);
+ struct btrfs_inode *inode, u32 extent_thresh);
int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info);
void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info);
int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 87a5addbedf6..b3e9cf3fd1dd 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3813,6 +3813,10 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
set_bit(BTRFS_FS_OPEN, &fs_info->flags);
+ /* Kick the cleaner thread so it'll start deleting snapshots. */
+ if (test_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags))
+ wake_up_process(fs_info->cleaner_kthread);
+
clear_oneshot:
btrfs_clear_oneshot_options(fs_info);
return 0;
@@ -4029,8 +4033,9 @@ static int write_dev_supers(struct btrfs_device *device,
* to do I/O, so we don't lose the ability to do integrity
* checking.
*/
- bio = bio_alloc(GFP_NOFS, 1);
- bio_set_dev(bio, device->bdev);
+ bio = bio_alloc(device->bdev, 1,
+ REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO,
+ GFP_NOFS);
bio->bi_iter.bi_sector = bytenr >> SECTOR_SHIFT;
bio->bi_private = device;
bio->bi_end_io = btrfs_end_super_write;
@@ -4042,7 +4047,6 @@ static int write_dev_supers(struct btrfs_device *device,
* go down lazy and there's a short window where the on-disk
* copies might still contain the older version.
*/
- bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO;
if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER))
bio->bi_opf |= REQ_FUA;
@@ -4154,10 +4158,8 @@ static void write_dev_flush(struct btrfs_device *device)
return;
#endif
- bio_reset(bio);
+ bio_reset(bio, device->bdev, REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH);
bio->bi_end_io = btrfs_end_empty_barrier;
- bio_set_dev(bio, device->bdev);
- bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH;
init_completion(&device->flush_wait);
bio->bi_private = &device->flush_wait;
@@ -4538,6 +4540,12 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
*/
kthread_park(fs_info->cleaner_kthread);
+ /*
+ * If we had UNFINISHED_DROPS we could still be processing them, so
+ * clear that bit and wake up relocation so it can stop.
+ */
+ btrfs_wake_unfinished_drop(fs_info);
+
/* wait for the qgroup rescan worker to stop */
btrfs_qgroup_wait_for_completion(fs_info, false);
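
The bio changes in this file follow a block-layer API conversion that recurs throughout this diff (extent_io.c, fs/crypto, ext4, gfs2, xfs, and others): bio_alloc() and bio_reset() now take the target block device and the operation flags up front instead of having callers set them afterwards. Schematically, with bdev standing in for the target device:

	/* before: three steps */
	bio = bio_alloc(GFP_NOFS, 1);
	bio_set_dev(bio, bdev);
	bio->bi_opf = REQ_OP_WRITE | REQ_SYNC;

	/* after: one call, device and opf passed at allocation time */
	bio = bio_alloc(bdev, 1, REQ_OP_WRITE | REQ_SYNC, GFP_NOFS);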
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index d89273c4b6b8..96427b1ecac3 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5622,6 +5622,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
int ret;
int level;
bool root_dropped = false;
+ bool unfinished_drop = false;
btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid);
@@ -5664,6 +5665,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
* already dropped.
*/
set_bit(BTRFS_ROOT_DELETING, &root->state);
+ unfinished_drop = test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state);
+
if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
level = btrfs_header_level(root->node);
path->nodes[level] = btrfs_lock_root_node(root);
@@ -5839,6 +5842,13 @@ out_free:
btrfs_free_path(path);
out:
/*
+ * If we were an unfinished drop root, check to see if there are any
+ * pending drops, and if not, clear the bit and wake up any waiters.
+ */
+ if (!err && unfinished_drop)
+ btrfs_maybe_wake_unfinished_drop(fs_info);
+
+ /*
* So if we need to stop dropping the snapshot for whatever reason we
* need to make sure to add it back to the dead root list so that we
* keep trying to do the work later. This also cleans up roots if we
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 409bad3928db..5923eec8caa8 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3143,7 +3143,7 @@ struct bio *btrfs_bio_alloc(unsigned int nr_iovecs)
struct bio *bio;
ASSERT(0 < nr_iovecs && nr_iovecs <= BIO_MAX_VECS);
- bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, &btrfs_bioset);
+ bio = bio_alloc_bioset(NULL, nr_iovecs, 0, GFP_NOFS, &btrfs_bioset);
btrfs_bio_init(btrfs_bio(bio));
return bio;
}
@@ -3154,7 +3154,7 @@ struct bio *btrfs_bio_clone(struct bio *bio)
struct bio *new;
/* Bio allocation backed by a bioset does not fail */
- new = bio_clone_fast(bio, GFP_NOFS, &btrfs_bioset);
+ new = bio_alloc_clone(bio->bi_bdev, bio, GFP_NOFS, &btrfs_bioset);
bbio = btrfs_bio(new);
btrfs_bio_init(bbio);
bbio->iter = bio->bi_iter;
@@ -3169,7 +3169,7 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size)
ASSERT(offset <= UINT_MAX && size <= UINT_MAX);
/* this will never fail when it's backed by a bioset */
- bio = bio_clone_fast(orig, GFP_NOFS, &btrfs_bioset);
+ bio = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOFS, &btrfs_bioset);
ASSERT(bio);
bbio = btrfs_bio(bio);
@@ -6841,14 +6841,24 @@ static void assert_eb_page_uptodate(const struct extent_buffer *eb,
{
struct btrfs_fs_info *fs_info = eb->fs_info;
+ /*
+ * If we are using the commit root we could potentially clear a page
+ * Uptodate while we're using the extent buffer that we've previously
+ * looked up. We don't want to complain in this case, as the page was
+ * valid before, we just didn't write it out. Instead we want to catch
+ * the case where we didn't actually read the block properly, which
+ * would have !PageUptodate && !PageError, as we clear PageError before
+ * reading.
+ */
if (fs_info->sectorsize < PAGE_SIZE) {
- bool uptodate;
+ bool uptodate, error;
uptodate = btrfs_subpage_test_uptodate(fs_info, page,
eb->start, eb->len);
- WARN_ON(!uptodate);
+ error = btrfs_subpage_test_error(fs_info, page, eb->start, eb->len);
+ WARN_ON(!uptodate && !error);
} else {
- WARN_ON(!PageUptodate(page));
+ WARN_ON(!PageUptodate(page) && !PageError(page));
}
}
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 5a36add21305..c28ceddefae4 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -261,6 +261,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start;
em->mod_start = merge->mod_start;
em->generation = max(em->generation, merge->generation);
+ set_bit(EXTENT_FLAG_MERGED, &em->flags);
rb_erase_cached(&merge->rb_node, &tree->map);
RB_CLEAR_NODE(&merge->rb_node);
@@ -278,6 +279,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
RB_CLEAR_NODE(&merge->rb_node);
em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start;
em->generation = max(em->generation, merge->generation);
+ set_bit(EXTENT_FLAG_MERGED, &em->flags);
free_extent_map(merge);
}
}
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 8e217337dff9..d2fa32ffe304 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -25,6 +25,8 @@ enum {
EXTENT_FLAG_FILLING,
/* filesystem extent mapping type */
EXTENT_FLAG_FS_MAPPING,
+ /* This em is merged from two or more physically adjacent ems */
+ EXTENT_FLAG_MERGED,
};
struct extent_map {
@@ -40,6 +42,12 @@ struct extent_map {
u64 ram_bytes;
u64 block_start;
u64 block_len;
+
+ /*
+ * Generation of the extent map, for merged em it's the highest
+ * generation of all merged ems.
+ * For non-merged extents, it's from btrfs_file_extent_item::generation.
+ */
u64 generation;
unsigned long flags;
/* Used for chunk mappings, flag EXTENT_FLAG_FS_MAPPING must be set */
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 11204dbbe053..a0179cc62913 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -50,11 +50,14 @@ struct inode_defrag {
/* root objectid */
u64 root;
- /* last offset we were able to defrag */
- u64 last_offset;
-
- /* if we've wrapped around back to zero once already */
- int cycled;
+ /*
+ * The extent size threshold for autodefrag.
+ *
+ * This value is different for compressed/non-compressed extents,
+ * thus needs to be passed from higher layer.
+ * (aka, inode_should_defrag())
+ */
+ u32 extent_thresh;
};
static int __compare_inode_defrag(struct inode_defrag *defrag1,
@@ -107,8 +110,8 @@ static int __btrfs_add_inode_defrag(struct btrfs_inode *inode,
*/
if (defrag->transid < entry->transid)
entry->transid = defrag->transid;
- if (defrag->last_offset > entry->last_offset)
- entry->last_offset = defrag->last_offset;
+ entry->extent_thresh = min(defrag->extent_thresh,
+ entry->extent_thresh);
return -EEXIST;
}
}
@@ -134,7 +137,7 @@ static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info)
* enabled
*/
int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
- struct btrfs_inode *inode)
+ struct btrfs_inode *inode, u32 extent_thresh)
{
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -160,6 +163,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
defrag->ino = btrfs_ino(inode);
defrag->transid = transid;
defrag->root = root->root_key.objectid;
+ defrag->extent_thresh = extent_thresh;
spin_lock(&fs_info->defrag_inodes_lock);
if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) {
@@ -179,34 +183,6 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
}
/*
- * Requeue the defrag object. If there is a defrag object that points to
- * the same inode in the tree, we will merge them together (by
- * __btrfs_add_inode_defrag()) and free the one that we want to requeue.
- */
-static void btrfs_requeue_inode_defrag(struct btrfs_inode *inode,
- struct inode_defrag *defrag)
-{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- int ret;
-
- if (!__need_auto_defrag(fs_info))
- goto out;
-
- /*
- * Here we don't check the IN_DEFRAG flag, because we need merge
- * them together.
- */
- spin_lock(&fs_info->defrag_inodes_lock);
- ret = __btrfs_add_inode_defrag(inode, defrag);
- spin_unlock(&fs_info->defrag_inodes_lock);
- if (ret)
- goto out;
- return;
-out:
- kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
-}
-
-/*
* pick the defragable inode that we want, if it doesn't exist, we will get
* the next one.
*/
@@ -278,8 +254,14 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
struct btrfs_root *inode_root;
struct inode *inode;
struct btrfs_ioctl_defrag_range_args range;
- int num_defrag;
- int ret;
+ int ret = 0;
+ u64 cur = 0;
+
+again:
+ if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
+ goto cleanup;
+ if (!__need_auto_defrag(fs_info))
+ goto cleanup;
/* get the inode */
inode_root = btrfs_get_fs_root(fs_info, defrag->root, true);
@@ -295,39 +277,30 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
goto cleanup;
}
+ if (cur >= i_size_read(inode)) {
+ iput(inode);
+ goto cleanup;
+ }
+
/* do a chunk of defrag */
clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
memset(&range, 0, sizeof(range));
range.len = (u64)-1;
- range.start = defrag->last_offset;
+ range.start = cur;
+ range.extent_thresh = defrag->extent_thresh;
sb_start_write(fs_info->sb);
- num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
+ ret = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
BTRFS_DEFRAG_BATCH);
sb_end_write(fs_info->sb);
- /*
- * if we filled the whole defrag batch, there
- * must be more work to do. Queue this defrag
- * again
- */
- if (num_defrag == BTRFS_DEFRAG_BATCH) {
- defrag->last_offset = range.start;
- btrfs_requeue_inode_defrag(BTRFS_I(inode), defrag);
- } else if (defrag->last_offset && !defrag->cycled) {
- /*
- * we didn't fill our defrag batch, but
- * we didn't start at zero. Make sure we loop
- * around to the start of the file.
- */
- defrag->last_offset = 0;
- defrag->cycled = 1;
- btrfs_requeue_inode_defrag(BTRFS_I(inode), defrag);
- } else {
- kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
- }
-
iput(inode);
- return 0;
+
+ if (ret < 0)
+ goto cleanup;
+
+ cur = max(cur + fs_info->sectorsize, range.start);
+ goto again;
+
cleanup:
kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
return ret;
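
After this rework, __btrfs_run_defrag_inode() walks the whole file in one pass instead of requeueing itself, and forward progress is guaranteed because the cursor advances by at least one sector per iteration. A condensed sketch of the resulting loop, using names from the patch except defrag_still_enabled(), a hypothetical stand-in for the REMOUNTING and __need_auto_defrag() checks:

	u64 cur = 0;

	while (cur < i_size_read(inode) && defrag_still_enabled(fs_info)) {
		memset(&range, 0, sizeof(range));
		range.len = (u64)-1;
		range.start = cur;
		range.extent_thresh = defrag->extent_thresh;
		ret = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
					BTRFS_DEFRAG_BATCH);
		if (ret < 0)
			break;
		/* btrfs_defrag_file() advances range.start past the scanned
		 * area; max() with cur + sectorsize guarantees progress. */
		cur = max(cur + fs_info->sectorsize, range.start);
	}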
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3b2403b6127f..5bbea5ec31fc 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -560,12 +560,12 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
}
static inline void inode_should_defrag(struct btrfs_inode *inode,
- u64 start, u64 end, u64 num_bytes, u64 small_write)
+ u64 start, u64 end, u64 num_bytes, u32 small_write)
{
/* If this is a small write inside eof, kick off a defrag */
if (num_bytes < small_write &&
(start > 0 || end + 1 < inode->disk_i_size))
- btrfs_add_inode_defrag(NULL, inode);
+ btrfs_add_inode_defrag(NULL, inode, small_write);
}
/*
@@ -7600,6 +7600,34 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
}
len = min(len, em->len - (start - em->start));
+
+ /*
+ * If we have a NOWAIT request and the range contains multiple extents
+ * (or a mix of extents and holes), then we return -EAGAIN to make the
+ * caller fallback to a context where it can do a blocking (without
+ * NOWAIT) request. This way we avoid doing partial IO and returning
+ * success to the caller, which is not optimal for writes and for reads
+ * it can result in unexpected behaviour for an application.
+ *
+ * When doing a read, because we use IOMAP_DIO_PARTIAL when calling
+ * iomap_dio_rw(), we can end up returning less data than what the caller
+ * asked for, resulting in an unexpected, and incorrect, short read.
+ * That is, the caller asked to read N bytes and we return less than that,
+ * which is wrong unless we are crossing EOF. This happens if we get a
+ * page fault error when trying to fault in pages for the buffer that is
+ * associated to the struct iov_iter passed to iomap_dio_rw(), and we
+ * have previously submitted bios for other extents in the range, in
+ * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of
+ * those bios have completed by the time we get the page fault error,
+ * which we return back to our caller - we should only return EIOCBQUEUED
+ * after we have submitted bios for all the extents in the range.
+ */
+ if ((flags & IOMAP_NOWAIT) && len < length) {
+ free_extent_map(em);
+ ret = -EAGAIN;
+ goto unlock_err;
+ }
+
if (write) {
ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
start, len);
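
To illustrate the contract the long comment above describes (a hypothetical caller-side sketch, not code from the patch): -EAGAIN from a NOWAIT direct-IO request tells the caller to retry the whole range in a blocking context rather than accept a partial result.

	/* Names are illustrative only. */
	ssize_t ret = do_dio(iocb, iter);	/* IOCB_NOWAIT set */
	if (ret == -EAGAIN) {
		iocb->ki_flags &= ~IOCB_NOWAIT;
		ret = do_dio(iocb, iter);	/* blocking retry, full range */
	}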
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 927771d1853f..8d47ec5fc4f4 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1012,8 +1012,155 @@ out:
return ret;
}
+/*
+ * Defrag specific helper to get an extent map.
+ *
+ * Differences between this and btrfs_get_extent() are:
+ *
+ * - No extent_map will be added to inode->extent_tree
+ * To reduce memory usage in the long run.
+ *
+ * - Extra optimization to skip file extents older than @newer_than
+ * By using btrfs_search_forward() we can skip entire file ranges that
+ * have extents created in past transactions, because btrfs_search_forward()
+ * will not visit leaves and nodes with a generation smaller than given
+ * minimal generation threshold (@newer_than).
+ *
+ * Return valid em if we find a file extent matching the requirement.
+ * Return NULL if we can not find a file extent matching the requirement.
+ *
+ * Return ERR_PTR() for error.
+ */
+static struct extent_map *defrag_get_extent(struct btrfs_inode *inode,
+ u64 start, u64 newer_than)
+{
+ struct btrfs_root *root = inode->root;
+ struct btrfs_file_extent_item *fi;
+ struct btrfs_path path = { 0 };
+ struct extent_map *em;
+ struct btrfs_key key;
+ u64 ino = btrfs_ino(inode);
+ int ret;
+
+ em = alloc_extent_map();
+ if (!em) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ key.objectid = ino;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = start;
+
+ if (newer_than) {
+ ret = btrfs_search_forward(root, &key, &path, newer_than);
+ if (ret < 0)
+ goto err;
+ /* Can't find anything newer */
+ if (ret > 0)
+ goto not_found;
+ } else {
+ ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
+ if (ret < 0)
+ goto err;
+ }
+ if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
+ /*
+ * If btrfs_search_slot() makes the path point beyond nritems,
+ * we should not have an empty leaf, as this inode must at
+ * least have its INODE_ITEM.
+ */
+ ASSERT(btrfs_header_nritems(path.nodes[0]));
+ path.slots[0] = btrfs_header_nritems(path.nodes[0]) - 1;
+ }
+ btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
+ /* Perfect match, no need to go one slot back */
+ if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY &&
+ key.offset == start)
+ goto iterate;
+
+ /* We didn't find a perfect match, needs to go one slot back */
+ if (path.slots[0] > 0) {
+ btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
+ if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
+ path.slots[0]--;
+ }
+
+iterate:
+ /* Iterate through the path to find a file extent covering @start */
+ while (true) {
+ u64 extent_end;
+
+ if (path.slots[0] >= btrfs_header_nritems(path.nodes[0]))
+ goto next;
+
+ btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
+
+ /*
+ * We may go one slot back to INODE_REF/XATTR item, then
+ * need to go forward until we reach an EXTENT_DATA.
+ * But we should still have the correct ino as key.objectid.
+ */
+ if (WARN_ON(key.objectid < ino) || key.type < BTRFS_EXTENT_DATA_KEY)
+ goto next;
+
+ /* It's beyond our target range, so definitely no extent found */
+ if (key.objectid > ino || key.type > BTRFS_EXTENT_DATA_KEY)
+ goto not_found;
+
+ /*
+ * | |<- File extent ->|
+ * \- start
+ *
+ * This means there is a hole between start and key.offset.
+ */
+ if (key.offset > start) {
+ em->start = start;
+ em->orig_start = start;
+ em->block_start = EXTENT_MAP_HOLE;
+ em->len = key.offset - start;
+ break;
+ }
+
+ fi = btrfs_item_ptr(path.nodes[0], path.slots[0],
+ struct btrfs_file_extent_item);
+ extent_end = btrfs_file_extent_end(&path);
+
+ /*
+ * |<- file extent ->| |
+ * \- start
+ *
+ * We haven't reached start, search next slot.
+ */
+ if (extent_end <= start)
+ goto next;
+
+ /* Now this extent covers @start, convert it to em */
+ btrfs_extent_item_to_extent_map(inode, &path, fi, false, em);
+ break;
+next:
+ ret = btrfs_next_item(root, &path);
+ if (ret < 0)
+ goto err;
+ if (ret > 0)
+ goto not_found;
+ }
+ btrfs_release_path(&path);
+ return em;
+
+not_found:
+ btrfs_release_path(&path);
+ free_extent_map(em);
+ return NULL;
+
+err:
+ btrfs_release_path(&path);
+ free_extent_map(em);
+ return ERR_PTR(ret);
+}
+
static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
- bool locked)
+ u64 newer_than, bool locked)
{
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
@@ -1028,6 +1175,20 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
em = lookup_extent_mapping(em_tree, start, sectorsize);
read_unlock(&em_tree->lock);
+ /*
+ * We can get a merged extent, in that case, we need to re-search
+ * tree to get the original em for defrag.
+ *
+ * If @newer_than is 0 or em::generation < newer_than, we can trust
+ * this em, as either we don't care about the generation, or the
+ * merged extent map will be rejected anyway.
+ */
+ if (em && test_bit(EXTENT_FLAG_MERGED, &em->flags) &&
+ newer_than && em->generation >= newer_than) {
+ free_extent_map(em);
+ em = NULL;
+ }
+
if (!em) {
struct extent_state *cached = NULL;
u64 end = start + sectorsize - 1;
@@ -1035,7 +1196,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
/* get the big lock and read metadata off disk */
if (!locked)
lock_extent_bits(io_tree, start, end, &cached);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, sectorsize);
+ em = defrag_get_extent(BTRFS_I(inode), start, newer_than);
if (!locked)
unlock_extent_cached(io_tree, start, end, &cached);
@@ -1046,23 +1207,42 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
return em;
}
+static u32 get_extent_max_capacity(const struct extent_map *em)
+{
+ if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+ return BTRFS_MAX_COMPRESSED;
+ return BTRFS_MAX_EXTENT_SIZE;
+}
+
static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
bool locked)
{
struct extent_map *next;
- bool ret = true;
+ bool ret = false;
/* this is the last extent */
if (em->start + em->len >= i_size_read(inode))
return false;
- next = defrag_lookup_extent(inode, em->start + em->len, locked);
+ /*
+ * We want to check if the next extent can be merged with the current
+ * one, which can be an extent created in a past generation, so we pass
+ * a minimum generation of 0 to defrag_lookup_extent().
+ */
+ next = defrag_lookup_extent(inode, em->start + em->len, 0, locked);
+ /* No more em or hole */
if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
- ret = false;
- else if ((em->block_start + em->block_len == next->block_start) &&
- (em->block_len > SZ_128K && next->block_len > SZ_128K))
- ret = false;
-
+ goto out;
+ if (test_bit(EXTENT_FLAG_PREALLOC, &next->flags))
+ goto out;
+ /*
+ * If the next extent is at its max capacity, defragging current extent
+ * makes no sense, as the total number of extents won't change.
+ */
+ if (next->len >= get_extent_max_capacity(em))
+ goto out;
+ ret = true;
+out:
free_extent_map(next);
return ret;
}
@@ -1186,8 +1366,10 @@ struct defrag_target_range {
static int defrag_collect_targets(struct btrfs_inode *inode,
u64 start, u64 len, u32 extent_thresh,
u64 newer_than, bool do_compress,
- bool locked, struct list_head *target_list)
+ bool locked, struct list_head *target_list,
+ u64 *last_scanned_ret)
{
+ bool last_is_target = false;
u64 cur = start;
int ret = 0;
@@ -1197,7 +1379,9 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
bool next_mergeable = true;
u64 range_len;
- em = defrag_lookup_extent(&inode->vfs_inode, cur, locked);
+ last_is_target = false;
+ em = defrag_lookup_extent(&inode->vfs_inode, cur,
+ newer_than, locked);
if (!em)
break;
@@ -1254,6 +1438,13 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
if (range_len >= extent_thresh)
goto next;
+ /*
+ * Skip extents already at their max capacity; this is mostly for
+ * compressed extents, whose max capacity is only 128K.
+ */
+ if (em->len >= get_extent_max_capacity(em))
+ goto next;
+
next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em,
locked);
if (!next_mergeable) {
@@ -1272,6 +1463,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
}
add:
+ last_is_target = true;
range_len = min(extent_map_end(em), start + len) - cur;
/*
* This one is a good target, check if it can be merged into
@@ -1315,6 +1507,17 @@ next:
kfree(entry);
}
}
+ if (!ret && last_scanned_ret) {
+ /*
+ * If the last extent is not a target, the caller can skip to
+ * the end of that extent.
+ * Otherwise, we can only go the end of the specified range.
+ */
+ if (!last_is_target)
+ *last_scanned_ret = max(cur, *last_scanned_ret);
+ else
+ *last_scanned_ret = max(start + len, *last_scanned_ret);
+ }
return ret;
}
@@ -1373,7 +1576,8 @@ static int defrag_one_locked_target(struct btrfs_inode *inode,
}
static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
- u32 extent_thresh, u64 newer_than, bool do_compress)
+ u32 extent_thresh, u64 newer_than, bool do_compress,
+ u64 *last_scanned_ret)
{
struct extent_state *cached_state = NULL;
struct defrag_target_range *entry;
@@ -1419,7 +1623,7 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
*/
ret = defrag_collect_targets(inode, start, len, extent_thresh,
newer_than, do_compress, true,
- &target_list);
+ &target_list, last_scanned_ret);
if (ret < 0)
goto unlock_extent;
@@ -1454,7 +1658,8 @@ static int defrag_one_cluster(struct btrfs_inode *inode,
u64 start, u32 len, u32 extent_thresh,
u64 newer_than, bool do_compress,
unsigned long *sectors_defragged,
- unsigned long max_sectors)
+ unsigned long max_sectors,
+ u64 *last_scanned_ret)
{
const u32 sectorsize = inode->root->fs_info->sectorsize;
struct defrag_target_range *entry;
@@ -1465,7 +1670,7 @@ static int defrag_one_cluster(struct btrfs_inode *inode,
BUILD_BUG_ON(!IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE));
ret = defrag_collect_targets(inode, start, len, extent_thresh,
newer_than, do_compress, false,
- &target_list);
+ &target_list, NULL);
if (ret < 0)
goto out;
@@ -1482,6 +1687,15 @@ static int defrag_one_cluster(struct btrfs_inode *inode,
range_len = min_t(u32, range_len,
(max_sectors - *sectors_defragged) * sectorsize);
+ /*
+ * If defrag_one_range() has updated last_scanned_ret,
+ * our range may already be invalid (e.g. hole punched).
+ * Skip if our range is before last_scanned_ret, as there is
+ * no need to defrag the range anymore.
+ */
+ if (entry->start + range_len <= *last_scanned_ret)
+ continue;
+
if (ra)
page_cache_sync_readahead(inode->vfs_inode.i_mapping,
ra, NULL, entry->start >> PAGE_SHIFT,
@@ -1494,7 +1708,8 @@ static int defrag_one_cluster(struct btrfs_inode *inode,
* accounting.
*/
ret = defrag_one_range(inode, entry->start, range_len,
- extent_thresh, newer_than, do_compress);
+ extent_thresh, newer_than, do_compress,
+ last_scanned_ret);
if (ret < 0)
break;
*sectors_defragged += range_len >>
@@ -1505,6 +1720,8 @@ out:
list_del_init(&entry->list);
kfree(entry);
}
+ if (ret >= 0)
+ *last_scanned_ret = max(*last_scanned_ret, start + len);
return ret;
}
@@ -1590,6 +1807,7 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
while (cur < last_byte) {
const unsigned long prev_sectors_defragged = sectors_defragged;
+ u64 last_scanned = cur;
u64 cluster_end;
/* The cluster size 256K should always be page aligned */
@@ -1619,8 +1837,8 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
BTRFS_I(inode)->defrag_compress = compress_type;
ret = defrag_one_cluster(BTRFS_I(inode), ra, cur,
cluster_end + 1 - cur, extent_thresh,
- newer_than, do_compress,
- &sectors_defragged, max_to_defrag);
+ newer_than, do_compress, &sectors_defragged,
+ max_to_defrag, &last_scanned);
if (sectors_defragged > prev_sectors_defragged)
balance_dirty_pages_ratelimited(inode->i_mapping);
@@ -1628,7 +1846,7 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
btrfs_inode_unlock(inode, 0);
if (ret < 0)
break;
- cur = cluster_end + 1;
+ cur = max(cluster_end + 1, last_scanned);
if (ret > 0) {
ret = 0;
break;
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index 0fb90cbe7669..e6e28a9c7987 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -380,6 +380,17 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
kunmap(cur_page);
cur_in += LZO_LEN;
+ if (seg_len > lzo1x_worst_compress(PAGE_SIZE)) {
+ /*
+ * seg_len shouldn't be larger than what we have allocated
+ * for workspace->cbuf.
+ */
+ btrfs_err(fs_info, "unexpectedly large lzo segment len %u",
+ seg_len);
+ ret = -EIO;
+ goto out;
+ }
+
/* Copy the compressed segment payload into workspace */
copy_compressed_segment(cb, workspace->cbuf, seg_len, &cur_in);
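
For reference on the bound used in the new check: lzo1x_worst_compress() is defined in include/linux/lzo.h as

	lzo1x_worst_compress(x) = x + x/16 + 64 + 3

so with 4 KiB pages any seg_len above 4096 + 256 + 64 + 3 = 4419 bytes is rejected, matching the size allocated for workspace->cbuf.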
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index f12dc687350c..30d42ea655ce 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1197,13 +1197,20 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
goto out;
/*
+ * Unlock the qgroup_ioctl_lock mutex before waiting for the rescan worker to
+ * complete. Otherwise we can deadlock because btrfs_remove_qgroup() needs
+ * to lock that mutex while holding a transaction handle and the rescan
+ * worker needs to commit a transaction.
+ */
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
+
+ /*
* Request qgroup rescan worker to complete and wait for it. This wait
* must be done before transaction start for quota disable since it may
* deadlock with transaction by the qgroup rescan worker.
*/
clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
btrfs_qgroup_wait_for_completion(fs_info, false);
- mutex_unlock(&fs_info->qgroup_ioctl_lock);
/*
* 1 For the root item
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index f5465197996d..9d8054839782 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3960,6 +3960,19 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
int rw = 0;
int err = 0;
+ /*
+ * This only gets set if we had a half-deleted snapshot on mount. We
+ * cannot allow relocation to start while we're still trying to clean up
+ * these pending deletions.
+ */
+ ret = wait_on_bit(&fs_info->flags, BTRFS_FS_UNFINISHED_DROPS, TASK_INTERRUPTIBLE);
+ if (ret)
+ return ret;
+
+ /* We may have been woken up by close_ctree, so bail if we're closing. */
+ if (btrfs_fs_closing(fs_info))
+ return -EINTR;
+
bg = btrfs_lookup_block_group(fs_info, group_start);
if (!bg)
return -ENOENT;
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 3d68d2dcd83e..ca7426ef61c8 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -278,6 +278,21 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info)
WARN_ON(!test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state));
if (btrfs_root_refs(&root->root_item) == 0) {
+ struct btrfs_key drop_key;
+
+ btrfs_disk_key_to_cpu(&drop_key, &root->root_item.drop_progress);
+ /*
+ * If we have a non-zero drop_progress then we know we
+ * made it partly through deleting this snapshot, and
+ * thus we need to make sure we block any balance from
+ * happening until this snapshot is completely dropped.
+ */
+ if (drop_key.objectid != 0 || drop_key.type != 0 ||
+ drop_key.offset != 0) {
+ set_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags);
+ set_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state);
+ }
+
set_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
btrfs_add_dead_root(root);
}
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
index 29bd8c7a7706..ef7ae20d2b77 100644
--- a/fs/btrfs/subpage.c
+++ b/fs/btrfs/subpage.c
@@ -736,7 +736,7 @@ void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page,
* Since we own the page lock, no one else could touch subpage::writers
* and we are safe to do several atomic operations without spinlock.
*/
- if (atomic_read(&subpage->writers))
+ if (atomic_read(&subpage->writers) == 0)
/* No writers, locked by plain lock_page() */
return unlock_page(page);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index c3cfdfd8de9b..1f1c25db6f6b 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -854,7 +854,37 @@ btrfs_attach_transaction_barrier(struct btrfs_root *root)
static noinline void wait_for_commit(struct btrfs_transaction *commit,
const enum btrfs_trans_state min_state)
{
- wait_event(commit->commit_wait, commit->state >= min_state);
+ struct btrfs_fs_info *fs_info = commit->fs_info;
+ u64 transid = commit->transid;
+ bool put = false;
+
+ while (1) {
+ wait_event(commit->commit_wait, commit->state >= min_state);
+ if (put)
+ btrfs_put_transaction(commit);
+
+ if (min_state < TRANS_STATE_COMPLETED)
+ break;
+
+ /*
+ * A transaction isn't really completed until all of the
+ * previous transactions are completed, but with fsync we can
+ * end up with SUPER_COMMITTED transactions before a COMPLETED
+ * transaction. Wait for those.
+ */
+
+ spin_lock(&fs_info->trans_lock);
+ commit = list_first_entry_or_null(&fs_info->trans_list,
+ struct btrfs_transaction,
+ list);
+ if (!commit || commit->transid > transid) {
+ spin_unlock(&fs_info->trans_lock);
+ break;
+ }
+ refcount_inc(&commit->use_count);
+ put = true;
+ spin_unlock(&fs_info->trans_lock);
+ }
}
int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid)
@@ -1320,6 +1350,32 @@ again:
}
/*
+ * If we had a pending drop we need to see if there are any others left in our
+ * dead roots list, and if not clear our bit and wake any waiters.
+ */
+void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info)
+{
+ /*
+ * We put the drop-in-progress roots at the front of the list, so if the
+ * first entry doesn't have UNFINISHED_DROP set we can wake everybody
+ * up.
+ */
+ spin_lock(&fs_info->trans_lock);
+ if (!list_empty(&fs_info->dead_roots)) {
+ struct btrfs_root *root = list_first_entry(&fs_info->dead_roots,
+ struct btrfs_root,
+ root_list);
+ if (test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state)) {
+ spin_unlock(&fs_info->trans_lock);
+ return;
+ }
+ }
+ spin_unlock(&fs_info->trans_lock);
+
+ btrfs_wake_unfinished_drop(fs_info);
+}
+
+/*
* dead roots are old snapshots that need to be deleted. This allocates
* a dirty root struct and adds it into the list of dead roots that need to
* be deleted
@@ -1331,7 +1387,12 @@ void btrfs_add_dead_root(struct btrfs_root *root)
spin_lock(&fs_info->trans_lock);
if (list_empty(&root->root_list)) {
btrfs_grab_root(root);
- list_add_tail(&root->root_list, &fs_info->dead_roots);
+
+ /* We want to process the partially complete drops first. */
+ if (test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state))
+ list_add(&root->root_list, &fs_info->dead_roots);
+ else
+ list_add_tail(&root->root_list, &fs_info->dead_roots);
}
spin_unlock(&fs_info->trans_lock);
}
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 9402d8d94484..ba8a9826eb37 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -216,6 +216,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid);
void btrfs_add_dead_root(struct btrfs_root *root);
int btrfs_defrag_root(struct btrfs_root *root);
+void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info);
int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root);
int btrfs_commit_transaction(struct btrfs_trans_handle *trans);
void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans);
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 9fd145f1c4bc..aae5697dde32 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -1682,6 +1682,7 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
*/
for (slot = 0; slot < nritems; slot++) {
u32 item_end_expected;
+ u64 item_data_end;
int ret;
btrfs_item_key_to_cpu(leaf, &key, slot);
@@ -1696,6 +1697,8 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
return -EUCLEAN;
}
+ item_data_end = (u64)btrfs_item_offset(leaf, slot) +
+ btrfs_item_size(leaf, slot);
/*
* Make sure the offset and ends are right, remember that the
* item data starts at the end of the leaf and grows towards the
@@ -1706,11 +1709,10 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
else
item_end_expected = btrfs_item_offset(leaf,
slot - 1);
- if (unlikely(btrfs_item_data_end(leaf, slot) != item_end_expected)) {
+ if (unlikely(item_data_end != item_end_expected)) {
generic_err(leaf, slot,
- "unexpected item end, have %u expect %u",
- btrfs_item_data_end(leaf, slot),
- item_end_expected);
+ "unexpected item end, have %llu expect %u",
+ item_data_end, item_end_expected);
return -EUCLEAN;
}
@@ -1719,12 +1721,10 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
* just in case all the items are consistent to each other, but
* all point outside of the leaf.
*/
- if (unlikely(btrfs_item_data_end(leaf, slot) >
- BTRFS_LEAF_DATA_SIZE(fs_info))) {
+ if (unlikely(item_data_end > BTRFS_LEAF_DATA_SIZE(fs_info))) {
generic_err(leaf, slot,
- "slot end outside of leaf, have %u expect range [0, %u]",
- btrfs_item_data_end(leaf, slot),
- BTRFS_LEAF_DATA_SIZE(fs_info));
+ "slot end outside of leaf, have %llu expect range [0, %u]",
+ item_data_end, BTRFS_LEAF_DATA_SIZE(fs_info));
return -EUCLEAN;
}
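
The point of computing item_data_end in 64 bits is that a corrupted leaf can
carry offset/size values whose 32-bit sum wraps and slips past the bounds
checks. A small standalone sketch of that failure mode (hypothetical corrupt
values):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t offset = 0xffffff00u; /* bogus on-disk item offset */
	uint32_t size = 0x00000200u;   /* bogus on-disk item size   */

	uint32_t end32 = offset + size;           /* wraps to 0x100, fooling a u32 check */
	uint64_t end64 = (uint64_t)offset + size; /* 0x100000100, caught by the check    */

	printf("u32 end %#x vs u64 end %#llx\n",
	       end32, (unsigned long long)end64);
	return 0;
}
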
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 3ee014c06b82..6bc8834ac8f7 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1362,6 +1362,15 @@ again:
inode, name, namelen);
kfree(name);
iput(dir);
+ /*
+ * Whenever we need to check if a name exists or not, we
+ * check the subvolume tree. So after an unlink we must
+ * run delayed items, so that future checks for a name
+ * during log replay see that the name does not exist
+ * anymore.
+ */
+ if (!ret)
+ ret = btrfs_run_delayed_items(trans);
if (ret)
goto out;
goto again;
@@ -1614,6 +1623,15 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
*/
if (!ret && inode->i_nlink == 0)
inc_nlink(inode);
+ /*
+ * Whenever we need to check if a name exists or
+ * not, we check the subvolume tree. So after an
+ * unlink we must run delayed items, so that future
+ * checks for a name during log replay see that the
+ * name does not exist anymore.
+ */
+ if (!ret)
+ ret = btrfs_run_delayed_items(trans);
}
if (ret < 0)
goto out;
@@ -4635,7 +4653,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
/*
* Log all prealloc extents beyond the inode's i_size to make sure we do not
- * lose them after doing a fast fsync and replaying the log. We scan the
+ * lose them after doing a full/fast fsync and replaying the log. We scan the
* subvolume's root instead of iterating the inode's extent map tree because
* otherwise we can log incorrect extent items based on extent map conversion.
* That can happen due to the fact that extent maps are merged when they
@@ -5414,6 +5432,7 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
struct btrfs_log_ctx *ctx,
bool *need_log_inode_item)
{
+ const u64 i_size = i_size_read(&inode->vfs_inode);
struct btrfs_root *root = inode->root;
int ins_start_slot = 0;
int ins_nr = 0;
@@ -5434,13 +5453,21 @@ again:
if (min_key->type > max_key->type)
break;
- if (min_key->type == BTRFS_INODE_ITEM_KEY)
+ if (min_key->type == BTRFS_INODE_ITEM_KEY) {
*need_log_inode_item = false;
-
- if ((min_key->type == BTRFS_INODE_REF_KEY ||
- min_key->type == BTRFS_INODE_EXTREF_KEY) &&
- inode->generation == trans->transid &&
- !recursive_logging) {
+ } else if (min_key->type == BTRFS_EXTENT_DATA_KEY &&
+ min_key->offset >= i_size) {
+ /*
+ * Extents at and beyond eof are logged with
+ * btrfs_log_prealloc_extents().
+ * Only regular files have BTRFS_EXTENT_DATA_KEY keys,
+ * and no keys greater than that, so bail out.
+ */
+ break;
+ } else if ((min_key->type == BTRFS_INODE_REF_KEY ||
+ min_key->type == BTRFS_INODE_EXTREF_KEY) &&
+ inode->generation == trans->transid &&
+ !recursive_logging) {
u64 other_ino = 0;
u64 other_parent = 0;
@@ -5471,10 +5498,8 @@ again:
btrfs_release_path(path);
goto next_key;
}
- }
-
- /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */
- if (min_key->type == BTRFS_XATTR_ITEM_KEY) {
+ } else if (min_key->type == BTRFS_XATTR_ITEM_KEY) {
+ /* Skip xattrs, logged later with btrfs_log_all_xattrs() */
if (ins_nr == 0)
goto next_slot;
ret = copy_items(trans, inode, dst_path, path,
@@ -5527,9 +5552,21 @@ next_key:
break;
}
}
- if (ins_nr)
+ if (ins_nr) {
ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
ins_nr, inode_only, logged_isize);
+ if (ret)
+ return ret;
+ }
+
+ if (inode_only == LOG_INODE_ALL && S_ISREG(inode->vfs_inode.i_mode)) {
+ /*
+ * Release the path because otherwise we might attempt to double
+ * lock the same leaf with btrfs_log_prealloc_extents() below.
+ */
+ btrfs_release_path(path);
+ ret = btrfs_log_prealloc_extents(trans, inode, dst_path);
+ }
return ret;
}
diff --git a/fs/buffer.c b/fs/buffer.c
index 8e112b6bd371..a17c386a142c 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3024,12 +3024,16 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
if (test_set_buffer_req(bh) && (op == REQ_OP_WRITE))
clear_buffer_write_io_error(bh);
- bio = bio_alloc(GFP_NOIO, 1);
+ if (buffer_meta(bh))
+ op_flags |= REQ_META;
+ if (buffer_prio(bh))
+ op_flags |= REQ_PRIO;
+
+ bio = bio_alloc(bh->b_bdev, 1, op | op_flags, GFP_NOIO);
fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO);
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
- bio_set_dev(bio, bh->b_bdev);
bio->bi_write_hint = write_hint;
bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
@@ -3038,12 +3042,6 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
bio->bi_end_io = end_bio_bh_io_sync;
bio->bi_private = bh;
- if (buffer_meta(bh))
- op_flags |= REQ_META;
- if (buffer_prio(bh))
- op_flags |= REQ_PRIO;
- bio_set_op_attrs(bio, op, op_flags);
-
/* Take care of bh's that straddle the end of the device */
guard_bio_eod(bio);
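
This hunk follows the bio API conversion applied throughout the series: the
target block device and the op/flags move into bio_alloc() itself, replacing
the separate bio_set_dev() and bio_set_op_attrs() calls. A minimal
kernel-style sketch of the new calling convention (hypothetical helper, not
part of the patch):

#include <linux/bio.h>
#include <linux/blkdev.h>

static void write_one_page_example(struct block_device *bdev,
				   struct page *page, sector_t sector)
{
	struct bio *bio;

	/* device and REQ_OP_* flags are now bio_alloc() arguments */
	bio = bio_alloc(bdev, 1, REQ_OP_WRITE | REQ_SYNC, GFP_NOIO);
	bio->bi_iter.bi_sector = sector;
	bio_add_page(bio, page, PAGE_SIZE, 0);
	submit_bio(bio);
}
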
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 51c968cd00a6..ae93cee9d25d 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -254,7 +254,7 @@ static bool cachefiles_shorten_object(struct cachefiles_object *object,
ret = cachefiles_inject_write_error();
if (ret == 0)
ret = vfs_fallocate(file, FALLOC_FL_ZERO_RANGE,
- new_size, dio_size);
+ new_size, dio_size - new_size);
if (ret < 0) {
trace_cachefiles_io_error(object, file_inode(file), ret,
cachefiles_trace_fallocate_error);
diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
index 83f41bd0c3a9..35465109d9c4 100644
--- a/fs/cachefiles/xattr.c
+++ b/fs/cachefiles/xattr.c
@@ -28,6 +28,11 @@ struct cachefiles_xattr {
static const char cachefiles_xattr_cache[] =
XATTR_USER_PREFIX "CacheFiles.cache";
+struct cachefiles_vol_xattr {
+ __be32 reserved; /* Reserved, should be 0 */
+ __u8 data[]; /* netfs volume coherency data */
+} __packed;
+
/*
* set the state xattr on a cache file
*/
@@ -185,6 +190,7 @@ void cachefiles_prepare_to_write(struct fscache_cookie *cookie)
*/
bool cachefiles_set_volume_xattr(struct cachefiles_volume *volume)
{
+ struct cachefiles_vol_xattr *buf;
unsigned int len = volume->vcookie->coherency_len;
const void *p = volume->vcookie->coherency;
struct dentry *dentry = volume->dentry;
@@ -192,10 +198,17 @@ bool cachefiles_set_volume_xattr(struct cachefiles_volume *volume)
_enter("%x,#%d", volume->vcookie->debug_id, len);
+ len += sizeof(*buf);
+ buf = kmalloc(len, GFP_KERNEL);
+ if (!buf)
+ return false;
+ buf->reserved = cpu_to_be32(0);
+ /* copy only the coherency payload, not the whole xattr length */
+ memcpy(buf->data, p, len - sizeof(*buf));
+
ret = cachefiles_inject_write_error();
if (ret == 0)
ret = vfs_setxattr(&init_user_ns, dentry, cachefiles_xattr_cache,
- p, len, 0);
+ buf, len, 0);
if (ret < 0) {
trace_cachefiles_vfs_error(NULL, d_inode(dentry), ret,
cachefiles_trace_setxattr_error);
@@ -209,6 +222,7 @@ bool cachefiles_set_volume_xattr(struct cachefiles_volume *volume)
cachefiles_coherency_vol_set_ok);
}
+ kfree(buf);
_leave(" = %d", ret);
return ret == 0;
}
@@ -218,7 +232,7 @@ bool cachefiles_set_volume_xattr(struct cachefiles_volume *volume)
*/
int cachefiles_check_volume_xattr(struct cachefiles_volume *volume)
{
- struct cachefiles_xattr *buf;
+ struct cachefiles_vol_xattr *buf;
struct dentry *dentry = volume->dentry;
unsigned int len = volume->vcookie->coherency_len;
const void *p = volume->vcookie->coherency;
@@ -228,6 +242,7 @@ int cachefiles_check_volume_xattr(struct cachefiles_volume *volume)
_enter("");
+ len += sizeof(*buf);
buf = kmalloc(len, GFP_KERNEL);
if (!buf)
return -ENOMEM;
@@ -245,7 +260,9 @@ int cachefiles_check_volume_xattr(struct cachefiles_volume *volume)
"Failed to read xattr with error %zd", xlen);
}
why = cachefiles_coherency_vol_check_xattr;
- } else if (memcmp(buf->data, p, len) != 0) {
+ } else if (buf->reserved != cpu_to_be32(0)) {
+ why = cachefiles_coherency_vol_check_resv;
+ } else if (memcmp(buf->data, p, len - sizeof(*buf)) != 0) {
why = cachefiles_coherency_vol_check_cmp;
} else {
why = cachefiles_coherency_vol_check_ok;
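
The size bookkeeping here is easy to get backwards: once len has been bumped
by the 4-byte header, only len - sizeof(*buf) bytes of coherency payload may
be copied or compared. A standalone userspace sketch of the layout
(hypothetical payload):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct vol_xattr {
	uint32_t reserved; /* big-endian 0 on disk */
	uint8_t data[];    /* netfs coherency blob */
};

int main(void)
{
	const char payload[] = "coherency-blob"; /* hypothetical contents */
	size_t len = sizeof(payload);
	struct vol_xattr *buf;

	len += sizeof(struct vol_xattr); /* as in the patch above */
	buf = malloc(len);
	if (!buf)
		return 1;
	buf->reserved = 0;
	/* copy only the payload portion, never the full xattr length */
	memcpy(buf->data, payload, len - sizeof(*buf));
	printf("xattr is %zu bytes: 4 reserved + %zu payload\n",
	       len, len - sizeof(*buf));
	free(buf);
	return 0;
}
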
diff --git a/fs/cifs/cifs_swn.c b/fs/cifs/cifs_swn.c
index cdce1609c5c2..180c234c2f46 100644
--- a/fs/cifs/cifs_swn.c
+++ b/fs/cifs/cifs_swn.c
@@ -396,11 +396,11 @@ static int cifs_swn_resource_state_changed(struct cifs_swn_reg *swnreg, const ch
switch (state) {
case CIFS_SWN_RESOURCE_STATE_UNAVAILABLE:
cifs_dbg(FYI, "%s: resource name '%s' became unavailable\n", __func__, name);
- cifs_mark_tcp_ses_conns_for_reconnect(swnreg->tcon->ses->server, true);
+ cifs_signal_cifsd_for_reconnect(swnreg->tcon->ses->server, true);
break;
case CIFS_SWN_RESOURCE_STATE_AVAILABLE:
cifs_dbg(FYI, "%s: resource name '%s' became available\n", __func__, name);
- cifs_mark_tcp_ses_conns_for_reconnect(swnreg->tcon->ses->server, true);
+ cifs_signal_cifsd_for_reconnect(swnreg->tcon->ses->server, true);
break;
case CIFS_SWN_RESOURCE_STATE_UNKNOWN:
cifs_dbg(FYI, "%s: resource name '%s' changed to unknown state\n", __func__, name);
@@ -498,7 +498,7 @@ static int cifs_swn_reconnect(struct cifs_tcon *tcon, struct sockaddr_storage *a
goto unlock;
}
- cifs_mark_tcp_ses_conns_for_reconnect(tcon->ses->server, false);
+ cifs_signal_cifsd_for_reconnect(tcon->ses->server, false);
unlock:
mutex_unlock(&tcon->ses->server->srv_mutex);
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 082c21478686..6e5246122ee2 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -210,6 +210,9 @@ cifs_read_super(struct super_block *sb)
if (rc)
goto out_no_root;
/* tune readahead according to rsize if readahead size not set on mount */
+ if (cifs_sb->ctx->rsize == 0)
+ cifs_sb->ctx->rsize =
+ tcon->ses->server->ops->negotiate_rsize(tcon, cifs_sb->ctx);
if (cifs_sb->ctx->rasize)
sb->s_bdi->ra_pages = cifs_sb->ctx->rasize / PAGE_SIZE;
else
@@ -254,6 +257,9 @@ static void cifs_kill_sb(struct super_block *sb)
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
struct cifs_tcon *tcon;
struct cached_fid *cfid;
+ struct rb_root *root = &cifs_sb->tlink_tree;
+ struct rb_node *node;
+ struct tcon_link *tlink;
/*
* We need to release all dentries for the cached directories
@@ -263,16 +269,18 @@ static void cifs_kill_sb(struct super_block *sb)
dput(cifs_sb->root);
cifs_sb->root = NULL;
}
- tcon = cifs_sb_master_tcon(cifs_sb);
- if (tcon) {
+ node = rb_first(root);
+ while (node != NULL) {
+ tlink = rb_entry(node, struct tcon_link, tl_rbnode);
+ tcon = tlink_tcon(tlink);
cfid = &tcon->crfid;
mutex_lock(&cfid->fid_mutex);
if (cfid->dentry) {
-
dput(cfid->dentry);
cfid->dentry = NULL;
}
mutex_unlock(&cfid->fid_mutex);
+ node = rb_next(node);
}
kill_anon_super(sb);
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index d3701295402d..0df3b24a0bf4 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -132,6 +132,9 @@ extern int SendReceiveBlockingLock(const unsigned int xid,
struct smb_hdr *out_buf,
int *bytes_returned);
void
+cifs_signal_cifsd_for_reconnect(struct TCP_Server_Info *server,
+ bool all_channels);
+void
cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server,
bool mark_smb_session);
extern int cifs_reconnect(struct TCP_Server_Info *server,
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 053cb449eb16..9964c3634322 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -163,10 +163,50 @@ static void cifs_resolve_server(struct work_struct *work)
}
/*
+ * Update the tcpStatus for the server.
+ * This is used to signal the cifsd thread to call cifs_reconnect.
+ * ONLY the cifsd thread should call cifs_reconnect; any other
+ * thread must use this function instead.
+ *
+ * @server: the tcp ses for which reconnect is needed
+ * @all_channels: if this needs to be done for all channels
+ */
+void
+cifs_signal_cifsd_for_reconnect(struct TCP_Server_Info *server,
+ bool all_channels)
+{
+ struct TCP_Server_Info *pserver;
+ struct cifs_ses *ses;
+ int i;
+
+ /* If server is a channel, select the primary channel */
+ pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server;
+
+ spin_lock(&cifs_tcp_ses_lock);
+ if (!all_channels) {
+ pserver->tcpStatus = CifsNeedReconnect;
+ spin_unlock(&cifs_tcp_ses_lock);
+ return;
+ }
+
+ list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) {
+ spin_lock(&ses->chan_lock);
+ for (i = 0; i < ses->chan_count; i++)
+ ses->chans[i].server->tcpStatus = CifsNeedReconnect;
+ spin_unlock(&ses->chan_lock);
+ }
+ spin_unlock(&cifs_tcp_ses_lock);
+}
+
+/*
* Mark all sessions and tcons for reconnect.
+ * IMPORTANT: make sure that this gets called only from the
+ * cifsd thread. Any other thread must use
+ * cifs_signal_cifsd_for_reconnect() instead.
*
+ * @server: the tcp ses for which reconnect is needed
* @server needs to be previously set to CifsNeedReconnect.
- *
+ * @mark_smb_session: whether the smb sessions themselves also need to be marked
*/
void
cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server,
@@ -3924,7 +3964,8 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,
/* only send once per connect */
spin_lock(&cifs_tcp_ses_lock);
- if (server->tcpStatus != CifsNeedSessSetup) {
+ if ((server->tcpStatus != CifsNeedSessSetup) &&
+ (ses->status == CifsGood)) {
spin_unlock(&cifs_tcp_ses_lock);
return 0;
}
diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c
index 831f42458bf6..30e040da4f09 100644
--- a/fs/cifs/dfs_cache.c
+++ b/fs/cifs/dfs_cache.c
@@ -1355,7 +1355,7 @@ static void mark_for_reconnect_if_needed(struct cifs_tcon *tcon, struct dfs_cach
}
cifs_dbg(FYI, "%s: no cached or matched targets. mark dfs share for reconnect.\n", __func__);
- cifs_mark_tcp_ses_conns_for_reconnect(tcon->ses->server, true);
+ cifs_signal_cifsd_for_reconnect(tcon->ses->server, true);
}
/* Refresh dfs referral of tcon and mark it for reconnect if needed */
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index e7af802dcfa6..a2723f7cb5e9 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -3740,6 +3740,11 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
break;
}
+ if (cifs_sb->ctx->rsize == 0)
+ cifs_sb->ctx->rsize =
+ server->ops->negotiate_rsize(tlink_tcon(open_file->tlink),
+ cifs_sb->ctx);
+
rc = server->ops->wait_mtu_credits(server, cifs_sb->ctx->rsize,
&rsize, credits);
if (rc)
@@ -4474,6 +4479,11 @@ static void cifs_readahead(struct readahead_control *ractl)
}
}
+ if (cifs_sb->ctx->rsize == 0)
+ cifs_sb->ctx->rsize =
+ server->ops->negotiate_rsize(tlink_tcon(open_file->tlink),
+ cifs_sb->ctx);
+
rc = server->ops->wait_mtu_credits(server, cifs_sb->ctx->rsize,
&rsize, credits);
if (rc)
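
Both read paths above use the same lazy negotiation: an rsize of 0 means "not
negotiated yet" and is filled in on first use. Condensed into a hypothetical
helper (names taken from the diff; the helper itself is illustrative):

static unsigned int cifs_get_rsize_example(struct cifs_sb_info *cifs_sb,
					   struct cifs_tcon *tcon,
					   struct TCP_Server_Info *server)
{
	/* 0 means rsize was never set on mount; negotiate it now */
	if (cifs_sb->ctx->rsize == 0)
		cifs_sb->ctx->rsize =
			server->ops->negotiate_rsize(tcon, cifs_sb->ctx);
	return cifs_sb->ctx->rsize;
}
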
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index b2fb7bd11936..c71c9a44bef4 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -228,7 +228,7 @@ cifs_get_next_mid(struct TCP_Server_Info *server)
spin_unlock(&GlobalMid_Lock);
if (reconnect) {
- cifs_mark_tcp_ses_conns_for_reconnect(server, false);
+ cifs_signal_cifsd_for_reconnect(server, false);
}
return mid;
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index af5d0830bc8a..891b11576e55 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -25,6 +25,7 @@
#include "smb2glob.h"
#include "cifs_ioctl.h"
#include "smbdirect.h"
+#include "fscache.h"
#include "fs_context.h"
/* Change credits for different ops and return the total number of credits */
@@ -3887,29 +3888,38 @@ static long smb3_collapse_range(struct file *file, struct cifs_tcon *tcon,
{
int rc;
unsigned int xid;
+ struct inode *inode;
struct cifsFileInfo *cfile = file->private_data;
+ struct cifsInodeInfo *cifsi;
__le64 eof;
xid = get_xid();
- if (off >= i_size_read(file->f_inode) ||
- off + len >= i_size_read(file->f_inode)) {
+ inode = d_inode(cfile->dentry);
+ cifsi = CIFS_I(inode);
+
+ if (off >= i_size_read(inode) ||
+ off + len >= i_size_read(inode)) {
rc = -EINVAL;
goto out;
}
rc = smb2_copychunk_range(xid, cfile, cfile, off + len,
- i_size_read(file->f_inode) - off - len, off);
+ i_size_read(inode) - off - len, off);
if (rc < 0)
goto out;
- eof = cpu_to_le64(i_size_read(file->f_inode) - len);
+ eof = cpu_to_le64(i_size_read(inode) - len);
rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid,
cfile->fid.volatile_fid, cfile->pid, &eof);
if (rc < 0)
goto out;
rc = 0;
+
+ cifsi->server_eof = i_size_read(inode) - len;
+ truncate_setsize(inode, cifsi->server_eof);
+ fscache_resize_cookie(cifs_inode_cookie(inode), cifsi->server_eof);
out:
free_xid(xid);
return rc;
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index a4c3e027cca2..eeb1a699bd6f 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -430,7 +430,7 @@ unmask:
* be taken as the remainder of this one. We need to kill the
* socket so the server throws away the partial SMB
*/
- cifs_mark_tcp_ses_conns_for_reconnect(server, false);
+ cifs_signal_cifsd_for_reconnect(server, false);
trace_smb3_partial_send_reconnect(server->CurrentMid,
server->conn_id, server->hostname);
}
diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c
index 95e72d271b95..8f0af4f62631 100644
--- a/fs/compat_binfmt_elf.c
+++ b/fs/compat_binfmt_elf.c
@@ -135,6 +135,8 @@
#define elf_format compat_elf_format
#define init_elf_binfmt init_compat_elf_binfmt
#define exit_elf_binfmt exit_compat_elf_binfmt
+#define binfmt_elf_test_cases compat_binfmt_elf_test_cases
+#define binfmt_elf_test_suite compat_binfmt_elf_test_suite
/*
* We share all the actual code with the native (64-bit) version.
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index d3cd2a94d1e8..d1f9d2632202 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -34,6 +34,14 @@
*/
DEFINE_SPINLOCK(configfs_dirent_lock);
+/*
+ * All of link_obj/unlink_obj/link_group/unlink_group require that
+ * subsys->su_mutex is held.
+ * But the parent configfs_subsystem is NULL when the config_item is
+ * the root, so use this mutex in that case.
+ */
+static DEFINE_MUTEX(configfs_subsystem_mutex);
+
static void configfs_d_iput(struct dentry * dentry,
struct inode * inode)
{
@@ -1859,7 +1867,9 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys)
group->cg_item.ci_name = group->cg_item.ci_namebuf;
sd = root->d_fsdata;
+ mutex_lock(&configfs_subsystem_mutex);
link_group(to_config_group(sd->s_element), group);
+ mutex_unlock(&configfs_subsystem_mutex);
inode_lock_nested(d_inode(root), I_MUTEX_PARENT);
@@ -1884,7 +1894,9 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys)
inode_unlock(d_inode(root));
if (err) {
+ mutex_lock(&configfs_subsystem_mutex);
unlink_group(group);
+ mutex_unlock(&configfs_subsystem_mutex);
configfs_release_fs();
}
put_fragment(frag);
@@ -1931,7 +1943,9 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys)
dput(dentry);
+ mutex_lock(&configfs_subsystem_mutex);
unlink_group(group);
+ mutex_unlock(&configfs_subsystem_mutex);
configfs_release_fs();
}
diff --git a/fs/coredump.c b/fs/coredump.c
index 1c060c0a2d72..7ed7d601e5e0 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -42,6 +42,7 @@
#include <linux/path.h>
#include <linux/timekeeping.h>
#include <linux/sysctl.h>
+#include <linux/elf.h>
#include <linux/uaccess.h>
#include <asm/mmu_context.h>
@@ -53,6 +54,9 @@
#include <trace/events/sched.h>
+static bool dump_vma_snapshot(struct coredump_params *cprm);
+static void free_vma_snapshot(struct coredump_params *cprm);
+
static int core_uses_pid;
static unsigned int core_pipe_limit;
static char core_pattern[CORENAME_MAX_SIZE] = "core";
@@ -531,6 +535,7 @@ void do_coredump(const kernel_siginfo_t *siginfo)
* by any locks.
*/
.mm_flags = mm->flags,
+ .vma_meta = NULL,
};
audit_core_dumps(siginfo->si_signo);
@@ -745,6 +750,9 @@ void do_coredump(const kernel_siginfo_t *siginfo)
pr_info("Core dump to |%s disabled\n", cn.corename);
goto close_fail;
}
+ if (!dump_vma_snapshot(&cprm))
+ goto close_fail;
+
file_start_write(cprm.file);
core_dumped = binfmt->core_dump(&cprm);
/*
@@ -758,6 +766,7 @@ void do_coredump(const kernel_siginfo_t *siginfo)
dump_emit(&cprm, "", 1);
}
file_end_write(cprm.file);
+ free_vma_snapshot(&cprm);
}
if (ispipe && core_pipe_limit)
wait_for_dump_helpers(cprm.file);
@@ -980,6 +989,8 @@ static bool always_dump_vma(struct vm_area_struct *vma)
return false;
}
+#define DUMP_SIZE_MAYBE_ELFHDR_PLACEHOLDER 1
+
/*
* Decide how much of @vma's contents should be included in a core dump.
*/
@@ -1039,9 +1050,20 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma,
* dump the first page to aid in determining what was mapped here.
*/
if (FILTER(ELF_HEADERS) &&
- vma->vm_pgoff == 0 && (vma->vm_flags & VM_READ) &&
- (READ_ONCE(file_inode(vma->vm_file)->i_mode) & 0111) != 0)
- return PAGE_SIZE;
+ vma->vm_pgoff == 0 && (vma->vm_flags & VM_READ)) {
+ if ((READ_ONCE(file_inode(vma->vm_file)->i_mode) & 0111) != 0)
+ return PAGE_SIZE;
+
+ /*
+ * ELF libraries aren't always executable.
+ * We'll want to check whether the mapping starts with the ELF
+ * magic, but not now - we're holding the mmap lock,
+ * so copy_from_user() doesn't work here.
+ * Use a placeholder instead, and fix it up later in
+ * dump_vma_snapshot().
+ */
+ return DUMP_SIZE_MAYBE_ELFHDR_PLACEHOLDER;
+ }
#undef FILTER
@@ -1078,18 +1100,29 @@ static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma,
return gate_vma;
}
+static void free_vma_snapshot(struct coredump_params *cprm)
+{
+ if (cprm->vma_meta) {
+ int i;
+ for (i = 0; i < cprm->vma_count; i++) {
+ struct file *file = cprm->vma_meta[i].file;
+ if (file)
+ fput(file);
+ }
+ kvfree(cprm->vma_meta);
+ cprm->vma_meta = NULL;
+ }
+}
+
/*
* Under the mmap_lock, take a snapshot of relevant information about the task's
* VMAs.
*/
-int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count,
- struct core_vma_metadata **vma_meta,
- size_t *vma_data_size_ptr)
+static bool dump_vma_snapshot(struct coredump_params *cprm)
{
struct vm_area_struct *vma, *gate_vma;
struct mm_struct *mm = current->mm;
int i;
- size_t vma_data_size = 0;
/*
* Once the stack expansion code is fixed to not change VMA bounds
@@ -1097,36 +1130,51 @@ int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count,
* mmap_lock in read mode.
*/
if (mmap_write_lock_killable(mm))
- return -EINTR;
+ return false;
+ cprm->vma_data_size = 0;
gate_vma = get_gate_vma(mm);
- *vma_count = mm->map_count + (gate_vma ? 1 : 0);
+ cprm->vma_count = mm->map_count + (gate_vma ? 1 : 0);
- *vma_meta = kvmalloc_array(*vma_count, sizeof(**vma_meta), GFP_KERNEL);
- if (!*vma_meta) {
+ cprm->vma_meta = kvmalloc_array(cprm->vma_count, sizeof(*cprm->vma_meta), GFP_KERNEL);
+ if (!cprm->vma_meta) {
mmap_write_unlock(mm);
- return -ENOMEM;
+ return false;
}
for (i = 0, vma = first_vma(current, gate_vma); vma != NULL;
vma = next_vma(vma, gate_vma), i++) {
- struct core_vma_metadata *m = (*vma_meta) + i;
+ struct core_vma_metadata *m = cprm->vma_meta + i;
m->start = vma->vm_start;
m->end = vma->vm_end;
m->flags = vma->vm_flags;
m->dump_size = vma_dump_size(vma, cprm->mm_flags);
+ m->pgoff = vma->vm_pgoff;
- vma_data_size += m->dump_size;
+ m->file = vma->vm_file;
+ if (m->file)
+ get_file(m->file);
}
mmap_write_unlock(mm);
- if (WARN_ON(i != *vma_count)) {
- kvfree(*vma_meta);
- return -EFAULT;
+ for (i = 0; i < cprm->vma_count; i++) {
+ struct core_vma_metadata *m = cprm->vma_meta + i;
+
+ if (m->dump_size == DUMP_SIZE_MAYBE_ELFHDR_PLACEHOLDER) {
+ char elfmag[SELFMAG];
+
+ if (copy_from_user(elfmag, (void __user *)m->start, SELFMAG) ||
+ memcmp(elfmag, ELFMAG, SELFMAG) != 0) {
+ m->dump_size = 0;
+ } else {
+ m->dump_size = PAGE_SIZE;
+ }
+ }
+
+ cprm->vma_data_size += m->dump_size;
}
- *vma_data_size_ptr = vma_data_size;
- return 0;
+ return true;
}
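
The placeholder is resolved with nothing more than the standard ELF magic
bytes. A standalone userspace illustration of the check performed once the
mmap lock has been dropped (ELFMAG/SELFMAG as defined in <elf.h>):

#include <elf.h>
#include <stdio.h>
#include <string.h>

static int looks_like_elf(const unsigned char *p)
{
	return memcmp(p, ELFMAG, SELFMAG) == 0; /* "\177ELF", 4 bytes */
}

int main(void)
{
	unsigned char hdr[SELFMAG] = { 0x7f, 'E', 'L', 'F' };

	printf("%s\n", looks_like_elf(hdr) ? "dump first page" : "dump nothing");
	return 0;
}
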
diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c
index bfc2a5b74ed3..2217fe5ece6f 100644
--- a/fs/crypto/bio.c
+++ b/fs/crypto/bio.c
@@ -54,7 +54,8 @@ static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode,
int num_pages = 0;
/* This always succeeds since __GFP_DIRECT_RECLAIM is set. */
- bio = bio_alloc(GFP_NOFS, BIO_MAX_VECS);
+ bio = bio_alloc(inode->i_sb->s_bdev, BIO_MAX_VECS, REQ_OP_WRITE,
+ GFP_NOFS);
while (len) {
unsigned int blocks_this_page = min(len, blocks_per_page);
@@ -62,10 +63,8 @@ static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode,
if (num_pages == 0) {
fscrypt_set_bio_crypt_ctx(bio, inode, lblk, GFP_NOFS);
- bio_set_dev(bio, inode->i_sb->s_bdev);
bio->bi_iter.bi_sector =
pblk << (blockbits - SECTOR_SHIFT);
- bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
}
ret = bio_add_page(bio, ZERO_PAGE(0), bytes_this_page, 0);
if (WARN_ON(ret != bytes_this_page)) {
@@ -81,7 +80,7 @@ static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode,
err = submit_bio_wait(bio);
if (err)
goto out;
- bio_reset(bio);
+ bio_reset(bio, inode->i_sb->s_bdev, REQ_OP_WRITE);
num_pages = 0;
}
}
@@ -150,12 +149,10 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
return -EINVAL;
/* This always succeeds since __GFP_DIRECT_RECLAIM is set. */
- bio = bio_alloc(GFP_NOFS, nr_pages);
+ bio = bio_alloc(inode->i_sb->s_bdev, nr_pages, REQ_OP_WRITE, GFP_NOFS);
do {
- bio_set_dev(bio, inode->i_sb->s_bdev);
bio->bi_iter.bi_sector = pblk << (blockbits - 9);
- bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
i = 0;
offset = 0;
@@ -182,7 +179,7 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
err = submit_bio_wait(bio);
if (err)
goto out;
- bio_reset(bio);
+ bio_reset(bio, inode->i_sb->s_bdev, REQ_OP_WRITE);
} while (len != 0);
err = 0;
out:
diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index 4ef3f714046a..4fcca79f39ae 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -69,6 +69,14 @@ void fscrypt_free_bounce_page(struct page *bounce_page)
}
EXPORT_SYMBOL(fscrypt_free_bounce_page);
+/*
+ * Generate the IV for the given logical block number within the given file.
+ * For filenames encryption, lblk_num == 0.
+ *
+ * Keep this in sync with fscrypt_limit_io_blocks(). fscrypt_limit_io_blocks()
+ * needs to know about any IV generation methods where the low bits of IV don't
+ * simply contain the lblk_num (e.g., IV_INO_LBLK_32).
+ */
void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num,
const struct fscrypt_info *ci)
{
diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c
index c57bebfa48fe..93c2ca858092 100644
--- a/fs/crypto/inline_crypt.c
+++ b/fs/crypto/inline_crypt.c
@@ -17,6 +17,7 @@
#include <linux/buffer_head.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>
+#include <linux/uio.h>
#include "fscrypt_private.h"
@@ -315,6 +316,10 @@ EXPORT_SYMBOL_GPL(fscrypt_set_bio_crypt_ctx_bh);
*
* fscrypt_set_bio_crypt_ctx() must have already been called on the bio.
*
+ * This function isn't required in cases where crypto-mergeability is ensured in
+ * another way, such as I/O targeting only a single file (and thus a single key)
+ * combined with fscrypt_limit_io_blocks() to ensure DUN contiguity.
+ *
* Return: true iff the I/O is mergeable
*/
bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode,
@@ -363,3 +368,91 @@ bool fscrypt_mergeable_bio_bh(struct bio *bio,
return fscrypt_mergeable_bio(bio, inode, next_lblk);
}
EXPORT_SYMBOL_GPL(fscrypt_mergeable_bio_bh);
+
+/**
+ * fscrypt_dio_supported() - check whether a DIO (direct I/O) request is
+ * supported as far as encryption is concerned
+ * @iocb: the file and position the I/O is targeting
+ * @iter: the I/O data segment(s)
+ *
+ * Return: %true if there are no encryption constraints that prevent DIO from
+ * being supported; %false if DIO is unsupported. (Note that in the
+ * %true case, the filesystem might have other, non-encryption-related
+ * constraints that prevent DIO from actually being supported.)
+ */
+bool fscrypt_dio_supported(struct kiocb *iocb, struct iov_iter *iter)
+{
+ const struct inode *inode = file_inode(iocb->ki_filp);
+ const unsigned int blocksize = i_blocksize(inode);
+
+ /* If the file is unencrypted, no veto from us. */
+ if (!fscrypt_needs_contents_encryption(inode))
+ return true;
+
+ /* We only support DIO with inline crypto, not fs-layer crypto. */
+ if (!fscrypt_inode_uses_inline_crypto(inode))
+ return false;
+
+ /*
+ * Since the granularity of encryption is filesystem blocks, the file
+ * position and total I/O length must be aligned to the filesystem block
+ * size -- not just to the block device's logical block size as is
+ * traditionally the case for DIO on many filesystems.
+ *
+ * We require that the user-provided memory buffers be filesystem block
+ * aligned too. It is simpler to have a single alignment value required
+ * for all properties of the I/O, as is normally the case for DIO.
+ * Also, allowing less aligned buffers would imply that data units could
+ * cross bvecs, which would greatly complicate the I/O stack, which
+ * assumes that bios can be split at any bvec boundary.
+ */
+ if (!IS_ALIGNED(iocb->ki_pos | iov_iter_alignment(iter), blocksize))
+ return false;
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(fscrypt_dio_supported);
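
A kernel-style sketch (hypothetical filesystem helper, not part of the patch)
of how a caller might consult this before attempting direct I/O; the buffered
fallback policy itself is per-filesystem:

static bool myfs_may_use_dio(struct kiocb *iocb, struct iov_iter *iter)
{
	/* encryption constraints first: inline crypto + fs-block alignment */
	if (!fscrypt_dio_supported(iocb, iter))
		return false;

	/* ... filesystem-specific DIO constraints would be checked here ... */
	return true;
}
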
+
+/**
+ * fscrypt_limit_io_blocks() - limit I/O blocks to avoid discontiguous DUNs
+ * @inode: the file on which I/O is being done
+ * @lblk: the block at which the I/O is being started from
+ * @nr_blocks: the number of blocks we want to submit starting at @lblk
+ *
+ * Determine the limit to the number of blocks that can be submitted in a bio
+ * targeting @lblk without causing a data unit number (DUN) discontiguity.
+ *
+ * This is normally just @nr_blocks, as normally the DUNs just increment along
+ * with the logical blocks. (Or the file is not encrypted.)
+ *
+ * In rare cases, fscrypt can be using an IV generation method that allows the
+ * DUN to wrap around within logically contiguous blocks, and that wraparound
+ * will occur. If this happens, a value less than @nr_blocks will be returned
+ * so that the wraparound doesn't occur in the middle of a bio, which would
+ * cause encryption/decryption to produce wrong results.
+ *
+ * Return: the actual number of blocks that can be submitted
+ */
+u64 fscrypt_limit_io_blocks(const struct inode *inode, u64 lblk, u64 nr_blocks)
+{
+ const struct fscrypt_info *ci;
+ u32 dun;
+
+ if (!fscrypt_inode_uses_inline_crypto(inode))
+ return nr_blocks;
+
+ if (nr_blocks <= 1)
+ return nr_blocks;
+
+ ci = inode->i_crypt_info;
+ if (!(fscrypt_policy_flags(&ci->ci_policy) &
+ FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32))
+ return nr_blocks;
+
+ /* With IV_INO_LBLK_32, the DUN can wrap around from U32_MAX to 0. */
+
+ dun = ci->ci_hashed_ino + lblk;
+
+ return min_t(u64, nr_blocks, (u64)U32_MAX + 1 - dun);
+}
+EXPORT_SYMBOL_GPL(fscrypt_limit_io_blocks);
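
The clamp can be checked with concrete numbers. A standalone userspace sketch
of the IV_INO_LBLK_32 wraparound arithmetic (hypothetical hashed inode and
block numbers):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t hashed_ino = 0xfffffff0u; /* assumed ci_hashed_ino      */
	uint64_t lblk = 8;                 /* first block of the bio     */
	uint64_t nr_blocks = 32;           /* blocks the caller asks for */

	uint32_t dun = hashed_ino + (uint32_t)lblk;      /* 0xfffffff8          */
	uint64_t limit = (uint64_t)UINT32_MAX + 1 - dun; /* 8 blocks until wrap */
	uint64_t allowed = nr_blocks < limit ? nr_blocks : limit;

	printf("DUN starts at %#x; clamp %llu -> %llu blocks\n", dun,
	       (unsigned long long)nr_blocks, (unsigned long long)allowed);
	return 0;
}
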
diff --git a/fs/dax.c b/fs/dax.c
index cd03485867a7..ab0978739eaa 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -11,7 +11,6 @@
#include <linux/buffer_head.h>
#include <linux/dax.h>
#include <linux/fs.h>
-#include <linux/genhd.h>
#include <linux/highmem.h>
#include <linux/memcontrol.h>
#include <linux/mm.h>
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 654443558047..38bca4980a1c 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -396,11 +396,8 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
* bio_alloc() is guaranteed to return a bio when allowed to sleep and
* we request a valid number of vectors.
*/
- bio = bio_alloc(GFP_KERNEL, nr_vecs);
-
- bio_set_dev(bio, bdev);
+ bio = bio_alloc(bdev, nr_vecs, dio->op | dio->op_flags, GFP_KERNEL);
bio->bi_iter.bi_sector = first_sector;
- bio_set_op_attrs(bio, dio->op, dio->op_flags);
if (dio->is_async)
bio->bi_end_io = dio_bio_end_aio;
else
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 226a57c57ee6..780db1e5f4b7 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -28,10 +28,10 @@ void erofs_put_metabuf(struct erofs_buf *buf)
buf->page = NULL;
}
-void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb,
- erofs_blk_t blkaddr, enum erofs_kmap_type type)
+void *erofs_bread(struct erofs_buf *buf, struct inode *inode,
+ erofs_blk_t blkaddr, enum erofs_kmap_type type)
{
- struct address_space *const mapping = sb->s_bdev->bd_inode->i_mapping;
+ struct address_space *const mapping = inode->i_mapping;
erofs_off_t offset = blknr_to_addr(blkaddr);
pgoff_t index = offset >> PAGE_SHIFT;
struct page *page = buf->page;
@@ -60,6 +60,12 @@ void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb,
return buf->base + (offset & ~PAGE_MASK);
}
+void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb,
+ erofs_blk_t blkaddr, enum erofs_kmap_type type)
+{
+ return erofs_bread(buf, sb->s_bdev->bd_inode, blkaddr, type);
+}
+
static int erofs_map_blocks_flatmode(struct inode *inode,
struct erofs_map_blocks *map,
int flags)
diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c
index eee9b0b31b63..18e59821c597 100644
--- a/fs/erofs/dir.c
+++ b/fs/erofs/dir.c
@@ -2,6 +2,7 @@
/*
* Copyright (C) 2017-2018 HUAWEI, Inc.
* https://www.huawei.com/
+ * Copyright (C) 2022, Alibaba Cloud
*/
#include "internal.h"
@@ -67,7 +68,7 @@ static int erofs_fill_dentries(struct inode *dir, struct dir_context *ctx,
static int erofs_readdir(struct file *f, struct dir_context *ctx)
{
struct inode *dir = file_inode(f);
- struct address_space *mapping = dir->i_mapping;
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
const size_t dirsize = i_size_read(dir);
unsigned int i = ctx->pos / EROFS_BLKSIZ;
unsigned int ofs = ctx->pos % EROFS_BLKSIZ;
@@ -75,26 +76,19 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx)
bool initial = true;
while (ctx->pos < dirsize) {
- struct page *dentry_page;
struct erofs_dirent *de;
unsigned int nameoff, maxsize;
- dentry_page = read_mapping_page(mapping, i, NULL);
- if (dentry_page == ERR_PTR(-ENOMEM)) {
- err = -ENOMEM;
- break;
- } else if (IS_ERR(dentry_page)) {
+ de = erofs_bread(&buf, dir, i, EROFS_KMAP);
+ if (IS_ERR(de)) {
erofs_err(dir->i_sb,
"fail to readdir of logical block %u of nid %llu",
i, EROFS_I(dir)->nid);
- err = -EFSCORRUPTED;
+ err = PTR_ERR(de);
break;
}
- de = (struct erofs_dirent *)kmap(dentry_page);
-
nameoff = le16_to_cpu(de->nameoff);
-
if (nameoff < sizeof(struct erofs_dirent) ||
nameoff >= PAGE_SIZE) {
erofs_err(dir->i_sb,
@@ -119,10 +113,6 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx)
err = erofs_fill_dentries(dir, ctx, de, &ofs,
nameoff, maxsize);
skip_this:
- kunmap(dentry_page);
-
- put_page(dentry_page);
-
ctx->pos = blknr_to_addr(i) + ofs;
if (err)
@@ -130,6 +120,7 @@ skip_this:
++i;
ofs = 0;
}
+ erofs_put_metabuf(&buf);
return err < 0 ? err : 0;
}
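
The readdir conversion follows the metabuf pattern used across these erofs
changes: one erofs_buf per caller, erofs_bread() per block (remapping the same
buffer in place), and a single erofs_put_metabuf() at the end, including on
the error path. Condensed into a hypothetical walker (names as introduced by
this diff):

static int erofs_walk_blocks_example(struct inode *inode, erofs_blk_t nblocks)
{
	struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
	erofs_blk_t blkaddr;
	int err = 0;

	for (blkaddr = 0; blkaddr < nblocks; ++blkaddr) {
		void *ptr = erofs_bread(&buf, inode, blkaddr, EROFS_KMAP);

		if (IS_ERR(ptr)) {
			err = PTR_ERR(ptr);
			break;
		}
		/* ... parse one block's worth of dirents at ptr ... */
	}
	erofs_put_metabuf(&buf); /* single release, also on the error path */
	return err;
}
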
diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h
index 3ea62c6fb00a..1238ca104f09 100644
--- a/fs/erofs/erofs_fs.h
+++ b/fs/erofs/erofs_fs.h
@@ -12,6 +12,7 @@
#define EROFS_SUPER_OFFSET 1024
#define EROFS_FEATURE_COMPAT_SB_CHKSUM 0x00000001
+#define EROFS_FEATURE_COMPAT_MTIME 0x00000002
/*
* Any bits that aren't in EROFS_ALL_FEATURE_INCOMPAT should
@@ -186,8 +187,8 @@ struct erofs_inode_extended {
__le32 i_uid;
__le32 i_gid;
- __le64 i_ctime;
- __le32 i_ctime_nsec;
+ __le64 i_mtime;
+ __le32 i_mtime_nsec;
__le32 i_nlink;
__u8 i_reserved2[16];
};
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index ff62f84f47d3..e8b37ba5e9ad 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -113,8 +113,8 @@ static void *erofs_read_inode(struct erofs_buf *buf,
set_nlink(inode, le32_to_cpu(die->i_nlink));
/* extended inode has its own timestamp */
- inode->i_ctime.tv_sec = le64_to_cpu(die->i_ctime);
- inode->i_ctime.tv_nsec = le32_to_cpu(die->i_ctime_nsec);
+ inode->i_ctime.tv_sec = le64_to_cpu(die->i_mtime);
+ inode->i_ctime.tv_nsec = le32_to_cpu(die->i_mtime_nsec);
inode->i_size = le64_to_cpu(die->i_size);
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index b8272fb95fd6..5298c4ee277d 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -325,7 +325,7 @@ struct erofs_inode {
unsigned char z_algorithmtype[2];
unsigned char z_logical_clusterbits;
unsigned long z_tailextent_headlcn;
- unsigned int z_idataoff;
+ erofs_off_t z_idataoff;
unsigned short z_idata_size;
};
#endif /* CONFIG_EROFS_FS_ZIP */
@@ -479,6 +479,8 @@ struct erofs_map_dev {
extern const struct file_operations erofs_file_fops;
void erofs_unmap_metabuf(struct erofs_buf *buf);
void erofs_put_metabuf(struct erofs_buf *buf);
+void *erofs_bread(struct erofs_buf *buf, struct inode *inode,
+ erofs_blk_t blkaddr, enum erofs_kmap_type type);
void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb,
erofs_blk_t blkaddr, enum erofs_kmap_type type);
int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *dev);
diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c
index 8629e616028c..554efa363317 100644
--- a/fs/erofs/namei.c
+++ b/fs/erofs/namei.c
@@ -2,6 +2,7 @@
/*
* Copyright (C) 2017-2018 HUAWEI, Inc.
* https://www.huawei.com/
+ * Copyright (C) 2022, Alibaba Cloud
*/
#include "xattr.h"
@@ -86,14 +87,14 @@ static struct erofs_dirent *find_target_dirent(struct erofs_qstr *name,
return ERR_PTR(-ENOENT);
}
-static struct page *find_target_block_classic(struct inode *dir,
- struct erofs_qstr *name,
- int *_ndirents)
+static void *find_target_block_classic(struct erofs_buf *target,
+ struct inode *dir,
+ struct erofs_qstr *name,
+ int *_ndirents)
{
unsigned int startprfx, endprfx;
int head, back;
- struct address_space *const mapping = dir->i_mapping;
- struct page *candidate = ERR_PTR(-ENOENT);
+ void *candidate = ERR_PTR(-ENOENT);
startprfx = endprfx = 0;
head = 0;
@@ -101,10 +102,11 @@ static struct page *find_target_block_classic(struct inode *dir,
while (head <= back) {
const int mid = head + (back - head) / 2;
- struct page *page = read_mapping_page(mapping, mid, NULL);
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
+ struct erofs_dirent *de;
- if (!IS_ERR(page)) {
- struct erofs_dirent *de = kmap_atomic(page);
+ de = erofs_bread(&buf, dir, mid, EROFS_KMAP);
+ if (!IS_ERR(de)) {
const int nameoff = nameoff_from_disk(de->nameoff,
EROFS_BLKSIZ);
const int ndirents = nameoff / sizeof(*de);
@@ -113,13 +115,12 @@ static struct page *find_target_block_classic(struct inode *dir,
struct erofs_qstr dname;
if (!ndirents) {
- kunmap_atomic(de);
- put_page(page);
+ erofs_put_metabuf(&buf);
erofs_err(dir->i_sb,
"corrupted dir block %d @ nid %llu",
mid, EROFS_I(dir)->nid);
DBG_BUGON(1);
- page = ERR_PTR(-EFSCORRUPTED);
+ de = ERR_PTR(-EFSCORRUPTED);
goto out;
}
@@ -135,7 +136,6 @@ static struct page *find_target_block_classic(struct inode *dir,
/* string comparison without already matched prefix */
diff = erofs_dirnamecmp(name, &dname, &matched);
- kunmap_atomic(de);
if (!diff) {
*_ndirents = 0;
@@ -145,11 +145,12 @@ static struct page *find_target_block_classic(struct inode *dir,
startprfx = matched;
if (!IS_ERR(candidate))
- put_page(candidate);
- candidate = page;
+ erofs_put_metabuf(target);
+ *target = buf;
+ candidate = de;
*_ndirents = ndirents;
} else {
- put_page(page);
+ erofs_put_metabuf(&buf);
back = mid - 1;
endprfx = matched;
@@ -158,8 +159,8 @@ static struct page *find_target_block_classic(struct inode *dir,
}
out: /* free if the candidate is valid */
if (!IS_ERR(candidate))
- put_page(candidate);
- return page;
+ erofs_put_metabuf(target);
+ return de;
}
return candidate;
}
@@ -169,8 +170,7 @@ int erofs_namei(struct inode *dir,
erofs_nid_t *nid, unsigned int *d_type)
{
int ndirents;
- struct page *page;
- void *data;
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
struct erofs_dirent *de;
struct erofs_qstr qn;
@@ -181,26 +181,20 @@ int erofs_namei(struct inode *dir,
qn.end = name->name + name->len;
ndirents = 0;
- page = find_target_block_classic(dir, &qn, &ndirents);
- if (IS_ERR(page))
- return PTR_ERR(page);
+ de = find_target_block_classic(&buf, dir, &qn, &ndirents);
+ if (IS_ERR(de))
+ return PTR_ERR(de);
- data = kmap_atomic(page);
/* the target page has been mapped */
if (ndirents)
- de = find_target_dirent(&qn, data, EROFS_BLKSIZ, ndirents);
- else
- de = (struct erofs_dirent *)data;
+ de = find_target_dirent(&qn, (u8 *)de, EROFS_BLKSIZ, ndirents);
if (!IS_ERR(de)) {
*nid = le64_to_cpu(de->nid);
*d_type = de->file_type;
}
-
- kunmap_atomic(data);
- put_page(page);
-
+ erofs_put_metabuf(&buf);
return PTR_ERR_OR_ZERO(de);
}
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 915eefe0d7e2..e178100c162a 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -281,21 +281,19 @@ static int erofs_init_devices(struct super_block *sb,
static int erofs_read_superblock(struct super_block *sb)
{
struct erofs_sb_info *sbi;
- struct page *page;
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
struct erofs_super_block *dsb;
unsigned int blkszbits;
void *data;
int ret;
- page = read_mapping_page(sb->s_bdev->bd_inode->i_mapping, 0, NULL);
- if (IS_ERR(page)) {
+ data = erofs_read_metabuf(&buf, sb, 0, EROFS_KMAP);
+ if (IS_ERR(data)) {
erofs_err(sb, "cannot read erofs superblock");
- return PTR_ERR(page);
+ return PTR_ERR(data);
}
sbi = EROFS_SB(sb);
-
- data = kmap(page);
dsb = (struct erofs_super_block *)(data + EROFS_SUPER_OFFSET);
ret = -EINVAL;
@@ -365,8 +363,7 @@ static int erofs_read_superblock(struct super_block *sb)
if (erofs_sb_has_ztailpacking(sbi))
erofs_info(sb, "EXPERIMENTAL compressed inline data feature in use. Use at your own risk!");
out:
- kunmap(page);
- put_page(page);
+ erofs_put_metabuf(&buf);
return ret;
}
@@ -535,6 +532,11 @@ static int erofs_managed_cache_releasepage(struct page *page, gfp_t gfp_mask)
return ret;
}
+/*
+ * It will be called only on inode eviction. If there are still some
+ * decompression requests in progress, wait and reschedule for a bit here.
+ * We could introduce extra locking instead, but it seems unnecessary.
+ */
static void erofs_managed_cache_invalidatepage(struct page *page,
unsigned int offset,
unsigned int length)
@@ -568,8 +570,7 @@ static int erofs_init_managed_cache(struct super_block *sb)
inode->i_size = OFFSET_MAX;
inode->i_mapping->a_ops = &managed_cache_aops;
- mapping_set_gfp_mask(inode->i_mapping,
- GFP_NOFS | __GFP_HIGHMEM | __GFP_MOVABLE);
+ mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
sbi->managed_cache = inode;
return 0;
}
diff --git a/fs/erofs/sysfs.c b/fs/erofs/sysfs.c
index dac252bc9228..f3babf1e6608 100644
--- a/fs/erofs/sysfs.c
+++ b/fs/erofs/sysfs.c
@@ -221,9 +221,11 @@ void erofs_unregister_sysfs(struct super_block *sb)
{
struct erofs_sb_info *sbi = EROFS_SB(sb);
- kobject_del(&sbi->s_kobj);
- kobject_put(&sbi->s_kobj);
- wait_for_completion(&sbi->s_kobj_unregister);
+ if (sbi->s_kobj.state_in_sysfs) {
+ kobject_del(&sbi->s_kobj);
+ kobject_put(&sbi->s_kobj);
+ wait_for_completion(&sbi->s_kobj_unregister);
+ }
}
int __init erofs_init_sysfs(void)
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 423bc1a61da5..0ed880f42525 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -192,7 +192,10 @@ enum z_erofs_collectmode {
COLLECT_PRIMARY_FOLLOWED,
};
-struct z_erofs_collector {
+struct z_erofs_decompress_frontend {
+ struct inode *const inode;
+ struct erofs_map_blocks map;
+
struct z_erofs_pagevec_ctor vector;
struct z_erofs_pcluster *pcl, *tailpcl;
@@ -202,13 +205,6 @@ struct z_erofs_collector {
z_erofs_next_pcluster_t owned_head;
enum z_erofs_collectmode mode;
-};
-
-struct z_erofs_decompress_frontend {
- struct inode *const inode;
-
- struct z_erofs_collector clt;
- struct erofs_map_blocks map;
bool readahead;
/* used for applying cache strategy on the fly */
@@ -216,30 +212,30 @@ struct z_erofs_decompress_frontend {
erofs_off_t headoffset;
};
-#define COLLECTOR_INIT() { \
- .owned_head = Z_EROFS_PCLUSTER_TAIL, \
- .mode = COLLECT_PRIMARY_FOLLOWED }
-
#define DECOMPRESS_FRONTEND_INIT(__i) { \
- .inode = __i, .clt = COLLECTOR_INIT(), \
- .backmost = true, }
+ .inode = __i, .owned_head = Z_EROFS_PCLUSTER_TAIL, \
+ .mode = COLLECT_PRIMARY_FOLLOWED }
static struct page *z_pagemap_global[Z_EROFS_VMAP_GLOBAL_PAGES];
static DEFINE_MUTEX(z_pagemap_global_lock);
-static void preload_compressed_pages(struct z_erofs_collector *clt,
- struct address_space *mc,
- enum z_erofs_cache_alloctype type,
- struct page **pagepool)
+static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe,
+ enum z_erofs_cache_alloctype type,
+ struct page **pagepool)
{
- struct z_erofs_pcluster *pcl = clt->pcl;
+ struct address_space *mc = MNGD_MAPPING(EROFS_I_SB(fe->inode));
+ struct z_erofs_pcluster *pcl = fe->pcl;
bool standalone = true;
+ /*
+ * Allocate optimistically, without direct reclaim: if memory is low,
+ * in-place I/O can be used as a fallback instead.
+ */
gfp_t gfp = (mapping_gfp_mask(mc) & ~__GFP_DIRECT_RECLAIM) |
__GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
struct page **pages;
pgoff_t index;
- if (clt->mode < COLLECT_PRIMARY_FOLLOWED)
+ if (fe->mode < COLLECT_PRIMARY_FOLLOWED)
return;
pages = pcl->compressed_pages;
@@ -288,7 +284,7 @@ static void preload_compressed_pages(struct z_erofs_collector *clt,
* managed cache since it can be moved to the bypass queue instead.
*/
if (standalone)
- clt->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE;
+ fe->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE;
}
/* called by erofs_shrinker to get rid of all compressed_pages */
@@ -350,47 +346,47 @@ int erofs_try_to_free_cached_page(struct page *page)
}
/* page_type must be Z_EROFS_PAGE_TYPE_EXCLUSIVE */
-static bool z_erofs_try_inplace_io(struct z_erofs_collector *clt,
+static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe,
struct page *page)
{
- struct z_erofs_pcluster *const pcl = clt->pcl;
+ struct z_erofs_pcluster *const pcl = fe->pcl;
- while (clt->icpage_ptr > pcl->compressed_pages)
- if (!cmpxchg(--clt->icpage_ptr, NULL, page))
+ while (fe->icpage_ptr > pcl->compressed_pages)
+ if (!cmpxchg(--fe->icpage_ptr, NULL, page))
return true;
return false;
}
/* callers must be with collection lock held */
-static int z_erofs_attach_page(struct z_erofs_collector *clt,
+static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe,
struct page *page, enum z_erofs_page_type type,
bool pvec_safereuse)
{
int ret;
/* give priority for inplaceio */
- if (clt->mode >= COLLECT_PRIMARY &&
+ if (fe->mode >= COLLECT_PRIMARY &&
type == Z_EROFS_PAGE_TYPE_EXCLUSIVE &&
- z_erofs_try_inplace_io(clt, page))
+ z_erofs_try_inplace_io(fe, page))
return 0;
- ret = z_erofs_pagevec_enqueue(&clt->vector, page, type,
+ ret = z_erofs_pagevec_enqueue(&fe->vector, page, type,
pvec_safereuse);
- clt->cl->vcnt += (unsigned int)ret;
+ fe->cl->vcnt += (unsigned int)ret;
return ret ? 0 : -EAGAIN;
}
-static void z_erofs_try_to_claim_pcluster(struct z_erofs_collector *clt)
+static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f)
{
- struct z_erofs_pcluster *pcl = clt->pcl;
- z_erofs_next_pcluster_t *owned_head = &clt->owned_head;
+ struct z_erofs_pcluster *pcl = f->pcl;
+ z_erofs_next_pcluster_t *owned_head = &f->owned_head;
/* type 1, nil pcluster (this pcluster doesn't belong to any chain.) */
if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_NIL,
*owned_head) == Z_EROFS_PCLUSTER_NIL) {
*owned_head = &pcl->next;
/* so we can attach this pcluster to our submission chain. */
- clt->mode = COLLECT_PRIMARY_FOLLOWED;
+ f->mode = COLLECT_PRIMARY_FOLLOWED;
return;
}
@@ -401,24 +397,24 @@ static void z_erofs_try_to_claim_pcluster(struct z_erofs_collector *clt)
if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL,
*owned_head) == Z_EROFS_PCLUSTER_TAIL) {
*owned_head = Z_EROFS_PCLUSTER_TAIL;
- clt->mode = COLLECT_PRIMARY_HOOKED;
- clt->tailpcl = NULL;
+ f->mode = COLLECT_PRIMARY_HOOKED;
+ f->tailpcl = NULL;
return;
}
/* type 3, it belongs to a chain, but it isn't the end of the chain */
- clt->mode = COLLECT_PRIMARY;
+ f->mode = COLLECT_PRIMARY;
}
-static int z_erofs_lookup_collection(struct z_erofs_collector *clt,
+static int z_erofs_lookup_collection(struct z_erofs_decompress_frontend *fe,
struct inode *inode,
struct erofs_map_blocks *map)
{
- struct z_erofs_pcluster *pcl = clt->pcl;
+ struct z_erofs_pcluster *pcl = fe->pcl;
struct z_erofs_collection *cl;
unsigned int length;
/* to avoid unexpected loop formed by corrupted images */
- if (clt->owned_head == &pcl->next || pcl == clt->tailpcl) {
+ if (fe->owned_head == &pcl->next || pcl == fe->tailpcl) {
DBG_BUGON(1);
return -EFSCORRUPTED;
}
@@ -449,15 +445,15 @@ static int z_erofs_lookup_collection(struct z_erofs_collector *clt,
}
mutex_lock(&cl->lock);
/* used to check tail merging loop due to corrupted images */
- if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL)
- clt->tailpcl = pcl;
+ if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL)
+ fe->tailpcl = pcl;
- z_erofs_try_to_claim_pcluster(clt);
- clt->cl = cl;
+ z_erofs_try_to_claim_pcluster(fe);
+ fe->cl = cl;
return 0;
}
-static int z_erofs_register_collection(struct z_erofs_collector *clt,
+static int z_erofs_register_collection(struct z_erofs_decompress_frontend *fe,
struct inode *inode,
struct erofs_map_blocks *map)
{
@@ -485,8 +481,8 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt,
Z_EROFS_PCLUSTER_FULL_LENGTH : 0);
/* new pclusters should be claimed as type 1, primary and followed */
- pcl->next = clt->owned_head;
- clt->mode = COLLECT_PRIMARY_FOLLOWED;
+ pcl->next = fe->owned_head;
+ fe->mode = COLLECT_PRIMARY_FOLLOWED;
cl = z_erofs_primarycollection(pcl);
cl->pageofs = map->m_la & ~PAGE_MASK;
@@ -512,18 +508,18 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt,
}
if (grp != &pcl->obj) {
- clt->pcl = container_of(grp,
+ fe->pcl = container_of(grp,
struct z_erofs_pcluster, obj);
err = -EEXIST;
goto err_out;
}
}
/* used to check tail merging loop due to corrupted images */
- if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL)
- clt->tailpcl = pcl;
- clt->owned_head = &pcl->next;
- clt->pcl = pcl;
- clt->cl = cl;
+ if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL)
+ fe->tailpcl = pcl;
+ fe->owned_head = &pcl->next;
+ fe->pcl = pcl;
+ fe->cl = cl;
return 0;
err_out:
@@ -532,18 +528,18 @@ err_out:
return err;
}
-static int z_erofs_collector_begin(struct z_erofs_collector *clt,
+static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe,
struct inode *inode,
struct erofs_map_blocks *map)
{
struct erofs_workgroup *grp;
int ret;
- DBG_BUGON(clt->cl);
+ DBG_BUGON(fe->cl);
/* must be Z_EROFS_PCLUSTER_TAIL or point to a previous collection */
- DBG_BUGON(clt->owned_head == Z_EROFS_PCLUSTER_NIL);
- DBG_BUGON(clt->owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
+ DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL);
+ DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
if (map->m_flags & EROFS_MAP_META) {
if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) {
@@ -555,28 +551,28 @@ static int z_erofs_collector_begin(struct z_erofs_collector *clt,
grp = erofs_find_workgroup(inode->i_sb, map->m_pa >> PAGE_SHIFT);
if (grp) {
- clt->pcl = container_of(grp, struct z_erofs_pcluster, obj);
+ fe->pcl = container_of(grp, struct z_erofs_pcluster, obj);
} else {
tailpacking:
- ret = z_erofs_register_collection(clt, inode, map);
+ ret = z_erofs_register_collection(fe, inode, map);
if (!ret)
goto out;
if (ret != -EEXIST)
return ret;
}
- ret = z_erofs_lookup_collection(clt, inode, map);
+ ret = z_erofs_lookup_collection(fe, inode, map);
if (ret) {
- erofs_workgroup_put(&clt->pcl->obj);
+ erofs_workgroup_put(&fe->pcl->obj);
return ret;
}
out:
- z_erofs_pagevec_ctor_init(&clt->vector, Z_EROFS_NR_INLINE_PAGEVECS,
- clt->cl->pagevec, clt->cl->vcnt);
+ z_erofs_pagevec_ctor_init(&fe->vector, Z_EROFS_NR_INLINE_PAGEVECS,
+ fe->cl->pagevec, fe->cl->vcnt);
/* since file-backed online pages are traversed in reverse order */
- clt->icpage_ptr = clt->pcl->compressed_pages +
- z_erofs_pclusterpages(clt->pcl);
+ fe->icpage_ptr = fe->pcl->compressed_pages +
+ z_erofs_pclusterpages(fe->pcl);
return 0;
}
@@ -610,24 +606,24 @@ static void z_erofs_collection_put(struct z_erofs_collection *cl)
erofs_workgroup_put(&pcl->obj);
}
-static bool z_erofs_collector_end(struct z_erofs_collector *clt)
+static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe)
{
- struct z_erofs_collection *cl = clt->cl;
+ struct z_erofs_collection *cl = fe->cl;
if (!cl)
return false;
- z_erofs_pagevec_ctor_exit(&clt->vector, false);
+ z_erofs_pagevec_ctor_exit(&fe->vector, false);
mutex_unlock(&cl->lock);
/*
* once all pending pages are added, don't hold the collection's
* reference any longer if the pcluster isn't hosted by ourselves.
*/
- if (clt->mode < COLLECT_PRIMARY_FOLLOWED_NOINPLACE)
+ if (fe->mode < COLLECT_PRIMARY_FOLLOWED_NOINPLACE)
z_erofs_collection_put(cl);
- clt->cl = NULL;
+ fe->cl = NULL;
return true;
}
@@ -651,7 +647,6 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
struct inode *const inode = fe->inode;
struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
struct erofs_map_blocks *const map = &fe->map;
- struct z_erofs_collector *const clt = &fe->clt;
const loff_t offset = page_offset(page);
bool tight = true;
@@ -672,7 +667,7 @@ repeat:
if (offset + cur >= map->m_la &&
offset + cur < map->m_la + map->m_llen) {
/* didn't get a valid collection previously (very rare) */
- if (!clt->cl)
+ if (!fe->cl)
goto restart_now;
goto hitted;
}
@@ -680,7 +675,7 @@ repeat:
/* go on with the next map_blocks */
erofs_dbg("%s: [out-of-range] pos %llu", __func__, offset + cur);
- if (z_erofs_collector_end(clt))
+ if (z_erofs_collector_end(fe))
fe->backmost = false;
map->m_la = offset + cur;
@@ -693,11 +688,11 @@ restart_now:
if (!(map->m_flags & EROFS_MAP_MAPPED))
goto hitted;
- err = z_erofs_collector_begin(clt, inode, map);
+ err = z_erofs_collector_begin(fe, inode, map);
if (err)
goto err_out;
- if (z_erofs_is_inline_pcluster(clt->pcl)) {
+ if (z_erofs_is_inline_pcluster(fe->pcl)) {
void *mp;
mp = erofs_read_metabuf(&fe->map.buf, inode->i_sb,
@@ -709,20 +704,18 @@ restart_now:
goto err_out;
}
get_page(fe->map.buf.page);
- WRITE_ONCE(clt->pcl->compressed_pages[0], fe->map.buf.page);
- clt->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE;
+ WRITE_ONCE(fe->pcl->compressed_pages[0], fe->map.buf.page);
+ fe->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE;
} else {
- /* preload all compressed pages (can change mode if needed) */
+ /* bind cache first when cached decompression is preferred */
if (should_alloc_managed_pages(fe, sbi->opt.cache_strategy,
map->m_la))
cache_strategy = TRYALLOC;
else
cache_strategy = DONTALLOC;
- preload_compressed_pages(clt, MNGD_MAPPING(sbi),
- cache_strategy, pagepool);
+ z_erofs_bind_cache(fe, cache_strategy, pagepool);
}
-
hitted:
/*
* Ensure the current partial page belongs to this submit chain rather
@@ -730,8 +723,8 @@ hitted:
* those chains are handled asynchronously thus the page cannot be used
* for inplace I/O or pagevec (should be processed in strict order.)
*/
- tight &= (clt->mode >= COLLECT_PRIMARY_HOOKED &&
- clt->mode != COLLECT_PRIMARY_FOLLOWED_NOINPLACE);
+ tight &= (fe->mode >= COLLECT_PRIMARY_HOOKED &&
+ fe->mode != COLLECT_PRIMARY_FOLLOWED_NOINPLACE);
cur = end - min_t(unsigned int, offset + end - map->m_la, end);
if (!(map->m_flags & EROFS_MAP_MAPPED)) {
@@ -746,18 +739,18 @@ hitted:
Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED));
if (cur)
- tight &= (clt->mode >= COLLECT_PRIMARY_FOLLOWED);
+ tight &= (fe->mode >= COLLECT_PRIMARY_FOLLOWED);
retry:
- err = z_erofs_attach_page(clt, page, page_type,
- clt->mode >= COLLECT_PRIMARY_FOLLOWED);
+ err = z_erofs_attach_page(fe, page, page_type,
+ fe->mode >= COLLECT_PRIMARY_FOLLOWED);
/* should allocate an additional short-lived page for pagevec */
if (err == -EAGAIN) {
struct page *const newpage =
alloc_page(GFP_NOFS | __GFP_NOFAIL);
set_page_private(newpage, Z_EROFS_SHORTLIVED_PAGE);
- err = z_erofs_attach_page(clt, newpage,
+ err = z_erofs_attach_page(fe, newpage,
Z_EROFS_PAGE_TYPE_EXCLUSIVE, true);
if (!err)
goto retry;
@@ -773,7 +766,7 @@ retry:
/* bump up the number of spiltted parts of a page */
++spiltted;
/* also update nr_pages */
- clt->cl->nr_pages = max_t(pgoff_t, clt->cl->nr_pages, index + 1);
+ fe->cl->nr_pages = max_t(pgoff_t, fe->cl->nr_pages, index + 1);
next_part:
/* can be used for verification */
map->m_llen = offset + cur - map->m_la;
@@ -1098,10 +1091,10 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,
static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl,
unsigned int nr,
struct page **pagepool,
- struct address_space *mc,
- gfp_t gfp)
+ struct address_space *mc)
{
const pgoff_t index = pcl->obj.index;
+ gfp_t gfp = mapping_gfp_mask(mc);
bool tocache = false;
struct address_space *mapping;
@@ -1309,7 +1302,7 @@ static void z_erofs_submit_queue(struct super_block *sb,
z_erofs_next_pcluster_t qtail[NR_JOBQUEUES];
struct z_erofs_decompressqueue *q[NR_JOBQUEUES];
void *bi_private;
- z_erofs_next_pcluster_t owned_head = f->clt.owned_head;
+ z_erofs_next_pcluster_t owned_head = f->owned_head;
/* bio is NULL initially, so no need to initialize last_{index,bdev} */
pgoff_t last_index;
struct block_device *last_bdev;
@@ -1357,8 +1350,7 @@ static void z_erofs_submit_queue(struct super_block *sb,
struct page *page;
page = pickup_page_for_submission(pcl, i++, pagepool,
- MNGD_MAPPING(sbi),
- GFP_NOFS);
+ MNGD_MAPPING(sbi));
if (!page)
continue;
@@ -1370,15 +1362,14 @@ submit_bio_retry:
}
if (!bio) {
- bio = bio_alloc(GFP_NOIO, BIO_MAX_VECS);
+ bio = bio_alloc(mdev.m_bdev, BIO_MAX_VECS,
+ REQ_OP_READ, GFP_NOIO);
bio->bi_end_io = z_erofs_decompressqueue_endio;
- bio_set_dev(bio, mdev.m_bdev);
last_bdev = mdev.m_bdev;
bio->bi_iter.bi_sector = (sector_t)cur <<
LOG_SECTORS_PER_BLOCK;
bio->bi_private = bi_private;
- bio->bi_opf = REQ_OP_READ;
if (f->readahead)
bio->bi_opf |= REQ_RAHEAD;
++nr_bios;
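
This hunk is one instance of a tree-wide conversion to the consolidated bio_alloc() signature from the v5.18 cycle: the target block device, vector count, operation flags, and gfp mask are all passed at allocation time, so the separate bio_set_dev() call and bi_opf assignment disappear. A sketch of the resulting pattern (kernel-style code, for illustration only):

    #include <linux/bio.h>

    /*
     * Illustrative only: with the new signature the bio is fully described
     * (device, size, operation) the moment it exists, instead of being
     * patched up with bio_set_dev()/bi_opf writes afterwards.
     */
    static struct bio *sketch_alloc_read_bio(struct block_device *bdev,
                                             sector_t sector, bool readahead)
    {
            struct bio *bio = bio_alloc(bdev, BIO_MAX_VECS, REQ_OP_READ,
                                        GFP_NOIO);

            bio->bi_iter.bi_sector = sector;
            if (readahead)
                    bio->bi_opf |= REQ_RAHEAD;
            return bio;
    }
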
@@ -1417,7 +1408,7 @@ static void z_erofs_runqueue(struct super_block *sb,
{
struct z_erofs_decompressqueue io[NR_JOBQUEUES];
- if (f->clt.owned_head == Z_EROFS_PCLUSTER_TAIL)
+ if (f->owned_head == Z_EROFS_PCLUSTER_TAIL)
return;
z_erofs_submit_queue(sb, f, pagepool, io, &force_fg);
@@ -1517,7 +1508,7 @@ static int z_erofs_readpage(struct file *file, struct page *page)
err = z_erofs_do_read_page(&f, page, &pagepool);
z_erofs_pcluster_readmore(&f, NULL, 0, &pagepool, false);
- (void)z_erofs_collector_end(&f.clt);
+ (void)z_erofs_collector_end(&f);
/* if some compressed clusters are ready, submit them anyway */
z_erofs_runqueue(inode->i_sb, &f, &pagepool,
@@ -1567,7 +1558,7 @@ static void z_erofs_readahead(struct readahead_control *rac)
put_page(page);
}
z_erofs_pcluster_readmore(&f, rac, 0, &pagepool, false);
- (void)z_erofs_collector_end(&f.clt);
+ (void)z_erofs_collector_end(&f);
z_erofs_runqueue(inode->i_sb, &f, &pagepool,
z_erofs_get_sync_decompress_policy(sbi, nr_pages));
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index 361b1d6e4bf9..572f0b8151ba 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -431,48 +431,47 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m,
unsigned int lookback_distance)
{
struct erofs_inode *const vi = EROFS_I(m->inode);
- struct erofs_map_blocks *const map = m->map;
const unsigned int lclusterbits = vi->z_logical_clusterbits;
- unsigned long lcn = m->lcn;
- int err;
- if (lcn < lookback_distance) {
- erofs_err(m->inode->i_sb,
- "bogus lookback distance @ nid %llu", vi->nid);
- DBG_BUGON(1);
- return -EFSCORRUPTED;
- }
+ while (m->lcn >= lookback_distance) {
+ unsigned long lcn = m->lcn - lookback_distance;
+ int err;
- /* load extent head logical cluster if needed */
- lcn -= lookback_distance;
- err = z_erofs_load_cluster_from_disk(m, lcn, false);
- if (err)
- return err;
+ /* load extent head logical cluster if needed */
+ err = z_erofs_load_cluster_from_disk(m, lcn, false);
+ if (err)
+ return err;
- switch (m->type) {
- case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD:
- if (!m->delta[0]) {
+ switch (m->type) {
+ case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD:
+ if (!m->delta[0]) {
+ erofs_err(m->inode->i_sb,
+ "invalid lookback distance 0 @ nid %llu",
+ vi->nid);
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
+ }
+ lookback_distance = m->delta[0];
+ continue;
+ case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
+ case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1:
+ case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2:
+ m->headtype = m->type;
+ m->map->m_la = (lcn << lclusterbits) | m->clusterofs;
+ return 0;
+ default:
erofs_err(m->inode->i_sb,
- "invalid lookback distance 0 @ nid %llu",
- vi->nid);
+ "unknown type %u @ lcn %lu of nid %llu",
+ m->type, lcn, vi->nid);
DBG_BUGON(1);
- return -EFSCORRUPTED;
+ return -EOPNOTSUPP;
}
- return z_erofs_extent_lookback(m, m->delta[0]);
- case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
- case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1:
- case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2:
- m->headtype = m->type;
- map->m_la = (lcn << lclusterbits) | m->clusterofs;
- break;
- default:
- erofs_err(m->inode->i_sb,
- "unknown type %u @ lcn %lu of nid %llu",
- m->type, lcn, vi->nid);
- DBG_BUGON(1);
- return -EOPNOTSUPP;
}
- return 0;
+
+ erofs_err(m->inode->i_sb, "bogus lookback distance @ nid %llu",
+ vi->nid);
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
}
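
The rewrite above unrolls the tail recursion ("return z_erofs_extent_lookback(m, m->delta[0])") into a bounded loop: every NONHEAD lcluster records in delta[0] how far back its head lies, and the walk either reaches a PLAIN/HEAD lcluster or, on a corrupted image, runs out of room and fails. A self-contained sketch of the same walk over a hypothetical in-memory lcluster array:

    #include <stdio.h>

    enum type { HEAD, NONHEAD };

    struct lcluster {
            enum type type;
            unsigned int delta0;    /* NONHEAD: distance back to its head */
    };

    /* returns the head index, or -1 for a corrupted lookback chain */
    static long lookback(const struct lcluster *lcs, unsigned long lcn,
                         unsigned int dist)
    {
            while (lcn >= dist) {
                    lcn -= dist;
                    if (lcs[lcn].type == HEAD)
                            return lcn;
                    if (!lcs[lcn].delta0)
                            return -1;      /* lookback distance 0 */
                    dist = lcs[lcn].delta0;
            }
            return -1;                      /* bogus lookback distance */
    }

    int main(void)
    {
            const struct lcluster lcs[] = {
                    { HEAD, 0 }, { NONHEAD, 1 }, { NONHEAD, 2 }, { NONHEAD, 1 },
            };

            printf("head of lcn 3: %ld\n", lookback(lcs, 3, lcs[3].delta0));
            return 0;
    }
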
static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m,
@@ -494,7 +493,7 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m,
!(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) ||
((m->headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2) &&
!(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_2))) {
- map->m_plen = 1 << lclusterbits;
+ map->m_plen = 1ULL << lclusterbits;
return 0;
}
lcn = m->lcn + 1;
@@ -540,7 +539,7 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m,
return -EFSCORRUPTED;
}
out:
- map->m_plen = m->compressedlcs << lclusterbits;
+ map->m_plen = (u64)m->compressedlcs << lclusterbits;
return 0;
err_bonus_cblkcnt:
erofs_err(m->inode->i_sb,
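
Both m_plen changes above fix the same 32-bit promotion bug: "1 << lclusterbits" and "m->compressedlcs << lclusterbits" are evaluated in 32-bit arithmetic before being widened to the 64-bit m_plen, so a large enough operand silently truncates. A short demonstration with a hypothetical operand value:

    #include <stdio.h>

    int main(void)
    {
            unsigned int compressedlcs = 0x10000;   /* hypothetical value */
            unsigned int lclusterbits = 16;
            /* evaluated as unsigned int: wraps to 0 before widening */
            unsigned long long wrong = compressedlcs << lclusterbits;
            unsigned long long right =
                    (unsigned long long)compressedlcs << lclusterbits;

            printf("wrong=%llu right=%llu\n", wrong, right);
            return 0;
    }
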
diff --git a/fs/exec.c b/fs/exec.c
index 79f2c9483302..8256e8bb9ad3 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -118,7 +118,7 @@ bool path_noexec(const struct path *path)
* Note that a shared library must be both readable and executable for
* security reasons.
*
- * Also note that we take the address to load from from the file itself.
+ * Also note that we take the address to load from the file itself.
*/
SYSCALL_DEFINE1(uselib, const char __user *, library)
{
@@ -495,8 +495,14 @@ static int bprm_stack_limits(struct linux_binprm *bprm)
* the stack. They aren't stored until much later when we can't
* signal to the parent that the child has run out of stack space.
* Instead, calculate it here so it's possible to fail gracefully.
+ *
+ * In the case of argc = 0, make sure there is space for adding an
+ * empty string (which will bump argc to 1), to ensure confused
+ * userspace programs don't start processing from argv[1], thinking
+ * argc can never be 0, to keep them from walking envp by accident.
+ * See do_execveat_common().
*/
- ptr_size = (bprm->argc + bprm->envc) * sizeof(void *);
+ ptr_size = (max(bprm->argc, 1) + bprm->envc) * sizeof(void *);
if (limit <= ptr_size)
return -E2BIG;
limit -= ptr_size;
@@ -536,7 +542,7 @@ static int copy_strings(int argc, struct user_arg_ptr argv,
if (!valid_arg_len(bprm, len))
goto out;
- /* We're going to work our way backwords. */
+ /* We're going to work our way backwards. */
pos = bprm->p;
str += len;
bprm->p -= len;
@@ -1269,7 +1275,7 @@ int begin_new_exec(struct linux_binprm * bprm)
/*
* Must be called _before_ exec_mmap() as bprm->mm is
- * not visibile until then. This also enables the update
+ * not visible until then. This also enables the update
* to be lockless.
*/
retval = set_mm_exe_file(bprm->mm, bprm->file);
@@ -1897,6 +1903,9 @@ static int do_execveat_common(int fd, struct filename *filename,
}
retval = count(argv, MAX_ARG_STRINGS);
+ if (retval == 0)
+ pr_warn_once("process '%s' launched '%s' with NULL argv: empty string added\n",
+ current->comm, bprm->filename);
if (retval < 0)
goto out_free;
bprm->argc = retval;
@@ -1923,6 +1932,19 @@ static int do_execveat_common(int fd, struct filename *filename,
if (retval < 0)
goto out_free;
+ /*
+ * When argv is empty, add an empty string ("") as argv[0] to
+ * ensure confused userspace programs that start processing
+ * from argv[1] won't end up walking envp. See also
+ * bprm_stack_limits().
+ */
+ if (bprm->argc == 0) {
+ retval = copy_string_kernel("", bprm);
+ if (retval < 0)
+ goto out_free;
+ bprm->argc = 1;
+ }
+
retval = bprm_execve(bprm, fd, filename, flags);
out_free:
free_bprm(bprm);
@@ -1951,6 +1973,8 @@ int kernel_execve(const char *kernel_filename,
}
retval = count_strings_kernel(argv);
+ if (WARN_ON_ONCE(retval == 0))
+ retval = -EINVAL;
if (retval < 0)
goto out_free;
bprm->argc = retval;
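
Taken together, the exec changes above close the argc == 0 hole from both directions: bprm_stack_limits() reserves stack room for at least one argv pointer via max(bprm->argc, 1), do_execveat_common() injects "" as argv[0] for userspace callers (with a one-time warning), and kernel_execve() simply refuses an empty argv. The userspace-visible effect, demonstrated with a small Linux program that re-executes itself with an empty argv:

    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>

    extern char **environ;

    int main(int argc, char *argv[])
    {
            if (getenv("REEXECED")) {
                    /* on kernels with this change: argc == 1, argv[0] == "" */
                    printf("argc=%d argv[0]=\"%s\"\n", argc,
                           argv[0] ? argv[0] : "(null)");
                    return 0;
            }
            setenv("REEXECED", "1", 1);
            /* re-exec ourselves with a completely empty argv */
            execve("/proc/self/exe", (char *[]){ NULL }, environ);
            perror("execve");
            return 1;
    }
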
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 8cc11715518a..8bd66cdc41be 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -36,9 +36,11 @@
#include "acl.h"
#include "truncate.h"
-static bool ext4_dio_supported(struct inode *inode)
+static bool ext4_dio_supported(struct kiocb *iocb, struct iov_iter *iter)
{
- if (IS_ENABLED(CONFIG_FS_ENCRYPTION) && IS_ENCRYPTED(inode))
+ struct inode *inode = file_inode(iocb->ki_filp);
+
+ if (!fscrypt_dio_supported(iocb, iter))
return false;
if (fsverity_active(inode))
return false;
@@ -61,7 +63,7 @@ static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
inode_lock_shared(inode);
}
- if (!ext4_dio_supported(inode)) {
+ if (!ext4_dio_supported(iocb, to)) {
inode_unlock_shared(inode);
/*
* Fallback to buffered I/O if the operation being performed on
@@ -509,7 +511,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
}
/* Fallback to buffered I/O if the inode does not support direct I/O. */
- if (!ext4_dio_supported(inode)) {
+ if (!ext4_dio_supported(iocb, from)) {
if (ilock_shared)
inode_unlock_shared(inode);
else
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 26218088f63b..3d0ca48d20c8 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3434,6 +3434,13 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
if (ret < 0)
return ret;
out:
+ /*
+ * When inline encryption is enabled, sometimes I/O to an encrypted file
+ * has to be broken up to guarantee DUN contiguity. Handle this by
+ * limiting the length of the mapping returned.
+ */
+ map.m_len = fscrypt_limit_io_blocks(inode, map.m_lblk, map.m_len);
+
ext4_set_iomap(inode, iomap, &map, offset, length, flags);
return 0;
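
The new call exists because, with inline encryption, the DUN (the per-block IV counter programmed into the crypto hardware) must advance contiguously across a single bio; under IV_INO_LBLK_32-style policies the 32-bit DUN can wrap in the middle of an otherwise contiguous extent, so the mapping has to be cut at the wrap point. A simplified model of that limit, assuming the DUN is just a 32-bit counter derived from the logical block number:

    #include <stdint.h>
    #include <stdio.h>

    /* clamp nr_blocks so that dun32 + i never wraps past 2^32 */
    static uint64_t limit_io_blocks(uint32_t dun32, uint64_t nr_blocks)
    {
            uint64_t until_wrap = 0x100000000ULL - dun32;

            return nr_blocks < until_wrap ? nr_blocks : until_wrap;
    }

    int main(void)
    {
            /* a 16-block mapping starting 4 blocks before the DUN wraps */
            printf("limited to %llu blocks\n", (unsigned long long)
                   limit_io_blocks(0xfffffffcu, 16));
            return 0;
    }
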
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 1d370364230e..17bb78ebd784 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -323,10 +323,9 @@ static void ext4_end_bio(struct bio *bio)
{
ext4_io_end_t *io_end = bio->bi_private;
sector_t bi_sector = bio->bi_iter.bi_sector;
- char b[BDEVNAME_SIZE];
- if (WARN_ONCE(!io_end, "io_end is NULL: %s: sector %Lu len %u err %d\n",
- bio_devname(bio, b),
+ if (WARN_ONCE(!io_end, "io_end is NULL: %pg: sector %Lu len %u err %d\n",
+ bio->bi_bdev,
(long long) bio->bi_iter.bi_sector,
(unsigned) bio_sectors(bio),
bio->bi_status)) {
@@ -372,10 +371,9 @@ void ext4_io_submit(struct ext4_io_submit *io)
struct bio *bio = io->io_bio;
if (bio) {
- int io_op_flags = io->io_wbc->sync_mode == WB_SYNC_ALL ?
- REQ_SYNC : 0;
+ if (io->io_wbc->sync_mode == WB_SYNC_ALL)
+ io->io_bio->bi_opf |= REQ_SYNC;
io->io_bio->bi_write_hint = io->io_end->inode->i_write_hint;
- bio_set_op_attrs(io->io_bio, REQ_OP_WRITE, io_op_flags);
submit_bio(io->io_bio);
}
io->io_bio = NULL;
@@ -398,10 +396,9 @@ static void io_submit_init_bio(struct ext4_io_submit *io,
* bio_alloc will _always_ be able to allocate a bio if
* __GFP_DIRECT_RECLAIM is set, see comments for bio_alloc_bioset().
*/
- bio = bio_alloc(GFP_NOIO, BIO_MAX_VECS);
+ bio = bio_alloc(bh->b_bdev, BIO_MAX_VECS, REQ_OP_WRITE, GFP_NOIO);
fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO);
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
- bio_set_dev(bio, bh->b_bdev);
bio->bi_end_io = ext4_end_bio;
bio->bi_private = ext4_get_io_end(io->io_end);
io->io_bio = bio;
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index 4cd62f1d848c..1aa26d6634fc 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -365,15 +365,15 @@ int ext4_mpage_readpages(struct inode *inode,
* bio_alloc will _always_ be able to allocate a bio if
* __GFP_DIRECT_RECLAIM is set, see bio_alloc_bioset().
*/
- bio = bio_alloc(GFP_KERNEL, bio_max_segs(nr_pages));
+ bio = bio_alloc(bdev, bio_max_segs(nr_pages),
+ REQ_OP_READ, GFP_KERNEL);
fscrypt_set_bio_crypt_ctx(bio, inode, next_block,
GFP_KERNEL);
ext4_set_bio_post_read_ctx(bio, inode, page->index);
- bio_set_dev(bio, bdev);
bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9);
bio->bi_end_io = mpage_end_io;
- bio_set_op_attrs(bio, REQ_OP_READ,
- rac ? REQ_RAHEAD : 0);
+ if (rac)
+ bio->bi_opf |= REQ_RAHEAD;
}
length = first_hole << blkbits;
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
index f46a7339d6cf..03ef087537c7 100644
--- a/fs/f2fs/Kconfig
+++ b/fs/f2fs/Kconfig
@@ -143,3 +143,10 @@ config F2FS_IOSTAT
Support getting IO statistics through sysfs and printing out periodic
IO statistics tracepoint events. You have to turn on "iostat_enable"
sysfs node to enable this feature.
+
+config F2FS_UNFAIR_RWSEM
+ bool "F2FS unfair rw_semaphore"
+ depends on F2FS_FS && BLK_CGROUP
+ help
+ Use an unfair rw_semaphore when the system configures I/O priority via
+ the block cgroup, so low-priority readers cannot hold up
+ higher-priority writers.
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index 16e826e01f09..eaa240b21f07 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -204,8 +204,9 @@ struct posix_acl *f2fs_get_acl(struct inode *inode, int type, bool rcu)
return __f2fs_get_acl(inode, type, NULL);
}
-static int f2fs_acl_update_mode(struct inode *inode, umode_t *mode_p,
- struct posix_acl **acl)
+static int f2fs_acl_update_mode(struct user_namespace *mnt_userns,
+ struct inode *inode, umode_t *mode_p,
+ struct posix_acl **acl)
{
umode_t mode = inode->i_mode;
int error;
@@ -218,14 +219,15 @@ static int f2fs_acl_update_mode(struct inode *inode, umode_t *mode_p,
return error;
if (error == 0)
*acl = NULL;
- if (!in_group_p(i_gid_into_mnt(&init_user_ns, inode)) &&
- !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID))
+ if (!in_group_p(i_gid_into_mnt(mnt_userns, inode)) &&
+ !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID))
mode &= ~S_ISGID;
*mode_p = mode;
return 0;
}
-static int __f2fs_set_acl(struct inode *inode, int type,
+static int __f2fs_set_acl(struct user_namespace *mnt_userns,
+ struct inode *inode, int type,
struct posix_acl *acl, struct page *ipage)
{
int name_index;
@@ -238,7 +240,8 @@ static int __f2fs_set_acl(struct inode *inode, int type,
case ACL_TYPE_ACCESS:
name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS;
if (acl && !ipage) {
- error = f2fs_acl_update_mode(inode, &mode, &acl);
+ error = f2fs_acl_update_mode(mnt_userns, inode,
+ &mode, &acl);
if (error)
return error;
set_acl_inode(inode, mode);
@@ -279,7 +282,7 @@ int f2fs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
if (unlikely(f2fs_cp_error(F2FS_I_SB(inode))))
return -EIO;
- return __f2fs_set_acl(inode, type, acl, NULL);
+ return __f2fs_set_acl(mnt_userns, inode, type, acl, NULL);
}
/*
@@ -419,7 +422,7 @@ int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage,
f2fs_mark_inode_dirty_sync(inode, true);
if (default_acl) {
- error = __f2fs_set_acl(inode, ACL_TYPE_DEFAULT, default_acl,
+ error = __f2fs_set_acl(NULL, inode, ACL_TYPE_DEFAULT, default_acl,
ipage);
posix_acl_release(default_acl);
} else {
@@ -427,7 +430,7 @@ int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage,
}
if (acl) {
if (!error)
- error = __f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl,
+ error = __f2fs_set_acl(NULL, inode, ACL_TYPE_ACCESS, acl,
ipage);
posix_acl_release(acl);
} else {
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 982f0170639f..aba1b8a1ce66 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -98,6 +98,13 @@ repeat:
}
if (unlikely(!PageUptodate(page))) {
+ if (page->index == sbi->metapage_eio_ofs &&
+ sbi->metapage_eio_cnt++ == MAX_RETRY_META_PAGE_EIO) {
+ set_ckpt_flags(sbi, CP_ERROR_FLAG);
+ } else {
+ sbi->metapage_eio_ofs = page->index;
+ sbi->metapage_eio_cnt = 0;
+ }
f2fs_put_page(page, 1);
return ERR_PTR(-EIO);
}
@@ -282,18 +289,22 @@ out:
return blkno - start;
}
-void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index)
+void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index,
+ unsigned int ra_blocks)
{
struct page *page;
bool readahead = false;
+ if (ra_blocks == RECOVERY_MIN_RA_BLOCKS)
+ return;
+
page = find_get_page(META_MAPPING(sbi), index);
if (!page || !PageUptodate(page))
readahead = true;
f2fs_put_page(page, 0);
if (readahead)
- f2fs_ra_meta_pages(sbi, index, BIO_MAX_VECS, META_POR, true);
+ f2fs_ra_meta_pages(sbi, index, ra_blocks, META_POR, true);
}
static int __f2fs_write_meta_page(struct page *page,
@@ -351,13 +362,13 @@ static int f2fs_write_meta_pages(struct address_space *mapping,
goto skip_write;
/* if the lock attempt failed, cp will flush dirty pages instead */
- if (!down_write_trylock(&sbi->cp_global_sem))
+ if (!f2fs_down_write_trylock(&sbi->cp_global_sem))
goto skip_write;
trace_f2fs_writepages(mapping->host, wbc, META);
diff = nr_pages_to_write(sbi, META, wbc);
written = f2fs_sync_meta_pages(sbi, META, wbc->nr_to_write, FS_META_IO);
- up_write(&sbi->cp_global_sem);
+ f2fs_up_write(&sbi->cp_global_sem);
wbc->nr_to_write = max((long)0, wbc->nr_to_write - written - diff);
return 0;
@@ -864,6 +875,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
struct page *cp_page_1 = NULL, *cp_page_2 = NULL;
struct f2fs_checkpoint *cp_block = NULL;
unsigned long long cur_version = 0, pre_version = 0;
+ unsigned int cp_blocks;
int err;
err = get_checkpoint_version(sbi, cp_addr, &cp_block,
@@ -871,15 +883,16 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
if (err)
return NULL;
- if (le32_to_cpu(cp_block->cp_pack_total_block_count) >
- sbi->blocks_per_seg) {
+ cp_blocks = le32_to_cpu(cp_block->cp_pack_total_block_count);
+
+ if (cp_blocks > sbi->blocks_per_seg || cp_blocks <= F2FS_CP_PACKS) {
f2fs_warn(sbi, "invalid cp_pack_total_block_count:%u",
le32_to_cpu(cp_block->cp_pack_total_block_count));
goto invalid_cp;
}
pre_version = *version;
- cp_addr += le32_to_cpu(cp_block->cp_pack_total_block_count) - 1;
+ cp_addr += cp_blocks - 1;
err = get_checkpoint_version(sbi, cp_addr, &cp_block,
&cp_page_2, version);
if (err)
@@ -1159,7 +1172,7 @@ static bool __need_flush_quota(struct f2fs_sb_info *sbi)
if (!is_journalled_quota(sbi))
return false;
- if (!down_write_trylock(&sbi->quota_sem))
+ if (!f2fs_down_write_trylock(&sbi->quota_sem))
return true;
if (is_sbi_flag_set(sbi, SBI_QUOTA_SKIP_FLUSH)) {
ret = false;
@@ -1171,7 +1184,7 @@ static bool __need_flush_quota(struct f2fs_sb_info *sbi)
} else if (get_pages(sbi, F2FS_DIRTY_QDATA)) {
ret = true;
}
- up_write(&sbi->quota_sem);
+ f2fs_up_write(&sbi->quota_sem);
return ret;
}
@@ -1228,10 +1241,10 @@ retry_flush_dents:
* POR: we should ensure that there are no dirty node pages
* until finishing nat/sit flush. inode->i_blocks can be updated.
*/
- down_write(&sbi->node_change);
+ f2fs_down_write(&sbi->node_change);
if (get_pages(sbi, F2FS_DIRTY_IMETA)) {
- up_write(&sbi->node_change);
+ f2fs_up_write(&sbi->node_change);
f2fs_unlock_all(sbi);
err = f2fs_sync_inode_meta(sbi);
if (err)
@@ -1241,15 +1254,15 @@ retry_flush_dents:
}
retry_flush_nodes:
- down_write(&sbi->node_write);
+ f2fs_down_write(&sbi->node_write);
if (get_pages(sbi, F2FS_DIRTY_NODES)) {
- up_write(&sbi->node_write);
+ f2fs_up_write(&sbi->node_write);
atomic_inc(&sbi->wb_sync_req[NODE]);
err = f2fs_sync_node_pages(sbi, &wbc, false, FS_CP_NODE_IO);
atomic_dec(&sbi->wb_sync_req[NODE]);
if (err) {
- up_write(&sbi->node_change);
+ f2fs_up_write(&sbi->node_change);
f2fs_unlock_all(sbi);
return err;
}
@@ -1262,13 +1275,13 @@ retry_flush_nodes:
* dirty node blocks and some checkpoint values by block allocation.
*/
__prepare_cp_block(sbi);
- up_write(&sbi->node_change);
+ f2fs_up_write(&sbi->node_change);
return err;
}
static void unblock_operations(struct f2fs_sb_info *sbi)
{
- up_write(&sbi->node_write);
+ f2fs_up_write(&sbi->node_write);
f2fs_unlock_all(sbi);
}
@@ -1543,6 +1556,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
/* update user_block_counts */
sbi->last_valid_block_count = sbi->total_valid_block_count;
percpu_counter_set(&sbi->alloc_valid_block_count, 0);
+ percpu_counter_set(&sbi->rf_node_block_count, 0);
/* Here, we have one bio having CP pack except cp pack 2 page */
f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO);
@@ -1612,7 +1626,7 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
f2fs_warn(sbi, "Start checkpoint disabled!");
}
if (cpc->reason != CP_RESIZE)
- down_write(&sbi->cp_global_sem);
+ f2fs_down_write(&sbi->cp_global_sem);
if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) &&
((cpc->reason & CP_FASTBOOT) || (cpc->reason & CP_SYNC) ||
@@ -1693,7 +1707,7 @@ stop:
trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint");
out:
if (cpc->reason != CP_RESIZE)
- up_write(&sbi->cp_global_sem);
+ f2fs_up_write(&sbi->cp_global_sem);
return err;
}
@@ -1741,9 +1755,9 @@ static int __write_checkpoint_sync(struct f2fs_sb_info *sbi)
struct cp_control cpc = { .reason = CP_SYNC, };
int err;
- down_write(&sbi->gc_lock);
+ f2fs_down_write(&sbi->gc_lock);
err = f2fs_write_checkpoint(sbi, &cpc);
- up_write(&sbi->gc_lock);
+ f2fs_up_write(&sbi->gc_lock);
return err;
}
@@ -1831,9 +1845,9 @@ int f2fs_issue_checkpoint(struct f2fs_sb_info *sbi)
if (!test_opt(sbi, MERGE_CHECKPOINT) || cpc.reason != CP_SYNC) {
int ret;
- down_write(&sbi->gc_lock);
+ f2fs_down_write(&sbi->gc_lock);
ret = f2fs_write_checkpoint(sbi, &cpc);
- up_write(&sbi->gc_lock);
+ f2fs_up_write(&sbi->gc_lock);
return ret;
}
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index d0c3aeba5945..11e99bf6286c 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -314,10 +314,9 @@ static int lz4_decompress_pages(struct decompress_io_ctx *dic)
}
if (ret != PAGE_SIZE << dic->log_cluster_size) {
- printk_ratelimited("%sF2FS-fs (%s): lz4 invalid rlen:%zu, "
+ printk_ratelimited("%sF2FS-fs (%s): lz4 invalid ret:%d, "
"expected:%lu\n", KERN_ERR,
- F2FS_I_SB(dic->inode)->sb->s_id,
- dic->rlen,
+ F2FS_I_SB(dic->inode)->sb->s_id, ret,
PAGE_SIZE << dic->log_cluster_size);
return -EIO;
}
@@ -1267,7 +1266,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
* checkpoint. This can only happen to quota writes which can cause
* the below discard race condition.
*/
- down_read(&sbi->node_write);
+ f2fs_down_read(&sbi->node_write);
} else if (!f2fs_trylock_op(sbi)) {
goto out_free;
}
@@ -1384,7 +1383,7 @@ unlock_continue:
f2fs_put_dnode(&dn);
if (IS_NOQUOTA(inode))
- up_read(&sbi->node_write);
+ f2fs_up_read(&sbi->node_write);
else
f2fs_unlock_op(sbi);
@@ -1410,7 +1409,7 @@ out_put_dnode:
f2fs_put_dnode(&dn);
out_unlock_op:
if (IS_NOQUOTA(inode))
- up_read(&sbi->node_write);
+ f2fs_up_read(&sbi->node_write);
else
f2fs_unlock_op(sbi);
out_free:
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 8c417864c66a..67d031589a7b 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -354,7 +354,7 @@ static void f2fs_write_end_io(struct bio *bio)
}
struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi,
- block_t blk_addr, struct bio *bio)
+ block_t blk_addr, sector_t *sector)
{
struct block_device *bdev = sbi->sb->s_bdev;
int i;
@@ -369,10 +369,9 @@ struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi,
}
}
}
- if (bio) {
- bio_set_dev(bio, bdev);
- bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr);
- }
+
+ if (sector)
+ *sector = SECTOR_FROM_BLOCK(blk_addr);
return bdev;
}
@@ -389,14 +388,40 @@ int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr)
return 0;
}
+static void __attach_io_flag(struct f2fs_io_info *fio, unsigned int io_flag)
+{
+ unsigned int temp_mask = (1 << NR_TEMP_TYPE) - 1;
+ unsigned int fua_flag = io_flag & temp_mask;
+ unsigned int meta_flag = (io_flag >> NR_TEMP_TYPE) & temp_mask;
+
+ /*
+ * data/node io flag bits per temp:
+ * REQ_META | REQ_FUA |
+ * 5 | 4 | 3 | 2 | 1 | 0 |
+ * Cold | Warm | Hot | Cold | Warm | Hot |
+ */
+ if ((1 << fio->temp) & meta_flag)
+ fio->op_flags |= REQ_META;
+ if ((1 << fio->temp) & fua_flag)
+ fio->op_flags |= REQ_FUA;
+}
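
The packed layout in the comment above squeezes two per-temperature bitmaps into one sysfs-tunable word: the low NR_TEMP_TYPE bits say which temperatures get REQ_FUA, the next NR_TEMP_TYPE bits which get REQ_META. A standalone decoder for the same word, assuming NR_TEMP_TYPE == 3 (Hot, Warm, Cold):

    #include <stdio.h>

    #define NR_TEMP_TYPE 3

    enum temp { HOT, WARM, COLD };

    static void decode(unsigned int io_flag, enum temp temp)
    {
            unsigned int temp_mask = (1u << NR_TEMP_TYPE) - 1;
            unsigned int fua_flag = io_flag & temp_mask;
            unsigned int meta_flag = (io_flag >> NR_TEMP_TYPE) & temp_mask;

            printf("temp=%d REQ_META=%d REQ_FUA=%d\n", temp,
                   !!((1u << temp) & meta_flag), !!((1u << temp) & fua_flag));
    }

    int main(void)
    {
            /* 0b101100: REQ_META for Cold and Hot, REQ_FUA for Cold only */
            unsigned int io_flag = (0x5 << NR_TEMP_TYPE) | 0x4;

            decode(io_flag, HOT);   /* META=1 FUA=0 */
            decode(io_flag, WARM);  /* META=0 FUA=0 */
            decode(io_flag, COLD);  /* META=1 FUA=1 */
            return 0;
    }
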
+
static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages)
{
struct f2fs_sb_info *sbi = fio->sbi;
+ struct block_device *bdev;
+ sector_t sector;
struct bio *bio;
- bio = bio_alloc_bioset(GFP_NOIO, npages, &f2fs_bioset);
+ if (fio->type == DATA)
+ __attach_io_flag(fio, sbi->data_io_flag);
+ else if (fio->type == NODE)
+ __attach_io_flag(fio, sbi->node_io_flag);
- f2fs_target_device(sbi, fio->new_blkaddr, bio);
+ bdev = f2fs_target_device(sbi, fio->new_blkaddr, &sector);
+ bio = bio_alloc_bioset(bdev, npages, fio->op | fio->op_flags, GFP_NOIO,
+ &f2fs_bioset);
+ bio->bi_iter.bi_sector = sector;
if (is_read_io(fio->op)) {
bio->bi_end_io = f2fs_read_end_io;
bio->bi_private = NULL;
@@ -500,34 +525,6 @@ void f2fs_submit_bio(struct f2fs_sb_info *sbi,
__submit_bio(sbi, bio, type);
}
-static void __attach_io_flag(struct f2fs_io_info *fio)
-{
- struct f2fs_sb_info *sbi = fio->sbi;
- unsigned int temp_mask = (1 << NR_TEMP_TYPE) - 1;
- unsigned int io_flag, fua_flag, meta_flag;
-
- if (fio->type == DATA)
- io_flag = sbi->data_io_flag;
- else if (fio->type == NODE)
- io_flag = sbi->node_io_flag;
- else
- return;
-
- fua_flag = io_flag & temp_mask;
- meta_flag = (io_flag >> NR_TEMP_TYPE) & temp_mask;
-
- /*
- * data/node io flag bits per temp:
- * REQ_META | REQ_FUA |
- * 5 | 4 | 3 | 2 | 1 | 0 |
- * Cold | Warm | Hot | Cold | Warm | Hot |
- */
- if ((1 << fio->temp) & meta_flag)
- fio->op_flags |= REQ_META;
- if ((1 << fio->temp) & fua_flag)
- fio->op_flags |= REQ_FUA;
-}
-
static void __submit_merged_bio(struct f2fs_bio_info *io)
{
struct f2fs_io_info *fio = &io->fio;
@@ -535,9 +532,6 @@ static void __submit_merged_bio(struct f2fs_bio_info *io)
if (!io->bio)
return;
- __attach_io_flag(fio);
- bio_set_op_attrs(io->bio, fio->op, fio->op_flags);
-
if (is_read_io(fio->op))
trace_f2fs_prepare_read_bio(io->sbi->sb, fio->type, io->bio);
else
@@ -590,18 +584,17 @@ static void __f2fs_submit_merged_write(struct f2fs_sb_info *sbi,
enum page_type btype = PAGE_TYPE_OF_BIO(type);
struct f2fs_bio_info *io = sbi->write_io[btype] + temp;
- down_write(&io->io_rwsem);
+ f2fs_down_write(&io->io_rwsem);
/* change META to META_FLUSH in the checkpoint procedure */
if (type >= META_FLUSH) {
io->fio.type = META_FLUSH;
- io->fio.op = REQ_OP_WRITE;
- io->fio.op_flags = REQ_META | REQ_PRIO | REQ_SYNC;
+ io->bio->bi_opf |= REQ_META | REQ_PRIO | REQ_SYNC;
if (!test_opt(sbi, NOBARRIER))
- io->fio.op_flags |= REQ_PREFLUSH | REQ_FUA;
+ io->bio->bi_opf |= REQ_PREFLUSH | REQ_FUA;
}
__submit_merged_bio(io);
- up_write(&io->io_rwsem);
+ f2fs_up_write(&io->io_rwsem);
}
static void __submit_merged_write_cond(struct f2fs_sb_info *sbi,
@@ -616,9 +609,9 @@ static void __submit_merged_write_cond(struct f2fs_sb_info *sbi,
enum page_type btype = PAGE_TYPE_OF_BIO(type);
struct f2fs_bio_info *io = sbi->write_io[btype] + temp;
- down_read(&io->io_rwsem);
+ f2fs_down_read(&io->io_rwsem);
ret = __has_merged_page(io->bio, inode, page, ino);
- up_read(&io->io_rwsem);
+ f2fs_up_read(&io->io_rwsem);
}
if (ret)
__f2fs_submit_merged_write(sbi, type, temp);
@@ -679,9 +672,6 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio)
if (fio->io_wbc && !is_read_io(fio->op))
wbc_account_cgroup_owner(fio->io_wbc, page, PAGE_SIZE);
- __attach_io_flag(fio);
- bio_set_op_attrs(bio, fio->op, fio->op_flags);
-
inc_page_count(fio->sbi, is_read_io(fio->op) ?
__read_io_type(page): WB_DATA_TYPE(fio->page));
@@ -742,9 +732,9 @@ static void add_bio_entry(struct f2fs_sb_info *sbi, struct bio *bio,
if (bio_add_page(bio, page, PAGE_SIZE, 0) != PAGE_SIZE)
f2fs_bug_on(sbi, 1);
- down_write(&io->bio_list_lock);
+ f2fs_down_write(&io->bio_list_lock);
list_add_tail(&be->list, &io->bio_list);
- up_write(&io->bio_list_lock);
+ f2fs_up_write(&io->bio_list_lock);
}
static void del_bio_entry(struct bio_entry *be)
@@ -766,7 +756,7 @@ static int add_ipu_page(struct f2fs_io_info *fio, struct bio **bio,
struct list_head *head = &io->bio_list;
struct bio_entry *be;
- down_write(&io->bio_list_lock);
+ f2fs_down_write(&io->bio_list_lock);
list_for_each_entry(be, head, list) {
if (be->bio != *bio)
continue;
@@ -790,7 +780,7 @@ static int add_ipu_page(struct f2fs_io_info *fio, struct bio **bio,
__submit_bio(sbi, *bio, DATA);
break;
}
- up_write(&io->bio_list_lock);
+ f2fs_up_write(&io->bio_list_lock);
}
if (ret) {
@@ -816,7 +806,7 @@ void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi,
if (list_empty(head))
continue;
- down_read(&io->bio_list_lock);
+ f2fs_down_read(&io->bio_list_lock);
list_for_each_entry(be, head, list) {
if (target)
found = (target == be->bio);
@@ -826,14 +816,14 @@ void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi,
if (found)
break;
}
- up_read(&io->bio_list_lock);
+ f2fs_up_read(&io->bio_list_lock);
if (!found)
continue;
found = false;
- down_write(&io->bio_list_lock);
+ f2fs_down_write(&io->bio_list_lock);
list_for_each_entry(be, head, list) {
if (target)
found = (target == be->bio);
@@ -846,7 +836,7 @@ void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi,
break;
}
}
- up_write(&io->bio_list_lock);
+ f2fs_up_write(&io->bio_list_lock);
}
if (found)
@@ -875,10 +865,8 @@ int f2fs_merge_page_bio(struct f2fs_io_info *fio)
alloc_new:
if (!bio) {
bio = __bio_alloc(fio, BIO_MAX_VECS);
- __attach_io_flag(fio);
f2fs_set_bio_crypt_ctx(bio, fio->page->mapping->host,
fio->page->index, fio, GFP_NOIO);
- bio_set_op_attrs(bio, fio->op, fio->op_flags);
add_bio_entry(fio->sbi, bio, page, fio->temp);
} else {
@@ -906,7 +894,7 @@ void f2fs_submit_page_write(struct f2fs_io_info *fio)
f2fs_bug_on(sbi, is_read_io(fio->op));
- down_write(&io->io_rwsem);
+ f2fs_down_write(&io->io_rwsem);
next:
if (fio->in_list) {
spin_lock(&io->io_lock);
@@ -973,7 +961,7 @@ out:
if (is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN) ||
!f2fs_is_checkpoint_ready(sbi))
__submit_merged_bio(io);
- up_write(&io->io_rwsem);
+ f2fs_up_write(&io->io_rwsem);
}
static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
@@ -984,17 +972,17 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
struct bio *bio;
struct bio_post_read_ctx *ctx = NULL;
unsigned int post_read_steps = 0;
+ sector_t sector;
+ struct block_device *bdev = f2fs_target_device(sbi, blkaddr, &sector);
- bio = bio_alloc_bioset(for_write ? GFP_NOIO : GFP_KERNEL,
- bio_max_segs(nr_pages), &f2fs_bioset);
+ bio = bio_alloc_bioset(bdev, bio_max_segs(nr_pages),
+ REQ_OP_READ | op_flag,
+ for_write ? GFP_NOIO : GFP_KERNEL, &f2fs_bioset);
if (!bio)
return ERR_PTR(-ENOMEM);
-
+ bio->bi_iter.bi_sector = sector;
f2fs_set_bio_crypt_ctx(bio, inode, first_idx, NULL, GFP_NOFS);
-
- f2fs_target_device(sbi, blkaddr, bio);
bio->bi_end_io = f2fs_read_end_io;
- bio_set_op_attrs(bio, REQ_OP_READ, op_flag);
if (fscrypt_inode_uses_fs_layer_crypto(inode))
post_read_steps |= STEP_DECRYPT;
@@ -1383,9 +1371,9 @@ void f2fs_do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock)
{
if (flag == F2FS_GET_BLOCK_PRE_AIO) {
if (lock)
- down_read(&sbi->node_change);
+ f2fs_down_read(&sbi->node_change);
else
- up_read(&sbi->node_change);
+ f2fs_up_read(&sbi->node_change);
} else {
if (lock)
f2fs_lock_op(sbi);
@@ -2460,6 +2448,9 @@ static inline bool check_inplace_update_policy(struct inode *inode,
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
unsigned int policy = SM_I(sbi)->ipu_policy;
+ if (policy & (0x1 << F2FS_IPU_HONOR_OPU_WRITE) &&
+ is_inode_flag_set(inode, FI_OPU_WRITE))
+ return false;
if (policy & (0x1 << F2FS_IPU_FORCE))
return true;
if (policy & (0x1 << F2FS_IPU_SSR) && f2fs_need_SSR(sbi))
@@ -2530,6 +2521,9 @@ bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio)
if (is_inode_flag_set(inode, FI_ALIGNED_WRITE))
return true;
+ if (is_inode_flag_set(inode, FI_OPU_WRITE))
+ return true;
+
if (fio) {
if (page_private_gcing(fio->page))
return true;
@@ -2749,13 +2743,13 @@ write:
* the below discard race condition.
*/
if (IS_NOQUOTA(inode))
- down_read(&sbi->node_write);
+ f2fs_down_read(&sbi->node_write);
fio.need_lock = LOCK_DONE;
err = f2fs_do_write_data_page(&fio);
if (IS_NOQUOTA(inode))
- up_read(&sbi->node_write);
+ f2fs_up_read(&sbi->node_write);
goto done;
}
@@ -3154,8 +3148,8 @@ static int __f2fs_write_data_pages(struct address_space *mapping,
f2fs_available_free_memory(sbi, DIRTY_DENTS))
goto skip_write;
- /* skip writing during file defragment */
- if (is_inode_flag_set(inode, FI_DO_DEFRAG))
+ /* skip writing in file defragment preparing stage */
+ if (is_inode_flag_set(inode, FI_SKIP_WRITES))
goto skip_write;
trace_f2fs_writepages(mapping->host, wbc, DATA);
@@ -3163,8 +3157,12 @@ static int __f2fs_write_data_pages(struct address_space *mapping,
/* to avoid splitting IOs due to mixed WB_SYNC_ALL and WB_SYNC_NONE */
if (wbc->sync_mode == WB_SYNC_ALL)
atomic_inc(&sbi->wb_sync_req[DATA]);
- else if (atomic_read(&sbi->wb_sync_req[DATA]))
+ else if (atomic_read(&sbi->wb_sync_req[DATA])) {
+ /* to avoid potential deadlock */
+ if (current->plug)
+ blk_finish_plug(current->plug);
goto skip_write;
+ }
if (__should_serialize_io(inode, wbc)) {
mutex_lock(&sbi->writepages);
@@ -3213,14 +3211,14 @@ void f2fs_write_failed(struct inode *inode, loff_t to)
/* In the fs-verity case, f2fs_end_enable_verity() does the truncate */
if (to > i_size && !f2fs_verity_in_progress(inode)) {
- down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
filemap_invalidate_lock(inode->i_mapping);
truncate_pagecache(inode, i_size);
f2fs_truncate_blocks(inode, i_size, true);
filemap_invalidate_unlock(inode->i_mapping);
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
}
}
@@ -3353,7 +3351,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
*fsdata = NULL;
- if (len == PAGE_SIZE)
+ if (len == PAGE_SIZE && !(f2fs_is_atomic_file(inode)))
goto repeat;
ret = f2fs_prepare_compress_overwrite(inode, pagep,
@@ -3721,19 +3719,20 @@ static int f2fs_migrate_blocks(struct inode *inode, block_t start_blk,
unsigned int end_sec = secidx + blkcnt / blk_per_sec;
int ret = 0;
- down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
filemap_invalidate_lock(inode->i_mapping);
set_inode_flag(inode, FI_ALIGNED_WRITE);
+ set_inode_flag(inode, FI_OPU_WRITE);
for (; secidx < end_sec; secidx++) {
- down_write(&sbi->pin_sem);
+ f2fs_down_write(&sbi->pin_sem);
f2fs_lock_op(sbi);
f2fs_allocate_new_section(sbi, CURSEG_COLD_DATA_PINNED, false);
f2fs_unlock_op(sbi);
- set_inode_flag(inode, FI_DO_DEFRAG);
+ set_inode_flag(inode, FI_SKIP_WRITES);
for (blkofs = 0; blkofs < blk_per_sec; blkofs++) {
struct page *page;
@@ -3741,7 +3740,7 @@ static int f2fs_migrate_blocks(struct inode *inode, block_t start_blk,
page = f2fs_get_lock_data_page(inode, blkidx, true);
if (IS_ERR(page)) {
- up_write(&sbi->pin_sem);
+ f2fs_up_write(&sbi->pin_sem);
ret = PTR_ERR(page);
goto done;
}
@@ -3750,22 +3749,23 @@ static int f2fs_migrate_blocks(struct inode *inode, block_t start_blk,
f2fs_put_page(page, 1);
}
- clear_inode_flag(inode, FI_DO_DEFRAG);
+ clear_inode_flag(inode, FI_SKIP_WRITES);
ret = filemap_fdatawrite(inode->i_mapping);
- up_write(&sbi->pin_sem);
+ f2fs_up_write(&sbi->pin_sem);
if (ret)
break;
}
done:
- clear_inode_flag(inode, FI_DO_DEFRAG);
+ clear_inode_flag(inode, FI_SKIP_WRITES);
+ clear_inode_flag(inode, FI_OPU_WRITE);
clear_inode_flag(inode, FI_ALIGNED_WRITE);
filemap_invalidate_unlock(inode->i_mapping);
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
return ret;
}
@@ -4044,6 +4044,13 @@ static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
iomap->offset = blks_to_bytes(inode, map.m_lblk);
+ /*
+ * When inline encryption is enabled, sometimes I/O to an encrypted file
+ * has to be broken up to guarantee DUN contiguity. Handle this by
+ * limiting the length of the mapping returned.
+ */
+ map.m_len = fscrypt_limit_io_blocks(inode, map.m_lblk, map.m_len);
+
if (map.m_flags & (F2FS_MAP_MAPPED | F2FS_MAP_UNWRITTEN)) {
iomap->length = blks_to_bytes(inode, map.m_len);
if (map.m_flags & F2FS_MAP_MAPPED) {
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index 8c50518475a9..fcdf253cd211 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -21,7 +21,7 @@
#include "gc.h"
static LIST_HEAD(f2fs_stat_list);
-static DEFINE_MUTEX(f2fs_stat_mutex);
+static DEFINE_RAW_SPINLOCK(f2fs_stat_lock);
#ifdef CONFIG_DEBUG_FS
static struct dentry *f2fs_debugfs_root;
#endif
@@ -338,14 +338,16 @@ static char *s_flag[] = {
[SBI_QUOTA_SKIP_FLUSH] = " quota_skip_flush",
[SBI_QUOTA_NEED_REPAIR] = " quota_need_repair",
[SBI_IS_RESIZEFS] = " resizefs",
+ [SBI_IS_FREEZING] = " freezefs",
};
static int stat_show(struct seq_file *s, void *v)
{
struct f2fs_stat_info *si;
int i = 0, j = 0;
+ unsigned long flags;
- mutex_lock(&f2fs_stat_mutex);
+ raw_spin_lock_irqsave(&f2fs_stat_lock, flags);
list_for_each_entry(si, &f2fs_stat_list, stat_list) {
update_general_status(si->sbi);
@@ -474,12 +476,14 @@ static int stat_show(struct seq_file *s, void *v)
si->node_segs, si->bg_node_segs);
seq_printf(s, " - Reclaimed segs : Normal (%d), Idle CB (%d), "
"Idle Greedy (%d), Idle AT (%d), "
- "Urgent High (%d), Urgent Low (%d)\n",
+ "Urgent High (%d), Urgent Mid (%d), "
+ "Urgent Low (%d)\n",
si->sbi->gc_reclaimed_segs[GC_NORMAL],
si->sbi->gc_reclaimed_segs[GC_IDLE_CB],
si->sbi->gc_reclaimed_segs[GC_IDLE_GREEDY],
si->sbi->gc_reclaimed_segs[GC_IDLE_AT],
si->sbi->gc_reclaimed_segs[GC_URGENT_HIGH],
+ si->sbi->gc_reclaimed_segs[GC_URGENT_MID],
si->sbi->gc_reclaimed_segs[GC_URGENT_LOW]);
seq_printf(s, "Try to move %d blocks (BG: %d)\n", si->tot_blks,
si->bg_data_blks + si->bg_node_blks);
@@ -532,6 +536,9 @@ static int stat_show(struct seq_file *s, void *v)
si->ndirty_meta, si->meta_pages);
seq_printf(s, " - imeta: %4d\n",
si->ndirty_imeta);
+ seq_printf(s, " - fsync mark: %4lld\n",
+ percpu_counter_sum_positive(
+ &si->sbi->rf_node_block_count));
seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n",
si->dirty_nats, si->nats, si->dirty_sits, si->sits);
seq_printf(s, " - free_nids: %9d/%9d\n - alloc_nids: %9d\n",
@@ -573,7 +580,7 @@ static int stat_show(struct seq_file *s, void *v)
seq_printf(s, " - paged : %llu KB\n",
si->page_mem >> 10);
}
- mutex_unlock(&f2fs_stat_mutex);
+ raw_spin_unlock_irqrestore(&f2fs_stat_lock, flags);
return 0;
}
@@ -584,6 +591,7 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
{
struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
struct f2fs_stat_info *si;
+ unsigned long flags;
int i;
si = f2fs_kzalloc(sbi, sizeof(struct f2fs_stat_info), GFP_KERNEL);
@@ -619,9 +627,9 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
atomic_set(&sbi->max_aw_cnt, 0);
atomic_set(&sbi->max_vw_cnt, 0);
- mutex_lock(&f2fs_stat_mutex);
+ raw_spin_lock_irqsave(&f2fs_stat_lock, flags);
list_add_tail(&si->stat_list, &f2fs_stat_list);
- mutex_unlock(&f2fs_stat_mutex);
+ raw_spin_unlock_irqrestore(&f2fs_stat_lock, flags);
return 0;
}
@@ -629,10 +637,11 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
void f2fs_destroy_stats(struct f2fs_sb_info *sbi)
{
struct f2fs_stat_info *si = F2FS_STAT(sbi);
+ unsigned long flags;
- mutex_lock(&f2fs_stat_mutex);
+ raw_spin_lock_irqsave(&f2fs_stat_lock, flags);
list_del(&si->stat_list);
- mutex_unlock(&f2fs_stat_mutex);
+ raw_spin_unlock_irqrestore(&f2fs_stat_lock, flags);
kfree(si);
}
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 166f08623362..a0e51937d92e 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -766,7 +766,7 @@ add_dentry:
f2fs_wait_on_page_writeback(dentry_page, DATA, true, true);
if (inode) {
- down_write(&F2FS_I(inode)->i_sem);
+ f2fs_down_write(&F2FS_I(inode)->i_sem);
page = f2fs_init_inode_metadata(inode, dir, fname, NULL);
if (IS_ERR(page)) {
err = PTR_ERR(page);
@@ -793,7 +793,7 @@ add_dentry:
f2fs_update_parent_metadata(dir, inode, current_depth);
fail:
if (inode)
- up_write(&F2FS_I(inode)->i_sem);
+ f2fs_up_write(&F2FS_I(inode)->i_sem);
f2fs_put_page(dentry_page, 1);
@@ -858,7 +858,7 @@ int f2fs_do_tmpfile(struct inode *inode, struct inode *dir)
struct page *page;
int err = 0;
- down_write(&F2FS_I(inode)->i_sem);
+ f2fs_down_write(&F2FS_I(inode)->i_sem);
page = f2fs_init_inode_metadata(inode, dir, NULL, NULL);
if (IS_ERR(page)) {
err = PTR_ERR(page);
@@ -869,7 +869,7 @@ int f2fs_do_tmpfile(struct inode *inode, struct inode *dir)
clear_inode_flag(inode, FI_NEW_INODE);
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
fail:
- up_write(&F2FS_I(inode)->i_sem);
+ f2fs_up_write(&F2FS_I(inode)->i_sem);
return err;
}
@@ -877,7 +877,7 @@ void f2fs_drop_nlink(struct inode *dir, struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
- down_write(&F2FS_I(inode)->i_sem);
+ f2fs_down_write(&F2FS_I(inode)->i_sem);
if (S_ISDIR(inode->i_mode))
f2fs_i_links_write(dir, false);
@@ -888,7 +888,7 @@ void f2fs_drop_nlink(struct inode *dir, struct inode *inode)
f2fs_i_links_write(inode, false);
f2fs_i_size_write(inode, 0);
}
- up_write(&F2FS_I(inode)->i_sem);
+ f2fs_up_write(&F2FS_I(inode)->i_sem);
if (inode->i_nlink == 0)
f2fs_add_orphan_inode(inode);
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 68b44015514f..9c72a91912ae 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -123,6 +123,20 @@ typedef u32 nid_t;
#define COMPRESS_EXT_NUM 16
+/*
+ * An implementation of an rwsem that is explicitly unfair to readers. This
+ * prevents priority inversion when a low-priority reader acquires the read lock
+ * while sleeping on the write lock but the write lock is needed by
+ * higher-priority clients.
+ */
+
+struct f2fs_rwsem {
+ struct rw_semaphore internal_rwsem;
+#ifdef CONFIG_F2FS_UNFAIR_RWSEM
+ wait_queue_head_t read_waiters;
+#endif
+};
+
struct f2fs_mount_info {
unsigned int opt;
int write_io_size_bits; /* Write IO size bits */
@@ -386,6 +400,10 @@ struct discard_cmd_control {
struct mutex cmd_lock;
unsigned int nr_discards; /* # of discards in the list */
unsigned int max_discards; /* max. discards to be issued */
+ unsigned int max_discard_request; /* max. discard request per round */
+ unsigned int min_discard_issue_time; /* min. interval between discard issue */
+ unsigned int mid_discard_issue_time; /* mid. interval between discard issue */
+ unsigned int max_discard_issue_time; /* max. interval between discard issue */
unsigned int discard_granularity; /* discard granularity */
unsigned int undiscard_blks; /* # of undiscard blocks */
unsigned int next_pos; /* next discard position */
@@ -561,6 +579,9 @@ enum {
/* maximum retry quota flush count */
#define DEFAULT_RETRY_QUOTA_FLUSH_COUNT 8
+/* maximum retry of EIO'ed meta page */
+#define MAX_RETRY_META_PAGE_EIO 100
+
#define F2FS_LINK_MAX 0xffffffff /* maximum link count per file */
#define MAX_DIR_RA_PAGES 4 /* maximum ra pages of dir */
@@ -574,6 +595,9 @@ enum {
/* number of extent info in extent cache we try to shrink */
#define EXTENT_CACHE_SHRINK_NUMBER 128
+#define RECOVERY_MAX_RA_BLOCKS BIO_MAX_VECS
+#define RECOVERY_MIN_RA_BLOCKS 1
+
struct rb_entry {
struct rb_node rb_node; /* rb node located in rb-tree */
union {
@@ -721,7 +745,8 @@ enum {
FI_DROP_CACHE, /* drop dirty page cache */
FI_DATA_EXIST, /* indicate data exists */
FI_INLINE_DOTS, /* indicate inline dot dentries */
- FI_DO_DEFRAG, /* indicate defragment is running */
+ FI_SKIP_WRITES, /* should skip data page writeback */
+ FI_OPU_WRITE, /* used for opu per file */
FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */
FI_PREALLOCATED_ALL, /* all blocks for write were preallocated */
FI_HOT_DATA, /* indicate file is hot */
@@ -752,7 +777,7 @@ struct f2fs_inode_info {
/* Use below internally in f2fs*/
unsigned long flags[BITS_TO_LONGS(FI_MAX)]; /* use to pass per-file flags */
- struct rw_semaphore i_sem; /* protect fi info */
+ struct f2fs_rwsem i_sem; /* protect fi info */
atomic_t dirty_pages; /* # of dirty pages */
f2fs_hash_t chash; /* hash value of given file name */
unsigned int clevel; /* maximum level of given file name */
@@ -777,8 +802,8 @@ struct f2fs_inode_info {
struct extent_tree *extent_tree; /* cached extent_tree entry */
/* avoid racing between foreground op and gc */
- struct rw_semaphore i_gc_rwsem[2];
- struct rw_semaphore i_xattr_sem; /* avoid racing between reading and changing EAs */
+ struct f2fs_rwsem i_gc_rwsem[2];
+ struct f2fs_rwsem i_xattr_sem; /* avoid racing between reading and changing EAs */
int i_extra_isize; /* size of extra space located in i_addr */
kprojid_t i_projid; /* id for project quota */
@@ -897,6 +922,7 @@ struct f2fs_nm_info {
nid_t max_nid; /* maximum possible node ids */
nid_t available_nids; /* # of available node ids */
nid_t next_scan_nid; /* the next nid to be scanned */
+ nid_t max_rf_node_blocks; /* max # of nodes for recovery */
unsigned int ram_thresh; /* control the memory footprint */
unsigned int ra_nid_pages; /* # of nid pages to be readaheaded */
unsigned int dirty_nats_ratio; /* control dirty nats ratio threshold */
@@ -904,7 +930,7 @@ struct f2fs_nm_info {
/* NAT cache management */
struct radix_tree_root nat_root;/* root of the nat entry cache */
struct radix_tree_root nat_set_root;/* root of the nat set cache */
- struct rw_semaphore nat_tree_lock; /* protect nat entry tree */
+ struct f2fs_rwsem nat_tree_lock; /* protect nat entry tree */
struct list_head nat_entries; /* cached nat entry list (clean) */
spinlock_t nat_list_lock; /* protect clean nat entry list */
unsigned int nat_cnt[MAX_NAT_STATE]; /* the # of cached nat entries */
@@ -1017,7 +1043,7 @@ struct f2fs_sm_info {
struct dirty_seglist_info *dirty_info; /* dirty segment information */
struct curseg_info *curseg_array; /* active segment information */
- struct rw_semaphore curseg_lock; /* for preventing curseg change */
+ struct f2fs_rwsem curseg_lock; /* for preventing curseg change */
block_t seg0_blkaddr; /* block address of 0'th segment */
block_t main_blkaddr; /* start block address of main area */
@@ -1201,11 +1227,11 @@ struct f2fs_bio_info {
struct bio *bio; /* bios to merge */
sector_t last_block_in_bio; /* last block number */
struct f2fs_io_info fio; /* store buffered io info. */
- struct rw_semaphore io_rwsem; /* blocking op for bio */
+ struct f2fs_rwsem io_rwsem; /* blocking op for bio */
spinlock_t io_lock; /* serialize DATA/NODE IOs */
struct list_head io_list; /* track fios */
struct list_head bio_list; /* bio entry list head */
- struct rw_semaphore bio_list_lock; /* lock to protect bio entry list */
+ struct f2fs_rwsem bio_list_lock; /* lock to protect bio entry list */
};
#define FDEV(i) (sbi->devs[i])
@@ -1267,6 +1293,7 @@ enum {
SBI_QUOTA_SKIP_FLUSH, /* skip flushing quota in current CP */
SBI_QUOTA_NEED_REPAIR, /* quota file may be corrupted */
SBI_IS_RESIZEFS, /* resizefs is in process */
+ SBI_IS_FREEZING, /* freezefs is in process */
};
enum {
@@ -1286,6 +1313,7 @@ enum {
GC_IDLE_AT,
GC_URGENT_HIGH,
GC_URGENT_LOW,
+ GC_URGENT_MID,
MAX_GC_MODE,
};
@@ -1571,7 +1599,7 @@ struct f2fs_sb_info {
struct super_block *sb; /* pointer to VFS super block */
struct proc_dir_entry *s_proc; /* proc entry */
struct f2fs_super_block *raw_super; /* raw super block pointer */
- struct rw_semaphore sb_lock; /* lock for raw super block */
+ struct f2fs_rwsem sb_lock; /* lock for raw super block */
int valid_super_block; /* valid super block no */
unsigned long s_flag; /* flags for sbi */
struct mutex writepages; /* mutex for writepages() */
@@ -1591,18 +1619,20 @@ struct f2fs_sb_info {
/* for bio operations */
struct f2fs_bio_info *write_io[NR_PAGE_TYPE]; /* for write bios */
/* keep migration IO order for LFS mode */
- struct rw_semaphore io_order_lock;
+ struct f2fs_rwsem io_order_lock;
mempool_t *write_io_dummy; /* Dummy pages */
+ pgoff_t metapage_eio_ofs; /* EIO page offset */
+ int metapage_eio_cnt; /* EIO count */
/* for checkpoint */
struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */
int cur_cp_pack; /* remain current cp pack */
spinlock_t cp_lock; /* for flag in ckpt */
struct inode *meta_inode; /* cache meta blocks */
- struct rw_semaphore cp_global_sem; /* checkpoint procedure lock */
- struct rw_semaphore cp_rwsem; /* blocking FS operations */
- struct rw_semaphore node_write; /* locking node writes */
- struct rw_semaphore node_change; /* locking node change */
+ struct f2fs_rwsem cp_global_sem; /* checkpoint procedure lock */
+ struct f2fs_rwsem cp_rwsem; /* blocking FS operations */
+ struct f2fs_rwsem node_write; /* locking node writes */
+ struct f2fs_rwsem node_change; /* locking node change */
wait_queue_head_t cp_wait;
unsigned long last_time[MAX_TIME]; /* to store time in jiffies */
long interval_time[MAX_TIME]; /* to store thresholds */
@@ -1662,12 +1692,14 @@ struct f2fs_sb_info {
block_t unusable_block_count; /* # of blocks saved by last cp */
unsigned int nquota_files; /* # of quota sysfile */
- struct rw_semaphore quota_sem; /* blocking cp for flags */
+ struct f2fs_rwsem quota_sem; /* blocking cp for flags */
/* # of pages, see count_type */
atomic_t nr_pages[NR_COUNT_TYPE];
/* # of allocated blocks */
struct percpu_counter alloc_valid_block_count;
+ /* # of node block writes as roll forward recovery */
+ struct percpu_counter rf_node_block_count;
/* writeback control */
atomic_t wb_sync_req[META]; /* count # of WB_SYNC threads */
@@ -1678,7 +1710,7 @@ struct f2fs_sb_info {
struct f2fs_mount_info mount_opt; /* mount options */
/* for cleaning operations */
- struct rw_semaphore gc_lock; /*
+ struct f2fs_rwsem gc_lock; /*
* semaphore for GC, avoid
* race between GC and GC or CP
*/
@@ -1698,7 +1730,7 @@ struct f2fs_sb_info {
/* threshold for gc trials on pinned files */
u64 gc_pin_file_threshold;
- struct rw_semaphore pin_sem;
+ struct f2fs_rwsem pin_sem;
/* maximum # of trials to find a victim segment for SSR and GC */
unsigned int max_victim_search;
@@ -2092,9 +2124,81 @@ static inline void clear_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f)
spin_unlock_irqrestore(&sbi->cp_lock, flags);
}
+#define init_f2fs_rwsem(sem) \
+do { \
+ static struct lock_class_key __key; \
+ \
+ __init_f2fs_rwsem((sem), #sem, &__key); \
+} while (0)
+
+static inline void __init_f2fs_rwsem(struct f2fs_rwsem *sem,
+ const char *sem_name, struct lock_class_key *key)
+{
+ __init_rwsem(&sem->internal_rwsem, sem_name, key);
+#ifdef CONFIG_F2FS_UNFAIR_RWSEM
+ init_waitqueue_head(&sem->read_waiters);
+#endif
+}
+
+static inline int f2fs_rwsem_is_locked(struct f2fs_rwsem *sem)
+{
+ return rwsem_is_locked(&sem->internal_rwsem);
+}
+
+static inline int f2fs_rwsem_is_contended(struct f2fs_rwsem *sem)
+{
+ return rwsem_is_contended(&sem->internal_rwsem);
+}
+
+static inline void f2fs_down_read(struct f2fs_rwsem *sem)
+{
+#ifdef CONFIG_F2FS_UNFAIR_RWSEM
+ wait_event(sem->read_waiters, down_read_trylock(&sem->internal_rwsem));
+#else
+ down_read(&sem->internal_rwsem);
+#endif
+}
+
+static inline int f2fs_down_read_trylock(struct f2fs_rwsem *sem)
+{
+ return down_read_trylock(&sem->internal_rwsem);
+}
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+static inline void f2fs_down_read_nested(struct f2fs_rwsem *sem, int subclass)
+{
+ down_read_nested(&sem->internal_rwsem, subclass);
+}
+#else
+#define f2fs_down_read_nested(sem, subclass) f2fs_down_read(sem)
+#endif
+
+static inline void f2fs_up_read(struct f2fs_rwsem *sem)
+{
+ up_read(&sem->internal_rwsem);
+}
+
+static inline void f2fs_down_write(struct f2fs_rwsem *sem)
+{
+ down_write(&sem->internal_rwsem);
+}
+
+static inline int f2fs_down_write_trylock(struct f2fs_rwsem *sem)
+{
+ return down_write_trylock(&sem->internal_rwsem);
+}
+
+static inline void f2fs_up_write(struct f2fs_rwsem *sem)
+{
+ up_write(&sem->internal_rwsem);
+#ifdef CONFIG_F2FS_UNFAIR_RWSEM
+ wake_up_all(&sem->read_waiters);
+#endif
+}
+
static inline void f2fs_lock_op(struct f2fs_sb_info *sbi)
{
- down_read(&sbi->cp_rwsem);
+ f2fs_down_read(&sbi->cp_rwsem);
}
static inline int f2fs_trylock_op(struct f2fs_sb_info *sbi)
@@ -2103,22 +2207,22 @@ static inline int f2fs_trylock_op(struct f2fs_sb_info *sbi)
f2fs_show_injection_info(sbi, FAULT_LOCK_OP);
return 0;
}
- return down_read_trylock(&sbi->cp_rwsem);
+ return f2fs_down_read_trylock(&sbi->cp_rwsem);
}
static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi)
{
- up_read(&sbi->cp_rwsem);
+ f2fs_up_read(&sbi->cp_rwsem);
}
static inline void f2fs_lock_all(struct f2fs_sb_info *sbi)
{
- down_write(&sbi->cp_rwsem);
+ f2fs_down_write(&sbi->cp_rwsem);
}
static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi)
{
- up_write(&sbi->cp_rwsem);
+ f2fs_up_write(&sbi->cp_rwsem);
}
static inline int __get_cp_reason(struct f2fs_sb_info *sbi)
@@ -2681,6 +2785,9 @@ static inline bool is_idle(struct f2fs_sb_info *sbi, int type)
if (is_inflight_io(sbi, type))
return false;
+ if (sbi->gc_mode == GC_URGENT_MID)
+ return true;
+
if (sbi->gc_mode == GC_URGENT_LOW &&
(type == DISCARD_TIME || type == GC_TIME))
return true;
@@ -3579,7 +3686,8 @@ bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi,
block_t blkaddr, int type);
int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages,
int type, bool sync);
-void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index);
+void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index,
+ unsigned int ra_blocks);
long f2fs_sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
long nr_to_write, enum iostat_type io_type);
void f2fs_add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type);
@@ -3631,7 +3739,7 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio);
int f2fs_merge_page_bio(struct f2fs_io_info *fio);
void f2fs_submit_page_write(struct f2fs_io_info *fio);
struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi,
- block_t blk_addr, struct bio *bio);
+ block_t blk_addr, sector_t *sector);
int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr);
void f2fs_set_data_blkaddr(struct dnode_of_data *dn);
void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr);
@@ -4371,7 +4479,11 @@ static inline bool f2fs_force_buffered_io(struct inode *inode,
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
int rw = iov_iter_rw(iter);
- if (f2fs_post_read_required(inode))
+ if (!fscrypt_dio_supported(iocb, iter))
+ return true;
+ if (fsverity_active(inode))
+ return true;
+ if (f2fs_compressed_file(inode))
return true;
/* disallow direct IO if any of the devices has an unaligned blksize */
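
The f2fs_rwsem wrapper introduced above changes reader admission, not writer locking: with CONFIG_F2FS_UNFAIR_RWSEM, f2fs_down_read() loops on down_read_trylock() under wait_event(), and f2fs_up_write() wakes all read waiters, so readers never queue inside the rwsem and a pending writer is not starved behind them. A minimal userspace model of that policy, assuming POSIX threads (the unfair_* names are illustrative, not the kernel API):

#include <pthread.h>

struct unfair_rwsem {
	pthread_rwlock_t rwsem;		/* the underlying lock */
	pthread_mutex_t wait_lock;	/* protects the waitqueue below */
	pthread_cond_t read_waiters;	/* analogue of sem->read_waiters */
};

#define UNFAIR_RWSEM_INIT { PTHREAD_RWLOCK_INITIALIZER, \
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER }

/* Readers never queue inside the rwlock: they retry trylock until it
 * succeeds, sleeping between attempts (the wait_event() analogue). */
static void unfair_down_read(struct unfair_rwsem *sem)
{
	pthread_mutex_lock(&sem->wait_lock);
	while (pthread_rwlock_tryrdlock(&sem->rwsem) != 0)
		pthread_cond_wait(&sem->read_waiters, &sem->wait_lock);
	pthread_mutex_unlock(&sem->wait_lock);
}

static void unfair_up_read(struct unfair_rwsem *sem)
{
	pthread_rwlock_unlock(&sem->rwsem);
}

static void unfair_down_write(struct unfair_rwsem *sem)
{
	pthread_rwlock_wrlock(&sem->rwsem);
}

/* Writer release broadcasts, matching wake_up_all(&sem->read_waiters). */
static void unfair_up_write(struct unfair_rwsem *sem)
{
	pthread_rwlock_unlock(&sem->rwsem);
	pthread_mutex_lock(&sem->wait_lock);
	pthread_cond_broadcast(&sem->read_waiters);
	pthread_mutex_unlock(&sem->wait_lock);
}

The cost is reader fairness: a steady stream of writers can keep readers retrying, which is why the trylock/read-waiter scheme is a config option rather than the default rwsem behavior.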
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 3c98ef6af97d..68ddf4c7ca64 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -237,13 +237,13 @@ static void try_to_fix_pino(struct inode *inode)
struct f2fs_inode_info *fi = F2FS_I(inode);
nid_t pino;
- down_write(&fi->i_sem);
+ f2fs_down_write(&fi->i_sem);
if (file_wrong_pino(inode) && inode->i_nlink == 1 &&
get_parent_ino(inode, &pino)) {
f2fs_i_pino_write(inode, pino);
file_got_pino(inode);
}
- up_write(&fi->i_sem);
+ f2fs_up_write(&fi->i_sem);
}
static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end,
@@ -318,9 +318,9 @@ go_write:
* Both fdatasync() and fsync() can be recovered after a
* sudden power-off.
*/
- down_read(&F2FS_I(inode)->i_sem);
+ f2fs_down_read(&F2FS_I(inode)->i_sem);
cp_reason = need_do_checkpoint(inode);
- up_read(&F2FS_I(inode)->i_sem);
+ f2fs_up_read(&F2FS_I(inode)->i_sem);
if (cp_reason) {
/* all the dirty node pages should be flushed for POR */
@@ -812,7 +812,7 @@ int f2fs_getattr(struct user_namespace *mnt_userns, const struct path *path,
{
struct inode *inode = d_inode(path->dentry);
struct f2fs_inode_info *fi = F2FS_I(inode);
- struct f2fs_inode *ri;
+ struct f2fs_inode *ri = NULL;
unsigned int flags;
if (f2fs_has_extra_attr(inode) &&
@@ -844,7 +844,7 @@ int f2fs_getattr(struct user_namespace *mnt_userns, const struct path *path,
STATX_ATTR_NODUMP |
STATX_ATTR_VERITY);
- generic_fillattr(&init_user_ns, inode, stat);
+ generic_fillattr(mnt_userns, inode, stat);
/* we need to show initial sectors used for inline_data/dentries */
if ((S_ISREG(inode->i_mode) && f2fs_has_inline_data(inode)) ||
@@ -904,7 +904,7 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
!f2fs_is_compress_backend_ready(inode))
return -EOPNOTSUPP;
- err = setattr_prepare(&init_user_ns, dentry, attr);
+ err = setattr_prepare(mnt_userns, dentry, attr);
if (err)
return err;
@@ -958,7 +958,7 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
return err;
}
- down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
filemap_invalidate_lock(inode->i_mapping);
truncate_setsize(inode, attr->ia_size);
@@ -970,7 +970,7 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
* larger than i_size.
*/
filemap_invalidate_unlock(inode->i_mapping);
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
if (err)
return err;
@@ -980,10 +980,10 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
spin_unlock(&F2FS_I(inode)->i_size_lock);
}
- __setattr_copy(&init_user_ns, inode, attr);
+ __setattr_copy(mnt_userns, inode, attr);
if (attr->ia_valid & ATTR_MODE) {
- err = posix_acl_chmod(&init_user_ns, inode, f2fs_get_inode_mode(inode));
+ err = posix_acl_chmod(mnt_userns, inode, f2fs_get_inode_mode(inode));
if (is_inode_flag_set(inode, FI_ACL_MODE)) {
if (!err)
@@ -1112,7 +1112,7 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
blk_start = (loff_t)pg_start << PAGE_SHIFT;
blk_end = (loff_t)pg_end << PAGE_SHIFT;
- down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
filemap_invalidate_lock(inode->i_mapping);
truncate_pagecache_range(inode, blk_start, blk_end - 1);
@@ -1122,7 +1122,7 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
f2fs_unlock_op(sbi);
filemap_invalidate_unlock(inode->i_mapping);
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
}
}
@@ -1355,7 +1355,7 @@ static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len)
f2fs_balance_fs(sbi, true);
/* avoid gc operation during block exchange */
- down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
filemap_invalidate_lock(inode->i_mapping);
f2fs_lock_op(sbi);
@@ -1365,7 +1365,7 @@ static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len)
f2fs_unlock_op(sbi);
filemap_invalidate_unlock(inode->i_mapping);
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
return ret;
}
@@ -1500,7 +1500,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
unsigned int end_offset;
pgoff_t end;
- down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
filemap_invalidate_lock(mapping);
truncate_pagecache_range(inode,
@@ -1514,7 +1514,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
if (ret) {
f2fs_unlock_op(sbi);
filemap_invalidate_unlock(mapping);
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
goto out;
}
@@ -1526,7 +1526,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
f2fs_unlock_op(sbi);
filemap_invalidate_unlock(mapping);
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
f2fs_balance_fs(sbi, dn.node_changed);
@@ -1600,7 +1600,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
/* avoid gc operation during block exchange */
- down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
filemap_invalidate_lock(mapping);
truncate_pagecache(inode, offset);
@@ -1618,7 +1618,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
f2fs_unlock_op(sbi);
}
filemap_invalidate_unlock(mapping);
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
/* write out all moved pages, if possible */
filemap_invalidate_lock(mapping);
@@ -1674,13 +1674,13 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
next_alloc:
if (has_not_enough_free_secs(sbi, 0,
GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi)))) {
- down_write(&sbi->gc_lock);
+ f2fs_down_write(&sbi->gc_lock);
err = f2fs_gc(sbi, true, false, false, NULL_SEGNO);
if (err && err != -ENODATA && err != -EAGAIN)
goto out_err;
}
- down_write(&sbi->pin_sem);
+ f2fs_down_write(&sbi->pin_sem);
f2fs_lock_op(sbi);
f2fs_allocate_new_section(sbi, CURSEG_COLD_DATA_PINNED, false);
@@ -1690,7 +1690,7 @@ next_alloc:
err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO);
file_dont_truncate(inode);
- up_write(&sbi->pin_sem);
+ f2fs_up_write(&sbi->pin_sem);
expanded += map.m_len;
sec_len -= map.m_len;
@@ -1989,11 +1989,12 @@ static int f2fs_ioc_getversion(struct file *filp, unsigned long arg)
static int f2fs_ioc_start_atomic_write(struct file *filp)
{
struct inode *inode = file_inode(filp);
+ struct user_namespace *mnt_userns = file_mnt_user_ns(filp);
struct f2fs_inode_info *fi = F2FS_I(inode);
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
int ret;
- if (!inode_owner_or_capable(&init_user_ns, inode))
+ if (!inode_owner_or_capable(mnt_userns, inode))
return -EACCES;
if (!S_ISREG(inode->i_mode))
@@ -2008,7 +2009,10 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
inode_lock(inode);
- f2fs_disable_compressed_file(inode);
+ if (!f2fs_disable_compressed_file(inode)) {
+ ret = -EINVAL;
+ goto out;
+ }
if (f2fs_is_atomic_file(inode)) {
if (is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST))
@@ -2020,7 +2024,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
if (ret)
goto out;
- down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
/*
* Should wait end_io to count F2FS_WB_CP_DATA correctly by
@@ -2031,7 +2035,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
inode->i_ino, get_dirty_pages(inode));
ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX);
if (ret) {
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
goto out;
}
@@ -2044,7 +2048,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
/* add inode in inmem_list first and set atomic_file */
set_inode_flag(inode, FI_ATOMIC_FILE);
clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST);
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
F2FS_I(inode)->inmem_task = current;
@@ -2058,9 +2062,10 @@ out:
static int f2fs_ioc_commit_atomic_write(struct file *filp)
{
struct inode *inode = file_inode(filp);
+ struct user_namespace *mnt_userns = file_mnt_user_ns(filp);
int ret;
- if (!inode_owner_or_capable(&init_user_ns, inode))
+ if (!inode_owner_or_capable(mnt_userns, inode))
return -EACCES;
ret = mnt_want_write_file(filp);
@@ -2100,9 +2105,10 @@ err_out:
static int f2fs_ioc_start_volatile_write(struct file *filp)
{
struct inode *inode = file_inode(filp);
+ struct user_namespace *mnt_userns = file_mnt_user_ns(filp);
int ret;
- if (!inode_owner_or_capable(&init_user_ns, inode))
+ if (!inode_owner_or_capable(mnt_userns, inode))
return -EACCES;
if (!S_ISREG(inode->i_mode))
@@ -2135,9 +2141,10 @@ out:
static int f2fs_ioc_release_volatile_write(struct file *filp)
{
struct inode *inode = file_inode(filp);
+ struct user_namespace *mnt_userns = file_mnt_user_ns(filp);
int ret;
- if (!inode_owner_or_capable(&init_user_ns, inode))
+ if (!inode_owner_or_capable(mnt_userns, inode))
return -EACCES;
ret = mnt_want_write_file(filp);
@@ -2164,9 +2171,10 @@ out:
static int f2fs_ioc_abort_volatile_write(struct file *filp)
{
struct inode *inode = file_inode(filp);
+ struct user_namespace *mnt_userns = file_mnt_user_ns(filp);
int ret;
- if (!inode_owner_or_capable(&init_user_ns, inode))
+ if (!inode_owner_or_capable(mnt_userns, inode))
return -EACCES;
ret = mnt_want_write_file(filp);
@@ -2351,7 +2359,7 @@ static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg)
if (err)
return err;
- down_write(&sbi->sb_lock);
+ f2fs_down_write(&sbi->sb_lock);
if (uuid_is_nonzero(sbi->raw_super->encrypt_pw_salt))
goto got_it;
@@ -2370,7 +2378,7 @@ got_it:
16))
err = -EFAULT;
out_err:
- up_write(&sbi->sb_lock);
+ f2fs_up_write(&sbi->sb_lock);
mnt_drop_write_file(filp);
return err;
}
@@ -2447,12 +2455,12 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg)
return ret;
if (!sync) {
- if (!down_write_trylock(&sbi->gc_lock)) {
+ if (!f2fs_down_write_trylock(&sbi->gc_lock)) {
ret = -EBUSY;
goto out;
}
} else {
- down_write(&sbi->gc_lock);
+ f2fs_down_write(&sbi->gc_lock);
}
ret = f2fs_gc(sbi, sync, true, false, NULL_SEGNO);
@@ -2483,12 +2491,12 @@ static int __f2fs_ioc_gc_range(struct file *filp, struct f2fs_gc_range *range)
do_more:
if (!range->sync) {
- if (!down_write_trylock(&sbi->gc_lock)) {
+ if (!f2fs_down_write_trylock(&sbi->gc_lock)) {
ret = -EBUSY;
goto out;
}
} else {
- down_write(&sbi->gc_lock);
+ f2fs_down_write(&sbi->gc_lock);
}
ret = f2fs_gc(sbi, range->sync, true, false,
@@ -2559,10 +2567,6 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
bool fragmented = false;
int err;
- /* if in-place-update policy is enabled, don't waste time here */
- if (f2fs_should_update_inplace(inode, NULL))
- return -EINVAL;
-
pg_start = range->start >> PAGE_SHIFT;
pg_end = (range->start + range->len) >> PAGE_SHIFT;
@@ -2570,6 +2574,13 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
inode_lock(inode);
+ /* if in-place-update policy is enabled, don't waste time here */
+ set_inode_flag(inode, FI_OPU_WRITE);
+ if (f2fs_should_update_inplace(inode, NULL)) {
+ err = -EINVAL;
+ goto out;
+ }
+
/* writeback all dirty pages in the range */
err = filemap_write_and_wait_range(inode->i_mapping, range->start,
range->start + range->len - 1);
@@ -2651,7 +2662,7 @@ do_map:
goto check;
}
- set_inode_flag(inode, FI_DO_DEFRAG);
+ set_inode_flag(inode, FI_SKIP_WRITES);
idx = map.m_lblk;
while (idx < map.m_lblk + map.m_len && cnt < blk_per_seg) {
@@ -2676,15 +2687,16 @@ check:
if (map.m_lblk < pg_end && cnt < blk_per_seg)
goto do_map;
- clear_inode_flag(inode, FI_DO_DEFRAG);
+ clear_inode_flag(inode, FI_SKIP_WRITES);
err = filemap_fdatawrite(inode->i_mapping);
if (err)
goto out;
}
clear_out:
- clear_inode_flag(inode, FI_DO_DEFRAG);
+ clear_inode_flag(inode, FI_SKIP_WRITES);
out:
+ clear_inode_flag(inode, FI_OPU_WRITE);
inode_unlock(inode);
if (!err)
range->len = (u64)total << PAGE_SHIFT;
@@ -2820,10 +2832,10 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in,
f2fs_balance_fs(sbi, true);
- down_write(&F2FS_I(src)->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&F2FS_I(src)->i_gc_rwsem[WRITE]);
if (src != dst) {
ret = -EBUSY;
- if (!down_write_trylock(&F2FS_I(dst)->i_gc_rwsem[WRITE]))
+ if (!f2fs_down_write_trylock(&F2FS_I(dst)->i_gc_rwsem[WRITE]))
goto out_src;
}
@@ -2841,9 +2853,9 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in,
f2fs_unlock_op(sbi);
if (src != dst)
- up_write(&F2FS_I(dst)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(dst)->i_gc_rwsem[WRITE]);
out_src:
- up_write(&F2FS_I(src)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(src)->i_gc_rwsem[WRITE]);
out_unlock:
if (src != dst)
inode_unlock(dst);
@@ -2938,7 +2950,7 @@ static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg)
end_segno = min(start_segno + range.segments, dev_end_segno);
while (start_segno < end_segno) {
- if (!down_write_trylock(&sbi->gc_lock)) {
+ if (!f2fs_down_write_trylock(&sbi->gc_lock)) {
ret = -EBUSY;
goto out;
}
@@ -2990,7 +3002,7 @@ static int f2fs_ioc_setproject(struct inode *inode, __u32 projid)
{
struct f2fs_inode_info *fi = F2FS_I(inode);
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct page *ipage;
+ struct f2fs_inode *ri = NULL;
kprojid_t kprojid;
int err;
@@ -3014,17 +3026,8 @@ static int f2fs_ioc_setproject(struct inode *inode, __u32 projid)
if (IS_NOQUOTA(inode))
return err;
- ipage = f2fs_get_node_page(sbi, inode->i_ino);
- if (IS_ERR(ipage))
- return PTR_ERR(ipage);
-
- if (!F2FS_FITS_IN_INODE(F2FS_INODE(ipage), fi->i_extra_isize,
- i_projid)) {
- err = -EOVERFLOW;
- f2fs_put_page(ipage, 1);
- return err;
- }
- f2fs_put_page(ipage, 1);
+ if (!F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_projid))
+ return -EOVERFLOW;
err = f2fs_dquot_initialize(inode);
if (err)
@@ -3215,9 +3218,9 @@ int f2fs_precache_extents(struct inode *inode)
while (map.m_lblk < end) {
map.m_len = end - map.m_lblk;
- down_write(&fi->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&fi->i_gc_rwsem[WRITE]);
err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_PRECACHE);
- up_write(&fi->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&fi->i_gc_rwsem[WRITE]);
if (err)
return err;
@@ -3294,11 +3297,11 @@ static int f2fs_ioc_getfslabel(struct file *filp, unsigned long arg)
if (!vbuf)
return -ENOMEM;
- down_read(&sbi->sb_lock);
+ f2fs_down_read(&sbi->sb_lock);
count = utf16s_to_utf8s(sbi->raw_super->volume_name,
ARRAY_SIZE(sbi->raw_super->volume_name),
UTF16_LITTLE_ENDIAN, vbuf, MAX_VOLUME_NAME);
- up_read(&sbi->sb_lock);
+ f2fs_up_read(&sbi->sb_lock);
if (copy_to_user((char __user *)arg, vbuf,
min(FSLABEL_MAX, count)))
@@ -3326,7 +3329,7 @@ static int f2fs_ioc_setfslabel(struct file *filp, unsigned long arg)
if (err)
goto out;
- down_write(&sbi->sb_lock);
+ f2fs_down_write(&sbi->sb_lock);
memset(sbi->raw_super->volume_name, 0,
sizeof(sbi->raw_super->volume_name));
@@ -3336,7 +3339,7 @@ static int f2fs_ioc_setfslabel(struct file *filp, unsigned long arg)
err = f2fs_commit_super(sbi, false);
- up_write(&sbi->sb_lock);
+ f2fs_up_write(&sbi->sb_lock);
mnt_drop_write_file(filp);
out:
@@ -3462,7 +3465,7 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg)
if (!atomic_read(&F2FS_I(inode)->i_compr_blocks))
goto out;
- down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
filemap_invalidate_lock(inode->i_mapping);
last_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
@@ -3499,7 +3502,7 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg)
}
filemap_invalidate_unlock(inode->i_mapping);
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
out:
inode_unlock(inode);
@@ -3615,7 +3618,7 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg)
goto unlock_inode;
}
- down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
filemap_invalidate_lock(inode->i_mapping);
last_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
@@ -3652,7 +3655,7 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg)
}
filemap_invalidate_unlock(inode->i_mapping);
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
if (ret >= 0) {
clear_inode_flag(inode, FI_COMPRESS_RELEASED);
@@ -3770,7 +3773,7 @@ static int f2fs_sec_trim_file(struct file *filp, unsigned long arg)
if (ret)
goto err;
- down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
filemap_invalidate_lock(mapping);
ret = filemap_write_and_wait_range(mapping, range.start,
@@ -3859,7 +3862,7 @@ static int f2fs_sec_trim_file(struct file *filp, unsigned long arg)
prev_block, len, range.flags);
out:
filemap_invalidate_unlock(mapping);
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
err:
inode_unlock(inode);
file_end_write(filp);
@@ -4291,12 +4294,12 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
trace_f2fs_direct_IO_enter(inode, iocb, count, READ);
if (iocb->ki_flags & IOCB_NOWAIT) {
- if (!down_read_trylock(&fi->i_gc_rwsem[READ])) {
+ if (!f2fs_down_read_trylock(&fi->i_gc_rwsem[READ])) {
ret = -EAGAIN;
goto out;
}
} else {
- down_read(&fi->i_gc_rwsem[READ]);
+ f2fs_down_read(&fi->i_gc_rwsem[READ]);
}
/*
@@ -4315,7 +4318,7 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
ret = iomap_dio_complete(dio);
}
- up_read(&fi->i_gc_rwsem[READ]);
+ f2fs_up_read(&fi->i_gc_rwsem[READ]);
file_accessed(file);
out:
@@ -4497,12 +4500,12 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from,
goto out;
}
- if (!down_read_trylock(&fi->i_gc_rwsem[WRITE])) {
+ if (!f2fs_down_read_trylock(&fi->i_gc_rwsem[WRITE])) {
ret = -EAGAIN;
goto out;
}
- if (do_opu && !down_read_trylock(&fi->i_gc_rwsem[READ])) {
- up_read(&fi->i_gc_rwsem[WRITE]);
+ if (do_opu && !f2fs_down_read_trylock(&fi->i_gc_rwsem[READ])) {
+ f2fs_up_read(&fi->i_gc_rwsem[WRITE]);
ret = -EAGAIN;
goto out;
}
@@ -4511,9 +4514,9 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from,
if (ret)
goto out;
- down_read(&fi->i_gc_rwsem[WRITE]);
+ f2fs_down_read(&fi->i_gc_rwsem[WRITE]);
if (do_opu)
- down_read(&fi->i_gc_rwsem[READ]);
+ f2fs_down_read(&fi->i_gc_rwsem[READ]);
}
if (whint_mode == WHINT_MODE_OFF)
iocb->ki_hint = WRITE_LIFE_NOT_SET;
@@ -4542,8 +4545,8 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from,
if (whint_mode == WHINT_MODE_OFF)
iocb->ki_hint = hint;
if (do_opu)
- up_read(&fi->i_gc_rwsem[READ]);
- up_read(&fi->i_gc_rwsem[WRITE]);
+ f2fs_up_read(&fi->i_gc_rwsem[READ]);
+ f2fs_up_read(&fi->i_gc_rwsem[WRITE]);
if (ret < 0)
goto out;
@@ -4644,12 +4647,12 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
/* Don't leave any preallocated blocks around past i_size. */
if (preallocated && i_size_read(inode) < target_size) {
- down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
filemap_invalidate_lock(inode->i_mapping);
if (!f2fs_truncate(inode))
file_dont_truncate(inode);
filemap_invalidate_unlock(inode->i_mapping);
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
} else {
file_dont_truncate(inode);
}
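
One invariant worth noting in the DIO paths above: under IOCB_NOWAIT the i_gc_rwsem may only be acquired with f2fs_down_read_trylock(), and failure is reported as -EAGAIN so the caller can retry from a context where blocking is allowed; plain f2fs_down_read() is reserved for blocking callers. A sketch of that rule in isolation, assuming POSIX threads (acquire_gc_read is a hypothetical name, not an f2fs function):

#include <errno.h>
#include <pthread.h>
#include <stdbool.h>

/* NOWAIT callers may only trylock and must surface -EAGAIN;
 * blocking callers take the lock unconditionally. */
static int acquire_gc_read(pthread_rwlock_t *gc_lock, bool nowait)
{
	if (nowait) {
		if (pthread_rwlock_tryrdlock(gc_lock) != 0)
			return -EAGAIN;	/* retried later from a blocking context */
		return 0;
	}
	pthread_rwlock_rdlock(gc_lock);
	return 0;
}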
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index ee308a8de432..ea5b93b689cd 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -103,23 +103,26 @@ static int gc_thread_func(void *data)
sbi->gc_urgent_high_remaining--;
}
spin_unlock(&sbi->gc_urgent_high_lock);
+ }
+ if (sbi->gc_mode == GC_URGENT_HIGH ||
+ sbi->gc_mode == GC_URGENT_MID) {
wait_ms = gc_th->urgent_sleep_time;
- down_write(&sbi->gc_lock);
+ f2fs_down_write(&sbi->gc_lock);
goto do_gc;
}
if (foreground) {
- down_write(&sbi->gc_lock);
+ f2fs_down_write(&sbi->gc_lock);
goto do_gc;
- } else if (!down_write_trylock(&sbi->gc_lock)) {
+ } else if (!f2fs_down_write_trylock(&sbi->gc_lock)) {
stat_other_skip_bggc_count(sbi);
goto next;
}
if (!is_idle(sbi, GC_TIME)) {
increase_sleep_time(gc_th, &wait_ms);
- up_write(&sbi->gc_lock);
+ f2fs_up_write(&sbi->gc_lock);
stat_io_skip_bggc_count(sbi);
goto next;
}
@@ -1038,8 +1041,10 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
set_sbi_flag(sbi, SBI_NEED_FSCK);
}
- if (f2fs_check_nid_range(sbi, dni->ino))
+ if (f2fs_check_nid_range(sbi, dni->ino)) {
+ f2fs_put_page(node_page, 1);
return false;
+ }
*nofs = ofs_of_node(node_page);
source_blkaddr = data_blkaddr(NULL, node_page, ofs_in_node);
@@ -1230,7 +1235,7 @@ static int move_data_block(struct inode *inode, block_t bidx,
fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr;
if (lfs_mode)
- down_write(&fio.sbi->io_order_lock);
+ f2fs_down_write(&fio.sbi->io_order_lock);
mpage = f2fs_grab_cache_page(META_MAPPING(fio.sbi),
fio.old_blkaddr, false);
@@ -1316,7 +1321,7 @@ recover_block:
true, true, true);
up_out:
if (lfs_mode)
- up_write(&fio.sbi->io_order_lock);
+ f2fs_up_write(&fio.sbi->io_order_lock);
put_out:
f2fs_put_dnode(&dn);
out:
@@ -1475,7 +1480,7 @@ next_step:
special_file(inode->i_mode))
continue;
- if (!down_write_trylock(
+ if (!f2fs_down_write_trylock(
&F2FS_I(inode)->i_gc_rwsem[WRITE])) {
iput(inode);
sbi->skipped_gc_rwsem++;
@@ -1488,7 +1493,7 @@ next_step:
if (f2fs_post_read_required(inode)) {
int err = ra_data_block(inode, start_bidx);
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
if (err) {
iput(inode);
continue;
@@ -1499,7 +1504,7 @@ next_step:
data_page = f2fs_get_read_data_page(inode,
start_bidx, REQ_RAHEAD, true);
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
if (IS_ERR(data_page)) {
iput(inode);
continue;
@@ -1518,14 +1523,14 @@ next_step:
int err;
if (S_ISREG(inode->i_mode)) {
- if (!down_write_trylock(&fi->i_gc_rwsem[READ])) {
+ if (!f2fs_down_write_trylock(&fi->i_gc_rwsem[READ])) {
sbi->skipped_gc_rwsem++;
continue;
}
- if (!down_write_trylock(
+ if (!f2fs_down_write_trylock(
&fi->i_gc_rwsem[WRITE])) {
sbi->skipped_gc_rwsem++;
- up_write(&fi->i_gc_rwsem[READ]);
+ f2fs_up_write(&fi->i_gc_rwsem[READ]);
continue;
}
locked = true;
@@ -1548,8 +1553,8 @@ next_step:
submitted++;
if (locked) {
- up_write(&fi->i_gc_rwsem[WRITE]);
- up_write(&fi->i_gc_rwsem[READ]);
+ f2fs_up_write(&fi->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&fi->i_gc_rwsem[READ]);
}
stat_inc_data_blk_count(sbi, 1, gc_type);
@@ -1807,7 +1812,7 @@ stop:
reserved_segments(sbi),
prefree_segments(sbi));
- up_write(&sbi->gc_lock);
+ f2fs_up_write(&sbi->gc_lock);
put_gc_inode(&gc_list);
@@ -1936,7 +1941,7 @@ static void update_sb_metadata(struct f2fs_sb_info *sbi, int secs)
long long block_count;
int segs = secs * sbi->segs_per_sec;
- down_write(&sbi->sb_lock);
+ f2fs_down_write(&sbi->sb_lock);
section_count = le32_to_cpu(raw_sb->section_count);
segment_count = le32_to_cpu(raw_sb->segment_count);
@@ -1957,7 +1962,7 @@ static void update_sb_metadata(struct f2fs_sb_info *sbi, int secs)
cpu_to_le32(dev_segs + segs);
}
- up_write(&sbi->sb_lock);
+ f2fs_up_write(&sbi->sb_lock);
}
static void update_fs_metadata(struct f2fs_sb_info *sbi, int secs)
@@ -2031,7 +2036,7 @@ int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count)
secs = div_u64(shrunk_blocks, BLKS_PER_SEC(sbi));
/* stop other GC */
- if (!down_write_trylock(&sbi->gc_lock))
+ if (!f2fs_down_write_trylock(&sbi->gc_lock))
return -EAGAIN;
/* stop CP to protect MAIN_SEC in free_segment_range */
@@ -2051,15 +2056,15 @@ int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count)
out_unlock:
f2fs_unlock_op(sbi);
- up_write(&sbi->gc_lock);
+ f2fs_up_write(&sbi->gc_lock);
if (err)
return err;
set_sbi_flag(sbi, SBI_IS_RESIZEFS);
freeze_super(sbi->sb);
- down_write(&sbi->gc_lock);
- down_write(&sbi->cp_global_sem);
+ f2fs_down_write(&sbi->gc_lock);
+ f2fs_down_write(&sbi->cp_global_sem);
spin_lock(&sbi->stat_lock);
if (shrunk_blocks + valid_user_blocks(sbi) +
@@ -2104,8 +2109,8 @@ recover_out:
spin_unlock(&sbi->stat_lock);
}
out_err:
- up_write(&sbi->cp_global_sem);
- up_write(&sbi->gc_lock);
+ f2fs_up_write(&sbi->cp_global_sem);
+ f2fs_up_write(&sbi->gc_lock);
thaw_super(sbi->sb);
clear_sbi_flag(sbi, SBI_IS_RESIZEFS);
return err;
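
Two behavioral changes ride along with the lock conversion in gc.c: GC_URGENT_MID now shares the urgent path with GC_URGENT_HIGH in gc_thread_func() (short sleep, unconditional f2fs_down_write(&sbi->gc_lock)), and is_idle() treats GC_URGENT_MID as always idle, so background GC keeps running even when the discard/GC timers would say otherwise. A condensed restatement of the is_idle() branch, with the inflight-I/O check omitted and numeric values assumed for illustration:

#define GC_URGENT_HIGH	4	/* assumed values; see the enum diff above */
#define GC_URGENT_LOW	5
#define GC_URGENT_MID	6
#define DISCARD_TIME	0
#define GC_TIME		1

static int gc_mode_is_idle(int mode, int type)
{
	if (mode == GC_URGENT_MID)
		return 1;		/* MID: always report idle */
	if (mode == GC_URGENT_LOW &&
	    (type == DISCARD_TIME || type == GC_TIME))
		return 1;		/* LOW: idle only for discard/GC timing */
	return 0;			/* otherwise fall back to timer checks */
}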
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 4b5cefa3f90c..a578bf83b803 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -629,7 +629,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct f2fs_filename *fname,
}
if (inode) {
- down_write(&F2FS_I(inode)->i_sem);
+ f2fs_down_write(&F2FS_I(inode)->i_sem);
page = f2fs_init_inode_metadata(inode, dir, fname, ipage);
if (IS_ERR(page)) {
err = PTR_ERR(page);
@@ -658,7 +658,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct f2fs_filename *fname,
f2fs_update_parent_metadata(dir, inode, 0);
fail:
if (inode)
- up_write(&F2FS_I(inode)->i_sem);
+ f2fs_up_write(&F2FS_I(inode)->i_sem);
out:
f2fs_put_page(ipage, 1);
return err;
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 0ec8e32a00b4..71f232dcf3c2 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -778,7 +778,8 @@ void f2fs_evict_inode(struct inode *inode)
f2fs_remove_ino_entry(sbi, inode->i_ino, UPDATE_INO);
f2fs_remove_ino_entry(sbi, inode->i_ino, FLUSH_INO);
- sb_start_intwrite(inode->i_sb);
+ if (!is_sbi_flag_set(sbi, SBI_IS_FREEZING))
+ sb_start_intwrite(inode->i_sb);
set_inode_flag(inode, FI_NO_ALLOC);
i_size_write(inode, 0);
retry:
@@ -809,7 +810,8 @@ retry:
if (dquot_initialize_needed(inode))
set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR);
}
- sb_end_intwrite(inode->i_sb);
+ if (!is_sbi_flag_set(sbi, SBI_IS_FREEZING))
+ sb_end_intwrite(inode->i_sb);
no_delete:
dquot_drop(inode);
@@ -885,6 +887,7 @@ void f2fs_handle_failed_inode(struct inode *inode)
err = f2fs_get_node_info(sbi, inode->i_ino, &ni, false);
if (err) {
set_sbi_flag(sbi, SBI_NEED_FSCK);
+ set_inode_flag(inode, FI_FREE_NID);
f2fs_warn(sbi, "May lose orphan inode, run fsck to fix.");
goto out;
}
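
The new SBI_IS_FREEZING checks in f2fs_evict_inode() exist so that inodes evicted while freezefs is underway do not call sb_start_intwrite() against a superblock whose internal writes are already being frozen, which would deadlock the freeze. The guard reduced to a toy model (the names and the counter are illustrative, not the VFS API):

#include <stdbool.h>

struct sb_model {
	bool freezing;	/* analogue of SBI_IS_FREEZING */
	int intwrite;	/* analogue of the sb_start_intwrite() level */
};

static void evict_inode_model(struct sb_model *sb)
{
	if (!sb->freezing)
		sb->intwrite++;		/* sb_start_intwrite() */

	/* ... truncate blocks, drop xattrs, remove the orphan entry ... */

	if (!sb->freezing)
		sb->intwrite--;		/* sb_end_intwrite() */
}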
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 5f213f05556d..5ed79b29999f 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -22,7 +22,8 @@
#include "acl.h"
#include <trace/events/f2fs.h>
-static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
+static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns,
+ struct inode *dir, umode_t mode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
nid_t ino;
@@ -46,7 +47,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
nid_free = true;
- inode_init_owner(&init_user_ns, inode, dir, mode);
+ inode_init_owner(mnt_userns, inode, dir, mode);
inode->i_ino = ino;
inode->i_blocks = 0;
@@ -67,7 +68,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
(F2FS_I(dir)->i_flags & F2FS_PROJINHERIT_FL))
F2FS_I(inode)->i_projid = F2FS_I(dir)->i_projid;
else
- F2FS_I(inode)->i_projid = make_kprojid(&init_user_ns,
+ F2FS_I(inode)->i_projid = make_kprojid(mnt_userns,
F2FS_DEF_PROJID);
err = fscrypt_prepare_new_inode(dir, inode, &encrypt);
@@ -196,7 +197,7 @@ static inline void set_file_temperature(struct f2fs_sb_info *sbi, struct inode *
__u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list;
int i, cold_count, hot_count;
- down_read(&sbi->sb_lock);
+ f2fs_down_read(&sbi->sb_lock);
cold_count = le32_to_cpu(sbi->raw_super->extension_count);
hot_count = sbi->raw_super->hot_ext_count;
@@ -206,7 +207,7 @@ static inline void set_file_temperature(struct f2fs_sb_info *sbi, struct inode *
break;
}
- up_read(&sbi->sb_lock);
+ f2fs_up_read(&sbi->sb_lock);
if (i == cold_count + hot_count)
return;
@@ -299,19 +300,19 @@ static void set_compress_inode(struct f2fs_sb_info *sbi, struct inode *inode,
(!ext_cnt && !noext_cnt))
return;
- down_read(&sbi->sb_lock);
+ f2fs_down_read(&sbi->sb_lock);
cold_count = le32_to_cpu(sbi->raw_super->extension_count);
hot_count = sbi->raw_super->hot_ext_count;
for (i = cold_count; i < cold_count + hot_count; i++) {
if (is_extension_exist(name, extlist[i], false)) {
- up_read(&sbi->sb_lock);
+ f2fs_up_read(&sbi->sb_lock);
return;
}
}
- up_read(&sbi->sb_lock);
+ f2fs_up_read(&sbi->sb_lock);
for (i = 0; i < noext_cnt; i++) {
if (is_extension_exist(name, noext[i], false)) {
@@ -349,7 +350,7 @@ static int f2fs_create(struct user_namespace *mnt_userns, struct inode *dir,
if (err)
return err;
- inode = f2fs_new_inode(dir, mode);
+ inode = f2fs_new_inode(mnt_userns, dir, mode);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -679,7 +680,7 @@ static int f2fs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
if (err)
return err;
- inode = f2fs_new_inode(dir, S_IFLNK | S_IRWXUGO);
+ inode = f2fs_new_inode(mnt_userns, dir, S_IFLNK | S_IRWXUGO);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -750,7 +751,7 @@ static int f2fs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
if (err)
return err;
- inode = f2fs_new_inode(dir, S_IFDIR | mode);
+ inode = f2fs_new_inode(mnt_userns, dir, S_IFDIR | mode);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -807,7 +808,7 @@ static int f2fs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
if (err)
return err;
- inode = f2fs_new_inode(dir, mode);
+ inode = f2fs_new_inode(mnt_userns, dir, mode);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -834,8 +835,9 @@ out:
return err;
}
-static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry,
- umode_t mode, struct inode **whiteout)
+static int __f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode,
+ struct inode **whiteout)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
struct inode *inode;
@@ -845,7 +847,7 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry,
if (err)
return err;
- inode = f2fs_new_inode(dir, mode);
+ inode = f2fs_new_inode(mnt_userns, dir, mode);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -909,20 +911,22 @@ static int f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
if (!f2fs_is_checkpoint_ready(sbi))
return -ENOSPC;
- return __f2fs_tmpfile(dir, dentry, mode, NULL);
+ return __f2fs_tmpfile(mnt_userns, dir, dentry, mode, NULL);
}
-static int f2fs_create_whiteout(struct inode *dir, struct inode **whiteout)
+static int f2fs_create_whiteout(struct user_namespace *mnt_userns,
+ struct inode *dir, struct inode **whiteout)
{
if (unlikely(f2fs_cp_error(F2FS_I_SB(dir))))
return -EIO;
- return __f2fs_tmpfile(dir, NULL, S_IFCHR | WHITEOUT_MODE, whiteout);
+ return __f2fs_tmpfile(mnt_userns, dir, NULL,
+ S_IFCHR | WHITEOUT_MODE, whiteout);
}
-static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry,
- unsigned int flags)
+static int f2fs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
+ struct dentry *old_dentry, struct inode *new_dir,
+ struct dentry *new_dentry, unsigned int flags)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(old_dir);
struct inode *old_inode = d_inode(old_dentry);
@@ -960,7 +964,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
}
if (flags & RENAME_WHITEOUT) {
- err = f2fs_create_whiteout(old_dir, &whiteout);
+ err = f2fs_create_whiteout(mnt_userns, old_dir, &whiteout);
if (err)
return err;
}
@@ -1023,11 +1027,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
new_page = NULL;
new_inode->i_ctime = current_time(new_inode);
- down_write(&F2FS_I(new_inode)->i_sem);
+ f2fs_down_write(&F2FS_I(new_inode)->i_sem);
if (old_dir_entry)
f2fs_i_links_write(new_inode, false);
f2fs_i_links_write(new_inode, false);
- up_write(&F2FS_I(new_inode)->i_sem);
+ f2fs_up_write(&F2FS_I(new_inode)->i_sem);
if (!new_inode->i_nlink)
f2fs_add_orphan_inode(new_inode);
@@ -1048,13 +1052,13 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
f2fs_i_links_write(new_dir, true);
}
- down_write(&F2FS_I(old_inode)->i_sem);
+ f2fs_down_write(&F2FS_I(old_inode)->i_sem);
if (!old_dir_entry || whiteout)
file_lost_pino(old_inode);
else
/* adjust dir's i_pino to pass fsck check */
f2fs_i_pino_write(old_inode, new_dir->i_ino);
- up_write(&F2FS_I(old_inode)->i_sem);
+ f2fs_up_write(&F2FS_I(old_inode)->i_sem);
old_inode->i_ctime = current_time(old_inode);
f2fs_mark_inode_dirty_sync(old_inode, false);
@@ -1107,8 +1111,7 @@ out_dir:
out_old:
f2fs_put_page(old_page, 0);
out:
- if (whiteout)
- iput(whiteout);
+ iput(whiteout);
return err;
}
@@ -1214,38 +1217,38 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
/* update directory entry info of old dir inode */
f2fs_set_link(old_dir, old_entry, old_page, new_inode);
- down_write(&F2FS_I(old_inode)->i_sem);
+ f2fs_down_write(&F2FS_I(old_inode)->i_sem);
if (!old_dir_entry)
file_lost_pino(old_inode);
else
/* adjust dir's i_pino to pass fsck check */
f2fs_i_pino_write(old_inode, new_dir->i_ino);
- up_write(&F2FS_I(old_inode)->i_sem);
+ f2fs_up_write(&F2FS_I(old_inode)->i_sem);
old_dir->i_ctime = current_time(old_dir);
if (old_nlink) {
- down_write(&F2FS_I(old_dir)->i_sem);
+ f2fs_down_write(&F2FS_I(old_dir)->i_sem);
f2fs_i_links_write(old_dir, old_nlink > 0);
- up_write(&F2FS_I(old_dir)->i_sem);
+ f2fs_up_write(&F2FS_I(old_dir)->i_sem);
}
f2fs_mark_inode_dirty_sync(old_dir, false);
/* update directory entry info of new dir inode */
f2fs_set_link(new_dir, new_entry, new_page, old_inode);
- down_write(&F2FS_I(new_inode)->i_sem);
+ f2fs_down_write(&F2FS_I(new_inode)->i_sem);
if (!new_dir_entry)
file_lost_pino(new_inode);
else
/* adjust dir's i_pino to pass fsck check */
f2fs_i_pino_write(new_inode, old_dir->i_ino);
- up_write(&F2FS_I(new_inode)->i_sem);
+ f2fs_up_write(&F2FS_I(new_inode)->i_sem);
new_dir->i_ctime = current_time(new_dir);
if (new_nlink) {
- down_write(&F2FS_I(new_dir)->i_sem);
+ f2fs_down_write(&F2FS_I(new_dir)->i_sem);
f2fs_i_links_write(new_dir, new_nlink > 0);
- up_write(&F2FS_I(new_dir)->i_sem);
+ f2fs_up_write(&F2FS_I(new_dir)->i_sem);
}
f2fs_mark_inode_dirty_sync(new_dir, false);
@@ -1300,7 +1303,8 @@ static int f2fs_rename2(struct user_namespace *mnt_userns,
* VFS has already handled the new dentry existence case,
* here, we just deal with "RENAME_NOREPLACE" as regular rename.
*/
- return f2fs_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
+ return f2fs_rename(mnt_userns, old_dir, old_dentry,
+ new_dir, new_dentry, flags);
}
static const char *f2fs_encrypted_get_link(struct dentry *dentry,
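
The common thread in the namei.c hunks is plumbing: every entry point now passes the mount's user namespace down to f2fs_new_inode(), __f2fs_tmpfile() and f2fs_create_whiteout(), so that inode ownership and the default project ID are derived through an idmapped mount instead of being pinned to init_user_ns. A rough, runnable userspace model of what such a mapping changes (all names and the mapping direction here are illustrative):

#include <stdio.h>

/* Per-mount uid mapping: [base_in, base_in+count) -> [base_out, ...) */
struct userns_map { unsigned base_in, base_out, count; };

static unsigned map_uid(const struct userns_map *m, unsigned uid)
{
	if (uid >= m->base_in && uid < m->base_in + m->count)
		return m->base_out + (uid - m->base_in);
	return (unsigned)-1;	/* unmapped, like an invalid kuid */
}

int main(void)
{
	/* a mount mapping uids 0..999 to 100000..100999 */
	struct userns_map mnt = { 0, 100000, 1000 };

	/* before: creator uid taken as-is (init_user_ns identity map);
	 * after: it is translated through the mount's namespace */
	printf("on-disk uid for creator 42: %u\n", map_uid(&mnt, 42));
	return 0;
}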
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 50b2874e758c..fe8c38c9ac77 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -382,14 +382,14 @@ int f2fs_need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid)
struct nat_entry *e;
bool need = false;
- down_read(&nm_i->nat_tree_lock);
+ f2fs_down_read(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, nid);
if (e) {
if (!get_nat_flag(e, IS_CHECKPOINTED) &&
!get_nat_flag(e, HAS_FSYNCED_INODE))
need = true;
}
- up_read(&nm_i->nat_tree_lock);
+ f2fs_up_read(&nm_i->nat_tree_lock);
return need;
}
@@ -399,11 +399,11 @@ bool f2fs_is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
struct nat_entry *e;
bool is_cp = true;
- down_read(&nm_i->nat_tree_lock);
+ f2fs_down_read(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, nid);
if (e && !get_nat_flag(e, IS_CHECKPOINTED))
is_cp = false;
- up_read(&nm_i->nat_tree_lock);
+ f2fs_up_read(&nm_i->nat_tree_lock);
return is_cp;
}
@@ -413,13 +413,13 @@ bool f2fs_need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino)
struct nat_entry *e;
bool need_update = true;
- down_read(&nm_i->nat_tree_lock);
+ f2fs_down_read(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, ino);
if (e && get_nat_flag(e, HAS_LAST_FSYNC) &&
(get_nat_flag(e, IS_CHECKPOINTED) ||
get_nat_flag(e, HAS_FSYNCED_INODE)))
need_update = false;
- up_read(&nm_i->nat_tree_lock);
+ f2fs_up_read(&nm_i->nat_tree_lock);
return need_update;
}
@@ -431,14 +431,14 @@ static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid,
struct nat_entry *new, *e;
/* Let's mitigate lock contention of nat_tree_lock during checkpoint */
- if (rwsem_is_locked(&sbi->cp_global_sem))
+ if (f2fs_rwsem_is_locked(&sbi->cp_global_sem))
return;
new = __alloc_nat_entry(sbi, nid, false);
if (!new)
return;
- down_write(&nm_i->nat_tree_lock);
+ f2fs_down_write(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, nid);
if (!e)
e = __init_nat_entry(nm_i, new, ne, false);
@@ -447,7 +447,7 @@ static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid,
nat_get_blkaddr(e) !=
le32_to_cpu(ne->block_addr) ||
nat_get_version(e) != ne->version);
- up_write(&nm_i->nat_tree_lock);
+ f2fs_up_write(&nm_i->nat_tree_lock);
if (e != new)
__free_nat_entry(new);
}
@@ -459,7 +459,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
struct nat_entry *e;
struct nat_entry *new = __alloc_nat_entry(sbi, ni->nid, true);
- down_write(&nm_i->nat_tree_lock);
+ f2fs_down_write(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, ni->nid);
if (!e) {
e = __init_nat_entry(nm_i, new, NULL, true);
@@ -508,7 +508,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
set_nat_flag(e, HAS_FSYNCED_INODE, true);
set_nat_flag(e, HAS_LAST_FSYNC, fsync_done);
}
- up_write(&nm_i->nat_tree_lock);
+ f2fs_up_write(&nm_i->nat_tree_lock);
}
int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
@@ -516,7 +516,7 @@ int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
struct f2fs_nm_info *nm_i = NM_I(sbi);
int nr = nr_shrink;
- if (!down_write_trylock(&nm_i->nat_tree_lock))
+ if (!f2fs_down_write_trylock(&nm_i->nat_tree_lock))
return 0;
spin_lock(&nm_i->nat_list_lock);
@@ -538,7 +538,7 @@ int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
}
spin_unlock(&nm_i->nat_list_lock);
- up_write(&nm_i->nat_tree_lock);
+ f2fs_up_write(&nm_i->nat_tree_lock);
return nr - nr_shrink;
}
@@ -560,13 +560,13 @@ int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid,
ni->nid = nid;
retry:
/* Check nat cache */
- down_read(&nm_i->nat_tree_lock);
+ f2fs_down_read(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, nid);
if (e) {
ni->ino = nat_get_ino(e);
ni->blk_addr = nat_get_blkaddr(e);
ni->version = nat_get_version(e);
- up_read(&nm_i->nat_tree_lock);
+ f2fs_up_read(&nm_i->nat_tree_lock);
return 0;
}
@@ -576,11 +576,11 @@ retry:
* nat_tree_lock. Therefore, we should retry if we failed to grab it here,
* while not bothering the checkpoint.
*/
- if (!rwsem_is_locked(&sbi->cp_global_sem) || checkpoint_context) {
+ if (!f2fs_rwsem_is_locked(&sbi->cp_global_sem) || checkpoint_context) {
down_read(&curseg->journal_rwsem);
- } else if (rwsem_is_contended(&nm_i->nat_tree_lock) ||
+ } else if (f2fs_rwsem_is_contended(&nm_i->nat_tree_lock) ||
!down_read_trylock(&curseg->journal_rwsem)) {
- up_read(&nm_i->nat_tree_lock);
+ f2fs_up_read(&nm_i->nat_tree_lock);
goto retry;
}
@@ -589,15 +589,15 @@ retry:
ne = nat_in_journal(journal, i);
node_info_from_raw_nat(ni, &ne);
}
- up_read(&curseg->journal_rwsem);
+ up_read(&curseg->journal_rwsem);
if (i >= 0) {
- up_read(&nm_i->nat_tree_lock);
+ f2fs_up_read(&nm_i->nat_tree_lock);
goto cache;
}
/* Fill node_info from nat page */
index = current_nat_addr(sbi, nid);
- up_read(&nm_i->nat_tree_lock);
+ f2fs_up_read(&nm_i->nat_tree_lock);
page = f2fs_get_meta_page(sbi, index);
if (IS_ERR(page))
@@ -1609,17 +1609,17 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
goto redirty_out;
if (wbc->for_reclaim) {
- if (!down_read_trylock(&sbi->node_write))
+ if (!f2fs_down_read_trylock(&sbi->node_write))
goto redirty_out;
} else {
- down_read(&sbi->node_write);
+ f2fs_down_read(&sbi->node_write);
}
/* This page is already truncated */
if (unlikely(ni.blk_addr == NULL_ADDR)) {
ClearPageUptodate(page);
dec_page_count(sbi, F2FS_DIRTY_NODES);
- up_read(&sbi->node_write);
+ f2fs_up_read(&sbi->node_write);
unlock_page(page);
return 0;
}
@@ -1627,7 +1627,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
if (__is_valid_data_blkaddr(ni.blk_addr) &&
!f2fs_is_valid_blkaddr(sbi, ni.blk_addr,
DATA_GENERIC_ENHANCE)) {
- up_read(&sbi->node_write);
+ f2fs_up_read(&sbi->node_write);
goto redirty_out;
}
@@ -1648,7 +1648,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
f2fs_do_write_node_page(nid, &fio);
set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page));
dec_page_count(sbi, F2FS_DIRTY_NODES);
- up_read(&sbi->node_write);
+ f2fs_up_read(&sbi->node_write);
if (wbc->for_reclaim) {
f2fs_submit_merged_write_cond(sbi, NULL, page, 0, NODE);
@@ -1782,6 +1782,7 @@ continue_unlock:
if (!atomic || page == last_page) {
set_fsync_mark(page, 1);
+ percpu_counter_inc(&sbi->rf_node_block_count);
if (IS_INODE(page)) {
if (is_inode_flag_set(inode,
FI_DIRTY_INODE))
@@ -2111,8 +2112,12 @@ static int f2fs_write_node_pages(struct address_space *mapping,
if (wbc->sync_mode == WB_SYNC_ALL)
atomic_inc(&sbi->wb_sync_req[NODE]);
- else if (atomic_read(&sbi->wb_sync_req[NODE]))
+ else if (atomic_read(&sbi->wb_sync_req[NODE])) {
+ /* to avoid potential deadlock */
+ if (current->plug)
+ blk_finish_plug(current->plug);
goto skip_write;
+ }
trace_f2fs_writepages(mapping->host, wbc, NODE);
@@ -2225,14 +2230,14 @@ bool f2fs_nat_bitmap_enabled(struct f2fs_sb_info *sbi)
unsigned int i;
bool ret = true;
- down_read(&nm_i->nat_tree_lock);
+ f2fs_down_read(&nm_i->nat_tree_lock);
for (i = 0; i < nm_i->nat_blocks; i++) {
if (!test_bit_le(i, nm_i->nat_block_bitmap)) {
ret = false;
break;
}
}
- up_read(&nm_i->nat_tree_lock);
+ f2fs_up_read(&nm_i->nat_tree_lock);
return ret;
}
@@ -2415,7 +2420,7 @@ static void scan_free_nid_bits(struct f2fs_sb_info *sbi)
unsigned int i, idx;
nid_t nid;
- down_read(&nm_i->nat_tree_lock);
+ f2fs_down_read(&nm_i->nat_tree_lock);
for (i = 0; i < nm_i->nat_blocks; i++) {
if (!test_bit_le(i, nm_i->nat_block_bitmap))
@@ -2438,7 +2443,7 @@ static void scan_free_nid_bits(struct f2fs_sb_info *sbi)
out:
scan_curseg_cache(sbi);
- up_read(&nm_i->nat_tree_lock);
+ f2fs_up_read(&nm_i->nat_tree_lock);
}
static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi,
@@ -2473,7 +2478,7 @@ static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi,
f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES,
META_NAT, true);
- down_read(&nm_i->nat_tree_lock);
+ f2fs_down_read(&nm_i->nat_tree_lock);
while (1) {
if (!test_bit_le(NAT_BLOCK_OFFSET(nid),
@@ -2488,7 +2493,7 @@ static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi,
}
if (ret) {
- up_read(&nm_i->nat_tree_lock);
+ f2fs_up_read(&nm_i->nat_tree_lock);
f2fs_err(sbi, "NAT is corrupt, run fsck to fix it");
return ret;
}
@@ -2508,7 +2513,7 @@ static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi,
/* find free nids from current sum_pages */
scan_curseg_cache(sbi);
- up_read(&nm_i->nat_tree_lock);
+ f2fs_up_read(&nm_i->nat_tree_lock);
f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid),
nm_i->ra_nid_pages, META_NAT, false);
@@ -2953,7 +2958,7 @@ void f2fs_enable_nat_bits(struct f2fs_sb_info *sbi)
struct f2fs_nm_info *nm_i = NM_I(sbi);
unsigned int nat_ofs;
- down_read(&nm_i->nat_tree_lock);
+ f2fs_down_read(&nm_i->nat_tree_lock);
for (nat_ofs = 0; nat_ofs < nm_i->nat_blocks; nat_ofs++) {
unsigned int valid = 0, nid_ofs = 0;
@@ -2973,7 +2978,7 @@ void f2fs_enable_nat_bits(struct f2fs_sb_info *sbi)
__update_nat_bits(nm_i, nat_ofs, valid);
}
- up_read(&nm_i->nat_tree_lock);
+ f2fs_up_read(&nm_i->nat_tree_lock);
}
static int __flush_nat_entry_set(struct f2fs_sb_info *sbi,
@@ -3071,15 +3076,15 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
* nat_cnt[DIRTY_NAT].
*/
if (cpc->reason & CP_UMOUNT) {
- down_write(&nm_i->nat_tree_lock);
+ f2fs_down_write(&nm_i->nat_tree_lock);
remove_nats_in_journal(sbi);
- up_write(&nm_i->nat_tree_lock);
+ f2fs_up_write(&nm_i->nat_tree_lock);
}
if (!nm_i->nat_cnt[DIRTY_NAT])
return 0;
- down_write(&nm_i->nat_tree_lock);
+ f2fs_down_write(&nm_i->nat_tree_lock);
/*
* if there is not enough space in the journal to store dirty nat
@@ -3108,7 +3113,7 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
break;
}
- up_write(&nm_i->nat_tree_lock);
+ f2fs_up_write(&nm_i->nat_tree_lock);
/* Allow dirty nats by node block allocation in write_begin */
return err;
@@ -3218,6 +3223,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
nm_i->ram_thresh = DEF_RAM_THRESHOLD;
nm_i->ra_nid_pages = DEF_RA_NID_PAGES;
nm_i->dirty_nats_ratio = DEF_DIRTY_NAT_RATIO_THRESHOLD;
+ nm_i->max_rf_node_blocks = DEF_RF_NODE_BLOCKS;
INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC);
INIT_LIST_HEAD(&nm_i->free_nid_list);
@@ -3228,7 +3234,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
mutex_init(&nm_i->build_lock);
spin_lock_init(&nm_i->nid_list_lock);
- init_rwsem(&nm_i->nat_tree_lock);
+ init_f2fs_rwsem(&nm_i->nat_tree_lock);
nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid);
nm_i->bitmap_size = __bitmap_size(sbi, NAT_BITMAP);
@@ -3334,7 +3340,7 @@ void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi)
spin_unlock(&nm_i->nid_list_lock);
/* destroy nat cache */
- down_write(&nm_i->nat_tree_lock);
+ f2fs_down_write(&nm_i->nat_tree_lock);
while ((found = __gang_lookup_nat_cache(nm_i,
nid, NATVEC_SIZE, natvec))) {
unsigned idx;
@@ -3364,7 +3370,7 @@ void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi)
kmem_cache_free(nat_entry_set_slab, setvec[idx]);
}
}
- up_write(&nm_i->nat_tree_lock);
+ f2fs_up_write(&nm_i->nat_tree_lock);
kvfree(nm_i->nat_block_bitmap);
if (nm_i->free_nid_bitmap) {
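
node.c also starts accounting roll-forward work: every node block that gets a fsync mark bumps sbi->rf_node_block_count, and f2fs_space_for_roll_forward() (recovery.c below) compares its sum against max_rf_node_blocks so that fsync falls back to a checkpoint once the roll-forward budget is exhausted. A per-CPU counter fits because increments are hot while exact sums are needed only at that decision point. A small userspace analogue of the split, assuming GCC/Clang atomic builtins (names illustrative):

#define NSLOTS 8	/* stand-in for the number of CPUs */

struct pcounter {
	long slot[NSLOTS];	/* padded a cache line apart in real code */
};

/* Hot path: bump one slot, no shared-cacheline contention. */
static void pcounter_inc(struct pcounter *c, int cpu)
{
	__atomic_fetch_add(&c->slot[cpu % NSLOTS], 1, __ATOMIC_RELAXED);
}

/* Cold path: sum all slots, clamped at zero like
 * percpu_counter_sum_positive(). */
static long pcounter_sum_positive(const struct pcounter *c)
{
	long sum = 0;

	for (int i = 0; i < NSLOTS; i++)
		sum += __atomic_load_n(&c->slot[i], __ATOMIC_RELAXED);
	return sum > 0 ? sum : 0;
}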
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index 18b98cf0465b..4c1d34bfea78 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -31,6 +31,9 @@
/* control total # of nats */
#define DEF_NAT_CACHE_THRESHOLD 100000
+/* control total # of node writes used for roll-forward recovery */
+#define DEF_RF_NODE_BLOCKS 0
+
/* vector size for gang look-up from nat cache that consists of radix tree */
#define NATVEC_SIZE 64
#define SETVEC_SIZE 32
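
Note the default: DEF_RF_NODE_BLOCKS is 0, and the new check in f2fs_space_for_roll_forward() (next diff) only fires when max_rf_node_blocks is nonzero, so the cap ships disabled and is presumably meant to be raised through sysfs (sysfs.c is also touched by this series). The admission gate reduced to one predicate (hypothetical name):

/* Zero limit disables the roll-forward budget entirely (the default). */
static int rf_within_budget(long pending_rf_blocks, long max_rf_blocks)
{
	return max_rf_blocks == 0 || pending_rf_blocks < max_rf_blocks;
}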
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 79773d322c47..3cb7f8a43b4d 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -56,6 +56,10 @@ bool f2fs_space_for_roll_forward(struct f2fs_sb_info *sbi)
if (sbi->last_valid_block_count + nalloc > sbi->user_block_count)
return false;
+ if (NM_I(sbi)->max_rf_node_blocks &&
+ percpu_counter_sum_positive(&sbi->rf_node_block_count) >=
+ NM_I(sbi)->max_rf_node_blocks)
+ return false;
return true;
}
@@ -343,6 +347,19 @@ static int recover_inode(struct inode *inode, struct page *page)
return 0;
}
+static unsigned int adjust_por_ra_blocks(struct f2fs_sb_info *sbi,
+ unsigned int ra_blocks, unsigned int blkaddr,
+ unsigned int next_blkaddr)
+{
+ if (blkaddr + 1 == next_blkaddr)
+ ra_blocks = min_t(unsigned int, RECOVERY_MAX_RA_BLOCKS,
+ ra_blocks * 2);
+ else if (next_blkaddr % sbi->blocks_per_seg)
+ ra_blocks = max_t(unsigned int, RECOVERY_MIN_RA_BLOCKS,
+ ra_blocks / 2);
+ return ra_blocks;
+}
+
static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head,
bool check_only)
{
@@ -350,6 +367,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head,
struct page *page = NULL;
block_t blkaddr;
unsigned int loop_cnt = 0;
+ unsigned int ra_blocks = RECOVERY_MAX_RA_BLOCKS;
unsigned int free_blocks = MAIN_SEGS(sbi) * sbi->blocks_per_seg -
valid_user_blocks(sbi);
int err = 0;
@@ -424,11 +442,14 @@ next:
break;
}
+ ra_blocks = adjust_por_ra_blocks(sbi, ra_blocks, blkaddr,
+ next_blkaddr_of_node(page));
+
/* check next segment */
blkaddr = next_blkaddr_of_node(page);
f2fs_put_page(page, 1);
- f2fs_ra_meta_pages_cond(sbi, blkaddr);
+ f2fs_ra_meta_pages_cond(sbi, blkaddr, ra_blocks);
}
return err;
}
@@ -704,6 +725,7 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list,
struct page *page = NULL;
int err = 0;
block_t blkaddr;
+ unsigned int ra_blocks = RECOVERY_MAX_RA_BLOCKS;
/* get node pages in the current segment */
curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
@@ -715,8 +737,6 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list,
if (!f2fs_is_valid_blkaddr(sbi, blkaddr, META_POR))
break;
- f2fs_ra_meta_pages_cond(sbi, blkaddr);
-
page = f2fs_get_tmp_page(sbi, blkaddr);
if (IS_ERR(page)) {
err = PTR_ERR(page);
@@ -759,9 +779,14 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list,
if (entry->blkaddr == blkaddr)
list_move_tail(&entry->list, tmp_inode_list);
next:
+ ra_blocks = adjust_por_ra_blocks(sbi, ra_blocks, blkaddr,
+ next_blkaddr_of_node(page));
+
/* check next segment */
blkaddr = next_blkaddr_of_node(page);
f2fs_put_page(page, 1);
+
+ f2fs_ra_meta_pages_cond(sbi, blkaddr, ra_blocks);
}
if (!err)
f2fs_allocate_new_segments(sbi);
@@ -796,7 +821,7 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only)
INIT_LIST_HEAD(&dir_list);
/* prevent checkpoint */
- down_write(&sbi->cp_global_sem);
+ f2fs_down_write(&sbi->cp_global_sem);
/* step #1: find fsynced inode numbers */
err = find_fsync_dnodes(sbi, &inode_list, check_only);
@@ -845,7 +870,7 @@ skip:
if (!err)
clear_sbi_flag(sbi, SBI_POR_DOING);
- up_write(&sbi->cp_global_sem);
+ f2fs_up_write(&sbi->cp_global_sem);
/* let's drop all the directory inodes for clean checkpoint */
destroy_fsync_dnodes(&dir_list, err);
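
adjust_por_ra_blocks() implements a multiplicative-increase/decrease readahead window for recovery: while fsync'd node blocks sit physically adjacent (blkaddr + 1 == next_blkaddr) the window doubles toward RECOVERY_MAX_RA_BLOCKS, and a jump that lands mid-segment halves it toward RECOVERY_MIN_RA_BLOCKS, so sequential warm-node logs get wide readahead while scattered ones stop wasting meta reads. The same policy in isolation (bounds assumed for illustration):

#define RA_MIN 1U	/* assumed stand-ins for RECOVERY_MIN/MAX_RA_BLOCKS */
#define RA_MAX 1024U

static unsigned int adjust_ra(unsigned int ra, unsigned int cur,
			      unsigned int next, unsigned int blks_per_seg)
{
	if (cur + 1 == next)			/* contiguous chain: widen */
		ra = (ra * 2 > RA_MAX) ? RA_MAX : ra * 2;
	else if (next % blks_per_seg)		/* mid-segment jump: narrow */
		ra = (ra / 2 < RA_MIN) ? RA_MIN : ra / 2;
	return ra;
}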
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 1dabc8244083..012524db7437 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -471,7 +471,7 @@ int f2fs_commit_inmem_pages(struct inode *inode)
f2fs_balance_fs(sbi, true);
- down_write(&fi->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&fi->i_gc_rwsem[WRITE]);
f2fs_lock_op(sbi);
set_inode_flag(inode, FI_ATOMIC_COMMIT);
@@ -483,7 +483,7 @@ int f2fs_commit_inmem_pages(struct inode *inode)
clear_inode_flag(inode, FI_ATOMIC_COMMIT);
f2fs_unlock_op(sbi);
- up_write(&fi->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&fi->i_gc_rwsem[WRITE]);
return err;
}
@@ -521,7 +521,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need)
io_schedule();
finish_wait(&sbi->gc_thread->fggc_wq, &wait);
} else {
- down_write(&sbi->gc_lock);
+ f2fs_down_write(&sbi->gc_lock);
f2fs_gc(sbi, false, false, false, NULL_SEGNO);
}
}
@@ -529,7 +529,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need)
static inline bool excess_dirty_threshold(struct f2fs_sb_info *sbi)
{
- int factor = rwsem_is_locked(&sbi->cp_rwsem) ? 3 : 2;
+ int factor = f2fs_rwsem_is_locked(&sbi->cp_rwsem) ? 3 : 2;
unsigned int dents = get_pages(sbi, F2FS_DIRTY_DENTS);
unsigned int qdata = get_pages(sbi, F2FS_DIRTY_QDATA);
unsigned int nodes = get_pages(sbi, F2FS_DIRTY_NODES);
@@ -570,7 +570,7 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg)
/* there is background inflight IO or foreground operation recently */
if (is_inflight_io(sbi, REQ_TIME) ||
- (!f2fs_time_over(sbi, REQ_TIME) && rwsem_is_locked(&sbi->cp_rwsem)))
+ (!f2fs_time_over(sbi, REQ_TIME) && f2fs_rwsem_is_locked(&sbi->cp_rwsem)))
return;
/* exceed periodical checkpoint timeout threshold */
@@ -1156,14 +1156,14 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi,
dpolicy->ordered = false;
dpolicy->granularity = granularity;
- dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST;
+ dpolicy->max_requests = dcc->max_discard_request;
dpolicy->io_aware_gran = MAX_PLIST_NUM;
dpolicy->timeout = false;
if (discard_type == DPOLICY_BG) {
- dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME;
- dpolicy->mid_interval = DEF_MID_DISCARD_ISSUE_TIME;
- dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME;
+ dpolicy->min_interval = dcc->min_discard_issue_time;
+ dpolicy->mid_interval = dcc->mid_discard_issue_time;
+ dpolicy->max_interval = dcc->max_discard_issue_time;
dpolicy->io_aware = true;
dpolicy->sync = false;
dpolicy->ordered = true;
@@ -1171,12 +1171,12 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi,
dpolicy->granularity = 1;
if (atomic_read(&dcc->discard_cmd_cnt))
dpolicy->max_interval =
- DEF_MIN_DISCARD_ISSUE_TIME;
+ dcc->min_discard_issue_time;
}
} else if (discard_type == DPOLICY_FORCE) {
- dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME;
- dpolicy->mid_interval = DEF_MID_DISCARD_ISSUE_TIME;
- dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME;
+ dpolicy->min_interval = dcc->min_discard_issue_time;
+ dpolicy->mid_interval = dcc->mid_discard_issue_time;
+ dpolicy->max_interval = dcc->max_discard_issue_time;
dpolicy->io_aware = false;
} else if (discard_type == DPOLICY_FSTRIM) {
dpolicy->io_aware = false;
@@ -1781,7 +1781,7 @@ static int issue_discard_thread(void *data)
struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
wait_queue_head_t *q = &dcc->discard_wait_queue;
struct discard_policy dpolicy;
- unsigned int wait_ms = DEF_MIN_DISCARD_ISSUE_TIME;
+ unsigned int wait_ms = dcc->min_discard_issue_time;
int issued;
set_freezable();
@@ -2180,6 +2180,10 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi)
atomic_set(&dcc->discard_cmd_cnt, 0);
dcc->nr_discards = 0;
dcc->max_discards = MAIN_SEGS(sbi) << sbi->log_blocks_per_seg;
+ dcc->max_discard_request = DEF_MAX_DISCARD_REQUEST;
+ dcc->min_discard_issue_time = DEF_MIN_DISCARD_ISSUE_TIME;
+ dcc->mid_discard_issue_time = DEF_MID_DISCARD_ISSUE_TIME;
+ dcc->max_discard_issue_time = DEF_MAX_DISCARD_ISSUE_TIME;
dcc->undiscard_blks = 0;
dcc->next_pos = 0;
dcc->root = RB_ROOT_CACHED;
@@ -2821,7 +2825,7 @@ static void __f2fs_init_atgc_curseg(struct f2fs_sb_info *sbi)
if (!sbi->am.atgc_enabled)
return;
- down_read(&SM_I(sbi)->curseg_lock);
+ f2fs_down_read(&SM_I(sbi)->curseg_lock);
mutex_lock(&curseg->curseg_mutex);
down_write(&SIT_I(sbi)->sentry_lock);
@@ -2831,7 +2835,7 @@ static void __f2fs_init_atgc_curseg(struct f2fs_sb_info *sbi)
up_write(&SIT_I(sbi)->sentry_lock);
mutex_unlock(&curseg->curseg_mutex);
- up_read(&SM_I(sbi)->curseg_lock);
+ f2fs_up_read(&SM_I(sbi)->curseg_lock);
}
void f2fs_init_inmem_curseg(struct f2fs_sb_info *sbi)
@@ -2982,7 +2986,7 @@ void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type,
struct curseg_info *curseg = CURSEG_I(sbi, type);
unsigned int segno;
- down_read(&SM_I(sbi)->curseg_lock);
+ f2fs_down_read(&SM_I(sbi)->curseg_lock);
mutex_lock(&curseg->curseg_mutex);
down_write(&SIT_I(sbi)->sentry_lock);
@@ -3006,7 +3010,7 @@ unlock:
type, segno, curseg->segno);
mutex_unlock(&curseg->curseg_mutex);
- up_read(&SM_I(sbi)->curseg_lock);
+ f2fs_up_read(&SM_I(sbi)->curseg_lock);
}
static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type,
@@ -3038,23 +3042,23 @@ static void __allocate_new_section(struct f2fs_sb_info *sbi,
void f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type, bool force)
{
- down_read(&SM_I(sbi)->curseg_lock);
+ f2fs_down_read(&SM_I(sbi)->curseg_lock);
down_write(&SIT_I(sbi)->sentry_lock);
__allocate_new_section(sbi, type, force);
up_write(&SIT_I(sbi)->sentry_lock);
- up_read(&SM_I(sbi)->curseg_lock);
+ f2fs_up_read(&SM_I(sbi)->curseg_lock);
}
void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi)
{
int i;
- down_read(&SM_I(sbi)->curseg_lock);
+ f2fs_down_read(&SM_I(sbi)->curseg_lock);
down_write(&SIT_I(sbi)->sentry_lock);
for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++)
__allocate_new_segment(sbi, i, false, false);
up_write(&SIT_I(sbi)->sentry_lock);
- up_read(&SM_I(sbi)->curseg_lock);
+ f2fs_up_read(&SM_I(sbi)->curseg_lock);
}
static const struct segment_allocation default_salloc_ops = {
@@ -3192,9 +3196,9 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
if (sbi->discard_blks == 0)
goto out;
- down_write(&sbi->gc_lock);
+ f2fs_down_write(&sbi->gc_lock);
err = f2fs_write_checkpoint(sbi, &cpc);
- up_write(&sbi->gc_lock);
+ f2fs_up_write(&sbi->gc_lock);
if (err)
goto out;
@@ -3431,7 +3435,7 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
bool from_gc = (type == CURSEG_ALL_DATA_ATGC);
struct seg_entry *se = NULL;
- down_read(&SM_I(sbi)->curseg_lock);
+ f2fs_down_read(&SM_I(sbi)->curseg_lock);
mutex_lock(&curseg->curseg_mutex);
down_write(&sit_i->sentry_lock);
@@ -3514,7 +3518,7 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
mutex_unlock(&curseg->curseg_mutex);
- up_read(&SM_I(sbi)->curseg_lock);
+ f2fs_up_read(&SM_I(sbi)->curseg_lock);
}
void f2fs_update_device_state(struct f2fs_sb_info *sbi, nid_t ino,
@@ -3550,7 +3554,7 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio)
bool keep_order = (f2fs_lfs_mode(fio->sbi) && type == CURSEG_COLD_DATA);
if (keep_order)
- down_read(&fio->sbi->io_order_lock);
+ f2fs_down_read(&fio->sbi->io_order_lock);
reallocate:
f2fs_allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr,
&fio->new_blkaddr, sum, type, fio);
@@ -3570,7 +3574,7 @@ reallocate:
f2fs_update_device_state(fio->sbi, fio->ino, fio->new_blkaddr, 1);
if (keep_order)
- up_read(&fio->sbi->io_order_lock);
+ f2fs_up_read(&fio->sbi->io_order_lock);
}
void f2fs_do_write_meta_page(struct f2fs_sb_info *sbi, struct page *page,
@@ -3705,7 +3709,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
se = get_seg_entry(sbi, segno);
type = se->type;
- down_write(&SM_I(sbi)->curseg_lock);
+ f2fs_down_write(&SM_I(sbi)->curseg_lock);
if (!recover_curseg) {
/* for recovery flow */
@@ -3774,7 +3778,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
up_write(&sit_i->sentry_lock);
mutex_unlock(&curseg->curseg_mutex);
- up_write(&SM_I(sbi)->curseg_lock);
+ f2fs_up_write(&SM_I(sbi)->curseg_lock);
}
void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn,
@@ -4789,6 +4793,13 @@ static int sanity_check_curseg(struct f2fs_sb_info *sbi)
sanity_check_seg_type(sbi, curseg->seg_type);
+ if (curseg->alloc_type != LFS && curseg->alloc_type != SSR) {
+ f2fs_err(sbi,
+ "Current segment has invalid alloc_type:%d",
+ curseg->alloc_type);
+ return -EFSCORRUPTED;
+ }
+
if (f2fs_test_bit(blkofs, se->cur_valid_map))
goto out;
@@ -5258,7 +5269,7 @@ int f2fs_build_segment_manager(struct f2fs_sb_info *sbi)
INIT_LIST_HEAD(&sm_info->sit_entry_set);
- init_rwsem(&sm_info->curseg_lock);
+ init_f2fs_rwsem(&sm_info->curseg_lock);
if (!f2fs_readonly(sbi->sb)) {
err = f2fs_create_flush_cmd_control(sbi);
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 0291cd55cf09..5c94caf0c0a1 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -651,7 +651,9 @@ static inline int utilization(struct f2fs_sb_info *sbi)
* pages over min_fsync_blocks. (=default option)
* F2FS_IPU_ASYNC - do IPU given by asynchronous write requests.
* F2FS_IPU_NOCACHE - disable IPU bio cache.
- * F2FS_IPUT_DISABLE - disable IPU. (=default option in LFS mode)
+ * F2FS_IPU_HONOR_OPU_WRITE - use OPU write in preference to IPU write if
+ * the inode has the FI_OPU_WRITE flag set.
+ * F2FS_IPU_DISABLE - disable IPU. (=default option in LFS mode)
*/
#define DEF_MIN_IPU_UTIL 70
#define DEF_MIN_FSYNC_BLOCKS 8
@@ -667,6 +669,7 @@ enum {
F2FS_IPU_FSYNC,
F2FS_IPU_ASYNC,
F2FS_IPU_NOCACHE,
+ F2FS_IPU_HONOR_OPU_WRITE,
};
static inline unsigned int curseg_segno(struct f2fs_sb_info *sbi,
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index baefd398ec1a..769d18a2041f 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -1355,16 +1355,16 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
/* Initialize f2fs-specific inode info */
atomic_set(&fi->dirty_pages, 0);
atomic_set(&fi->i_compr_blocks, 0);
- init_rwsem(&fi->i_sem);
+ init_f2fs_rwsem(&fi->i_sem);
spin_lock_init(&fi->i_size_lock);
INIT_LIST_HEAD(&fi->dirty_list);
INIT_LIST_HEAD(&fi->gdirty_list);
INIT_LIST_HEAD(&fi->inmem_ilist);
INIT_LIST_HEAD(&fi->inmem_pages);
mutex_init(&fi->inmem_lock);
- init_rwsem(&fi->i_gc_rwsem[READ]);
- init_rwsem(&fi->i_gc_rwsem[WRITE]);
- init_rwsem(&fi->i_xattr_sem);
+ init_f2fs_rwsem(&fi->i_gc_rwsem[READ]);
+ init_f2fs_rwsem(&fi->i_gc_rwsem[WRITE]);
+ init_f2fs_rwsem(&fi->i_xattr_sem);
/* Will be used by directory only */
fi->i_dir_level = F2FS_SB(sb)->dir_level;
@@ -1501,8 +1501,9 @@ static void f2fs_free_inode(struct inode *inode)
static void destroy_percpu_info(struct f2fs_sb_info *sbi)
{
- percpu_counter_destroy(&sbi->alloc_valid_block_count);
percpu_counter_destroy(&sbi->total_valid_inode_count);
+ percpu_counter_destroy(&sbi->rf_node_block_count);
+ percpu_counter_destroy(&sbi->alloc_valid_block_count);
}
static void destroy_device_list(struct f2fs_sb_info *sbi)
@@ -1662,11 +1663,15 @@ static int f2fs_freeze(struct super_block *sb)
/* ensure no checkpoint required */
if (!llist_empty(&F2FS_SB(sb)->cprc_info.issue_list))
return -EINVAL;
+
+ /* to avoid deadlock on f2fs_evict_inode->SB_FREEZE_FS */
+ set_sbi_flag(F2FS_SB(sb), SBI_IS_FREEZING);
return 0;
}
static int f2fs_unfreeze(struct super_block *sb)
{
+ clear_sbi_flag(F2FS_SB(sb), SBI_IS_FREEZING);
return 0;
}
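The SBI_IS_FREEZING flag set in f2fs_freeze() and cleared in f2fs_unfreeze() above exists so inode eviction can tell that the filesystem is frozen and avoid the SB_FREEZE_FS deadlock named in the comment. The consumer is not in this excerpt; under that assumption, the check in f2fs_evict_inode() would look roughly like:

	/* assumed consumer: skip the intwrite freeze protection while
	 * frozen, since blocking on SB_FREEZE_FS here would deadlock */
	if (!is_sbi_flag_set(sbi, SBI_IS_FREEZING))
		sb_start_intwrite(inode->i_sb);
	...
	if (!is_sbi_flag_set(sbi, SBI_IS_FREEZING))
		sb_end_intwrite(inode->i_sb);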
@@ -2075,6 +2080,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi)
{
unsigned int s_flags = sbi->sb->s_flags;
struct cp_control cpc;
+ unsigned int gc_mode;
int err = 0;
int ret;
block_t unusable;
@@ -2087,8 +2093,11 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi)
f2fs_update_time(sbi, DISABLE_TIME);
+ gc_mode = sbi->gc_mode;
+ sbi->gc_mode = GC_URGENT_HIGH;
+
while (!f2fs_time_over(sbi, DISABLE_TIME)) {
- down_write(&sbi->gc_lock);
+ f2fs_down_write(&sbi->gc_lock);
err = f2fs_gc(sbi, true, false, false, NULL_SEGNO);
if (err == -ENODATA) {
err = 0;
@@ -2110,7 +2119,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi)
goto restore_flag;
}
- down_write(&sbi->gc_lock);
+ f2fs_down_write(&sbi->gc_lock);
cpc.reason = CP_PAUSE;
set_sbi_flag(sbi, SBI_CP_DISABLED);
err = f2fs_write_checkpoint(sbi, &cpc);
@@ -2122,8 +2131,9 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi)
spin_unlock(&sbi->stat_lock);
out_unlock:
- up_write(&sbi->gc_lock);
+ f2fs_up_write(&sbi->gc_lock);
restore_flag:
+ sbi->gc_mode = gc_mode;
sbi->sb->s_flags = s_flags; /* Restore SB_RDONLY status */
return err;
}
@@ -2142,12 +2152,12 @@ static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi)
if (unlikely(retry < 0))
f2fs_warn(sbi, "checkpoint=enable has some unwritten data.");
- down_write(&sbi->gc_lock);
+ f2fs_down_write(&sbi->gc_lock);
f2fs_dirty_to_prefree(sbi);
clear_sbi_flag(sbi, SBI_CP_DISABLED);
set_sbi_flag(sbi, SBI_IS_DIRTY);
- up_write(&sbi->gc_lock);
+ f2fs_up_write(&sbi->gc_lock);
f2fs_sync_fs(sbi->sb, 1);
}
@@ -2688,7 +2698,7 @@ int f2fs_quota_sync(struct super_block *sb, int type)
struct f2fs_sb_info *sbi = F2FS_SB(sb);
struct quota_info *dqopt = sb_dqopt(sb);
int cnt;
- int ret;
+ int ret = 0;
/*
* Now when everything is written we can discard the pagecache so
@@ -2699,26 +2709,26 @@ int f2fs_quota_sync(struct super_block *sb, int type)
if (type != -1 && cnt != type)
continue;
- if (!sb_has_quota_active(sb, type))
- return 0;
+ if (!sb_has_quota_active(sb, cnt))
+ continue;
inode_lock(dqopt->files[cnt]);
/*
* do_quotactl
* f2fs_quota_sync
- * down_read(quota_sem)
+ * f2fs_down_read(quota_sem)
* dquot_writeback_dquots()
* f2fs_dquot_commit
* block_operation
- * down_read(quota_sem)
+ * f2fs_down_read(quota_sem)
*/
f2fs_lock_op(sbi);
- down_read(&sbi->quota_sem);
+ f2fs_down_read(&sbi->quota_sem);
ret = f2fs_quota_sync_file(sbi, cnt);
- up_read(&sbi->quota_sem);
+ f2fs_up_read(&sbi->quota_sem);
f2fs_unlock_op(sbi);
inode_unlock(dqopt->files[cnt]);
@@ -2843,11 +2853,11 @@ static int f2fs_dquot_commit(struct dquot *dquot)
struct f2fs_sb_info *sbi = F2FS_SB(dquot->dq_sb);
int ret;
- down_read_nested(&sbi->quota_sem, SINGLE_DEPTH_NESTING);
+ f2fs_down_read_nested(&sbi->quota_sem, SINGLE_DEPTH_NESTING);
ret = dquot_commit(dquot);
if (ret < 0)
set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR);
- up_read(&sbi->quota_sem);
+ f2fs_up_read(&sbi->quota_sem);
return ret;
}
@@ -2856,11 +2866,11 @@ static int f2fs_dquot_acquire(struct dquot *dquot)
struct f2fs_sb_info *sbi = F2FS_SB(dquot->dq_sb);
int ret;
- down_read(&sbi->quota_sem);
+ f2fs_down_read(&sbi->quota_sem);
ret = dquot_acquire(dquot);
if (ret < 0)
set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR);
- up_read(&sbi->quota_sem);
+ f2fs_up_read(&sbi->quota_sem);
return ret;
}
@@ -3574,6 +3584,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
F2FS_NODE_INO(sbi) = le32_to_cpu(raw_super->node_ino);
F2FS_META_INO(sbi) = le32_to_cpu(raw_super->meta_ino);
sbi->cur_victim_sec = NULL_SECNO;
+ sbi->gc_mode = GC_NORMAL;
sbi->next_victim_seg[BG_GC] = NULL_SEGNO;
sbi->next_victim_seg[FG_GC] = NULL_SEGNO;
sbi->max_victim_search = DEF_MAX_VICTIM_SEARCH;
@@ -3601,14 +3612,14 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
INIT_LIST_HEAD(&sbi->s_list);
mutex_init(&sbi->umount_mutex);
- init_rwsem(&sbi->io_order_lock);
+ init_f2fs_rwsem(&sbi->io_order_lock);
spin_lock_init(&sbi->cp_lock);
sbi->dirty_device = 0;
spin_lock_init(&sbi->dev_lock);
- init_rwsem(&sbi->sb_lock);
- init_rwsem(&sbi->pin_sem);
+ init_f2fs_rwsem(&sbi->sb_lock);
+ init_f2fs_rwsem(&sbi->pin_sem);
}
static int init_percpu_info(struct f2fs_sb_info *sbi)
@@ -3619,11 +3630,20 @@ static int init_percpu_info(struct f2fs_sb_info *sbi)
if (err)
return err;
+ err = percpu_counter_init(&sbi->rf_node_block_count, 0, GFP_KERNEL);
+ if (err)
+ goto err_valid_block;
+
err = percpu_counter_init(&sbi->total_valid_inode_count, 0,
GFP_KERNEL);
if (err)
- percpu_counter_destroy(&sbi->alloc_valid_block_count);
+ goto err_node_block;
+ return 0;
+err_node_block:
+ percpu_counter_destroy(&sbi->rf_node_block_count);
+err_valid_block:
+ percpu_counter_destroy(&sbi->alloc_valid_block_count);
return err;
}
@@ -3957,7 +3977,8 @@ static void f2fs_tuning_parameters(struct f2fs_sb_info *sbi)
F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_REUSE;
if (f2fs_block_unit_discard(sbi))
sm_i->dcc_info->discard_granularity = 1;
- sm_i->ipu_policy = 1 << F2FS_IPU_FORCE;
+ sm_i->ipu_policy = 1 << F2FS_IPU_FORCE |
+ 1 << F2FS_IPU_HONOR_OPU_WRITE;
}
sbi->readdir_ra = 1;
@@ -4067,11 +4088,11 @@ try_onemore:
/* init f2fs-specific super block info */
sbi->valid_super_block = valid_super_block;
- init_rwsem(&sbi->gc_lock);
+ init_f2fs_rwsem(&sbi->gc_lock);
mutex_init(&sbi->writepages);
- init_rwsem(&sbi->cp_global_sem);
- init_rwsem(&sbi->node_write);
- init_rwsem(&sbi->node_change);
+ init_f2fs_rwsem(&sbi->cp_global_sem);
+ init_f2fs_rwsem(&sbi->node_write);
+ init_f2fs_rwsem(&sbi->node_change);
/* disallow all the data/node/meta page writes */
set_sbi_flag(sbi, SBI_POR_DOING);
@@ -4092,18 +4113,18 @@ try_onemore:
}
for (j = HOT; j < n; j++) {
- init_rwsem(&sbi->write_io[i][j].io_rwsem);
+ init_f2fs_rwsem(&sbi->write_io[i][j].io_rwsem);
sbi->write_io[i][j].sbi = sbi;
sbi->write_io[i][j].bio = NULL;
spin_lock_init(&sbi->write_io[i][j].io_lock);
INIT_LIST_HEAD(&sbi->write_io[i][j].io_list);
INIT_LIST_HEAD(&sbi->write_io[i][j].bio_list);
- init_rwsem(&sbi->write_io[i][j].bio_list_lock);
+ init_f2fs_rwsem(&sbi->write_io[i][j].bio_list_lock);
}
}
- init_rwsem(&sbi->cp_rwsem);
- init_rwsem(&sbi->quota_sem);
+ init_f2fs_rwsem(&sbi->cp_rwsem);
+ init_f2fs_rwsem(&sbi->quota_sem);
init_waitqueue_head(&sbi->cp_wait);
init_sb_info(sbi);
@@ -4528,7 +4549,7 @@ static struct file_system_type f2fs_fs_type = {
.name = "f2fs",
.mount = f2fs_mount,
.kill_sb = kill_f2fs_super,
- .fs_flags = FS_REQUIRES_DEV,
+ .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
};
MODULE_ALIAS_FS("f2fs");
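Adding FS_ALLOW_IDMAP opts f2fs into idmapped mounts. For orientation, a minimal userspace sketch of what that permits, using raw syscalls since not all libcs wrap them (the mount paths and the user-namespace fd are placeholders, and error handling is trimmed):

#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/mount.h>

/* Clone the f2fs mount, attach the ID mapping from userns_fd to the
 * clone, and attach the idmapped view at a second mount point. */
static int idmap_f2fs(int userns_fd)
{
	struct mount_attr attr = {
		.attr_set = MOUNT_ATTR_IDMAP,
		.userns_fd = userns_fd,
	};
	int tree = syscall(__NR_open_tree, AT_FDCWD, "/mnt/f2fs",
			   OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC);

	if (tree < 0)
		return -1;
	if (syscall(__NR_mount_setattr, tree, "", AT_EMPTY_PATH,
		    &attr, sizeof(attr)) < 0)
		return -1;
	return syscall(__NR_move_mount, tree, "", AT_FDCWD,
		       "/mnt/f2fs-idmapped", MOVE_MOUNT_F_EMPTY_PATH);
}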
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index 8ac506671245..4c50aedd5144 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -41,6 +41,16 @@ enum {
ATGC_INFO, /* struct atgc_management */
};
+static const char *gc_mode_names[MAX_GC_MODE] = {
+ "GC_NORMAL",
+ "GC_IDLE_CB",
+ "GC_IDLE_GREEDY",
+ "GC_IDLE_AT",
+ "GC_URGENT_HIGH",
+ "GC_URGENT_LOW",
+ "GC_URGENT_MID"
+};
+
struct f2fs_attr {
struct attribute attr;
ssize_t (*show)(struct f2fs_attr *, struct f2fs_sb_info *, char *);
@@ -316,8 +326,13 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a,
return sysfs_emit(buf, "%u\n", sbi->compr_new_inode);
#endif
+ if (!strcmp(a->attr.name, "gc_urgent"))
+ return sysfs_emit(buf, "%s\n",
+ gc_mode_names[sbi->gc_mode]);
+
if (!strcmp(a->attr.name, "gc_segment_mode"))
- return sysfs_emit(buf, "%u\n", sbi->gc_segment_mode);
+ return sysfs_emit(buf, "%s\n",
+ gc_mode_names[sbi->gc_segment_mode]);
if (!strcmp(a->attr.name, "gc_reclaimed_segments")) {
return sysfs_emit(buf, "%u\n",
@@ -363,7 +378,7 @@ static ssize_t __sbi_store(struct f2fs_attr *a,
if (!strlen(name) || strlen(name) >= F2FS_EXTENSION_LEN)
return -EINVAL;
- down_write(&sbi->sb_lock);
+ f2fs_down_write(&sbi->sb_lock);
ret = f2fs_update_extension_list(sbi, name, hot, set);
if (ret)
@@ -373,7 +388,7 @@ static ssize_t __sbi_store(struct f2fs_attr *a,
if (ret)
f2fs_update_extension_list(sbi, name, hot, !set);
out:
- up_write(&sbi->sb_lock);
+ f2fs_up_write(&sbi->sb_lock);
return ret ? ret : count;
}
@@ -468,6 +483,13 @@ out:
}
} else if (t == 2) {
sbi->gc_mode = GC_URGENT_LOW;
+ } else if (t == 3) {
+ sbi->gc_mode = GC_URGENT_MID;
+ if (sbi->gc_thread) {
+ sbi->gc_thread->gc_wake = 1;
+ wake_up_interruptible_all(
+ &sbi->gc_thread->gc_wait_queue_head);
+ }
} else {
return -EINVAL;
}
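For reference, the full gc_urgent store mapping implied by this hunk (the t == 0 and t == 1 arms sit outside the shown context and are assumed):

/* gc_urgent:  0 -> GC_NORMAL       (assumed, not shown)
 *             1 -> GC_URGENT_HIGH  (assumed, not shown)
 *             2 -> GC_URGENT_LOW
 *             3 -> GC_URGENT_MID   (new; also wakes the GC thread)
 */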
@@ -481,7 +503,7 @@ out:
} else if (t == GC_IDLE_AT) {
if (!sbi->am.atgc_enabled)
return -EINVAL;
- sbi->gc_mode = GC_AT;
+ sbi->gc_mode = GC_IDLE_AT;
} else {
sbi->gc_mode = GC_NORMAL;
}
@@ -716,6 +738,10 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_idle, gc_mode);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_urgent, gc_mode);
F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments);
F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_small_discards, max_discards);
+F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_discard_request, max_discard_request);
+F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, min_discard_issue_time, min_discard_issue_time);
+F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, mid_discard_issue_time, mid_discard_issue_time);
+F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_discard_issue_time, max_discard_issue_time);
F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_granularity, discard_granularity);
F2FS_RW_ATTR(RESERVED_BLOCKS, f2fs_sb_info, reserved_blocks, reserved_blocks);
F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections);
@@ -728,6 +754,7 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ssr_sections, min_ssr_sections);
F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh);
F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages);
F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, dirty_nats_ratio, dirty_nats_ratio);
+F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, max_roll_forward_node_blocks, max_rf_node_blocks);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, migration_granularity, migration_granularity);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level);
@@ -832,6 +859,10 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(reclaim_segments),
ATTR_LIST(main_blkaddr),
ATTR_LIST(max_small_discards),
+ ATTR_LIST(max_discard_request),
+ ATTR_LIST(min_discard_issue_time),
+ ATTR_LIST(mid_discard_issue_time),
+ ATTR_LIST(max_discard_issue_time),
ATTR_LIST(discard_granularity),
ATTR_LIST(pending_discard),
ATTR_LIST(batched_trim_sections),
@@ -847,6 +878,7 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(ram_thresh),
ATTR_LIST(ra_nid_pages),
ATTR_LIST(dirty_nats_ratio),
+ ATTR_LIST(max_roll_forward_node_blocks),
ATTR_LIST(cp_interval),
ATTR_LIST(idle_interval),
ATTR_LIST(discard_idle_interval),
diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c
index fe5acdccaae1..3d793202cc9f 100644
--- a/fs/f2fs/verity.c
+++ b/fs/f2fs/verity.c
@@ -208,7 +208,7 @@ cleanup:
* from re-instantiating cached pages we are truncating (since unlike
* normal file accesses, garbage collection isn't limited by i_size).
*/
- down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
truncate_inode_pages(inode->i_mapping, inode->i_size);
err2 = f2fs_truncate(inode);
if (err2) {
@@ -216,7 +216,7 @@ cleanup:
err2);
set_sbi_flag(sbi, SBI_NEED_FSCK);
}
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
clear_inode_flag(inode, FI_VERITY_IN_PROGRESS);
return err ?: err2;
}
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 8e5cd9c916ff..c76c15086e5f 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -525,10 +525,10 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name,
if (len > F2FS_NAME_LEN)
return -ERANGE;
- down_read(&F2FS_I(inode)->i_xattr_sem);
+ f2fs_down_read(&F2FS_I(inode)->i_xattr_sem);
error = lookup_all_xattrs(inode, ipage, index, len, name,
&entry, &base_addr, &base_size, &is_inline);
- up_read(&F2FS_I(inode)->i_xattr_sem);
+ f2fs_up_read(&F2FS_I(inode)->i_xattr_sem);
if (error)
return error;
@@ -562,9 +562,9 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
int error;
size_t rest = buffer_size;
- down_read(&F2FS_I(inode)->i_xattr_sem);
+ f2fs_down_read(&F2FS_I(inode)->i_xattr_sem);
error = read_all_xattrs(inode, NULL, &base_addr);
- up_read(&F2FS_I(inode)->i_xattr_sem);
+ f2fs_up_read(&F2FS_I(inode)->i_xattr_sem);
if (error)
return error;
@@ -786,9 +786,9 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name,
f2fs_balance_fs(sbi, true);
f2fs_lock_op(sbi);
- down_write(&F2FS_I(inode)->i_xattr_sem);
+ f2fs_down_write(&F2FS_I(inode)->i_xattr_sem);
err = __f2fs_setxattr(inode, index, name, value, size, ipage, flags);
- up_write(&F2FS_I(inode)->i_xattr_sem);
+ f2fs_up_write(&F2FS_I(inode)->i_xattr_sem);
f2fs_unlock_op(sbi);
f2fs_update_time(sbi, REQ_TIME);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index f8d7fe6db989..33d54c9fbefc 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1903,8 +1903,7 @@ static long writeback_sb_inodes(struct super_block *sb,
* unplug, so get our IOs out the door before we
* give up the CPU.
*/
- if (current->plug)
- blk_flush_plug(current->plug, false);
+ blk_flush_plug(current->plug, false);
cond_resched();
}
@@ -2301,8 +2300,7 @@ void wakeup_flusher_threads(enum wb_reason reason)
/*
* If we are expecting writeback progress we must submit plugged IO.
*/
- if (blk_needs_flush_plug(current))
- blk_flush_plug(current->plug, true);
+ blk_flush_plug(current->plug, true);
rcu_read_lock();
list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index cd54a529460d..592730fd6e42 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -941,7 +941,17 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep,
while (count) {
if (cs->write && cs->pipebufs && page) {
- return fuse_ref_page(cs, page, offset, count);
+ /*
+ * Can't control lifetime of pipe buffers, so always
+ * copy user pages.
+ */
+ if (cs->req->args->user_pages) {
+ err = fuse_copy_fill(cs);
+ if (err)
+ return err;
+ } else {
+ return fuse_ref_page(cs, page, offset, count);
+ }
} else if (!cs->len) {
if (cs->move_pages && page &&
offset == 0 && count == PAGE_SIZE) {
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 829094451774..0fc150c1c50b 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1413,6 +1413,7 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii,
(PAGE_SIZE - ret) & (PAGE_SIZE - 1);
}
+ ap->args.user_pages = true;
if (write)
ap->args.in_pages = true;
else
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index e8e59fbdefeb..eac4984cc753 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -256,6 +256,7 @@ struct fuse_args {
bool nocreds:1;
bool in_pages:1;
bool out_pages:1;
+ bool user_pages:1;
bool out_argvar:1;
bool page_zeroing:1;
bool page_replace:1;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index ee846ce371d8..9ee36aa73251 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -23,6 +23,7 @@
#include <linux/exportfs.h>
#include <linux/posix_acl.h>
#include <linux/pid_namespace.h>
+#include <uapi/linux/magic.h>
MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
MODULE_DESCRIPTION("Filesystem in Userspace");
@@ -50,8 +51,6 @@ MODULE_PARM_DESC(max_user_congthresh,
"Global limit for the maximum congestion threshold an "
"unprivileged user can set");
-#define FUSE_SUPER_MAGIC 0x65735546
-
#define FUSE_DEFAULT_BLKSIZE 512
/** Maximum number of outstanding background requests */
diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c
index fbc09dab1f85..df58966bc874 100644
--- a/fs/fuse/ioctl.c
+++ b/fs/fuse/ioctl.c
@@ -394,9 +394,12 @@ static int fuse_priv_ioctl(struct inode *inode, struct fuse_file *ff,
args.out_args[1].value = ptr;
err = fuse_simple_request(fm, &args);
- if (!err && outarg.flags & FUSE_IOCTL_RETRY)
- err = -EIO;
-
+ if (!err) {
+ if (outarg.result < 0)
+ err = outarg.result;
+ else if (outarg.flags & FUSE_IOCTL_RETRY)
+ err = -EIO;
+ }
return err;
}
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index ca0bb3a73912..4ae1eefae616 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -265,10 +265,9 @@ static struct bio *gfs2_log_alloc_bio(struct gfs2_sbd *sdp, u64 blkno,
bio_end_io_t *end_io)
{
struct super_block *sb = sdp->sd_vfs;
- struct bio *bio = bio_alloc(GFP_NOIO, BIO_MAX_VECS);
+ struct bio *bio = bio_alloc(sb->s_bdev, BIO_MAX_VECS, 0, GFP_NOIO);
bio->bi_iter.bi_sector = blkno << sdp->sd_fsb2bb_shift;
- bio_set_dev(bio, sb->s_bdev);
bio->bi_end_io = end_io;
bio->bi_private = sdp;
@@ -489,10 +488,9 @@ static struct bio *gfs2_chain_bio(struct bio *prev, unsigned int nr_iovecs)
{
struct bio *new;
- new = bio_alloc(GFP_NOIO, nr_iovecs);
- bio_copy_dev(new, prev);
+ new = bio_alloc(prev->bi_bdev, nr_iovecs, prev->bi_opf, GFP_NOIO);
+ bio_clone_blkg_association(new, prev);
new->bi_iter.bi_sector = bio_end_sector(prev);
- new->bi_opf = prev->bi_opf;
new->bi_write_hint = prev->bi_write_hint;
bio_chain(new, prev);
submit_bio(prev);
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 72d30a682ece..a580b90b7522 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -222,9 +222,8 @@ static void gfs2_submit_bhs(int op, int op_flags, struct buffer_head *bhs[],
struct buffer_head *bh = *bhs;
struct bio *bio;
- bio = bio_alloc(GFP_NOIO, num);
+ bio = bio_alloc(bh->b_bdev, num, op | op_flags, GFP_NOIO);
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
- bio_set_dev(bio, bh->b_bdev);
while (num > 0) {
bh = *bhs;
if (!bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh))) {
@@ -235,7 +234,6 @@ static void gfs2_submit_bhs(int op, int op_flags, struct buffer_head *bhs[],
num--;
}
bio->bi_end_io = gfs2_meta_read_endio;
- bio_set_op_attrs(bio, op, op_flags);
submit_bio(bio);
}
}
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 7f8410d8fdc1..c9b423c874a3 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -251,14 +251,12 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent)
ClearPageDirty(page);
lock_page(page);
- bio = bio_alloc(GFP_NOFS, 1);
+ bio = bio_alloc(sb->s_bdev, 1, REQ_OP_READ | REQ_META, GFP_NOFS);
bio->bi_iter.bi_sector = sector * (sb->s_blocksize >> 9);
- bio_set_dev(bio, sb->s_bdev);
bio_add_page(bio, page, PAGE_SIZE, 0);
bio->bi_end_io = end_bio_io_page;
bio->bi_private = page;
- bio_set_op_attrs(bio, REQ_OP_READ, REQ_META);
submit_bio(bio);
wait_on_page_locked(page);
bio_put(bio);
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index a6002b2d146d..d87ea98cf535 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -15,7 +15,7 @@
#include <linux/kobject.h>
#include <linux/uaccess.h>
#include <linux/gfs2_ondisk.h>
-#include <linux/genhd.h>
+#include <linux/blkdev.h>
#include "gfs2.h"
#include "incore.h"
diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c
index 5beb82652435..8082eb01127c 100644
--- a/fs/hfs/mdb.c
+++ b/fs/hfs/mdb.c
@@ -9,7 +9,7 @@
*/
#include <linux/cdrom.h>
-#include <linux/genhd.h>
+#include <linux/blkdev.h>
#include <linux/nls.h>
#include <linux/slab.h>
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index 51ae6f1eb4a5..0b8ad6586df5 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -12,7 +12,6 @@
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/cdrom.h>
-#include <linux/genhd.h>
#include <asm/unaligned.h>
#include "hfsplus_fs.h"
@@ -64,10 +63,8 @@ int hfsplus_submit_bio(struct super_block *sb, sector_t sector,
offset = start & (io_size - 1);
sector &= ~((io_size >> HFSPLUS_SECTOR_SHIFT) - 1);
- bio = bio_alloc(GFP_NOIO, 1);
+ bio = bio_alloc(sb->s_bdev, 1, op | op_flags, GFP_NOIO);
bio->bi_iter.bi_sector = sector;
- bio_set_dev(bio, sb->s_bdev);
- bio_set_op_attrs(bio, op, op_flags);
if (op != WRITE && data)
*data = (u8 *)buf + offset;
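The gfs2 and hfsplus hunks above all follow the same mechanical conversion to the bio_alloc() signature that takes the target block device and the operation flags at allocation time. The before/after pattern, in the shape used here:

	/* old pattern: allocate, then bolt on the device and op/flags */
	bio = bio_alloc(GFP_NOIO, 1);
	bio_set_dev(bio, sb->s_bdev);
	bio_set_op_attrs(bio, REQ_OP_READ, REQ_META);

	/* new pattern: bio_alloc(bdev, nr_vecs, opf, gfp) does all three,
	 * so the bio_set_dev()/bio_set_op_attrs() calls disappear */
	bio = bio_alloc(sb->s_bdev, 1, REQ_OP_READ | REQ_META, GFP_NOIO);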
diff --git a/fs/internal.h b/fs/internal.h
index 8590c973c2f4..56c0477f4215 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -184,7 +184,9 @@ int sb_init_dio_done_wq(struct super_block *sb);
/*
* fs/stat.c:
*/
-int do_statx(int dfd, const char __user *filename, unsigned flags,
+
+int getname_statx_lookup_flags(int flags);
+int do_statx(int dfd, struct filename *filename, unsigned int flags,
unsigned int mask, struct statx __user *buffer);
/*
diff --git a/fs/io-wq.c b/fs/io-wq.c
index bb7f161bb19c..5b93fa67d346 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -76,6 +76,7 @@ struct io_wqe_acct {
unsigned max_workers;
int index;
atomic_t nr_running;
+ raw_spinlock_t lock;
struct io_wq_work_list work_list;
unsigned long flags;
};
@@ -91,7 +92,7 @@ enum {
*/
struct io_wqe {
raw_spinlock_t lock;
- struct io_wqe_acct acct[2];
+ struct io_wqe_acct acct[IO_WQ_ACCT_NR];
int node;
@@ -224,12 +225,12 @@ static void io_worker_exit(struct io_worker *worker)
if (worker->flags & IO_WORKER_F_FREE)
hlist_nulls_del_rcu(&worker->nulls_node);
list_del_rcu(&worker->all_list);
- preempt_disable();
+ raw_spin_unlock(&wqe->lock);
io_wqe_dec_running(worker);
worker->flags = 0;
+ preempt_disable();
current->flags &= ~PF_IO_WORKER;
preempt_enable();
- raw_spin_unlock(&wqe->lock);
kfree_rcu(worker, rcu);
io_worker_ref_put(wqe->wq);
@@ -238,10 +239,15 @@ static void io_worker_exit(struct io_worker *worker)
static inline bool io_acct_run_queue(struct io_wqe_acct *acct)
{
+ bool ret = false;
+
+ raw_spin_lock(&acct->lock);
if (!wq_list_empty(&acct->work_list) &&
!test_bit(IO_ACCT_STALLED_BIT, &acct->flags))
- return true;
- return false;
+ ret = true;
+ raw_spin_unlock(&acct->lock);
+
+ return ret;
}
/*
@@ -385,7 +391,6 @@ fail:
}
static void io_wqe_dec_running(struct io_worker *worker)
- __must_hold(wqe->lock)
{
struct io_wqe_acct *acct = io_wqe_get_acct(worker);
struct io_wqe *wqe = worker->wqe;
@@ -393,13 +398,14 @@ static void io_wqe_dec_running(struct io_worker *worker)
if (!(worker->flags & IO_WORKER_F_UP))
return;
- if (atomic_dec_and_test(&acct->nr_running) && io_acct_run_queue(acct)) {
- atomic_inc(&acct->nr_running);
- atomic_inc(&wqe->wq->worker_refs);
- raw_spin_unlock(&wqe->lock);
- io_queue_worker_create(worker, acct, create_worker_cb);
- raw_spin_lock(&wqe->lock);
- }
+ if (!atomic_dec_and_test(&acct->nr_running))
+ return;
+ if (!io_acct_run_queue(acct))
+ return;
+
+ atomic_inc(&acct->nr_running);
+ atomic_inc(&wqe->wq->worker_refs);
+ io_queue_worker_create(worker, acct, create_worker_cb);
}
/*
@@ -407,11 +413,12 @@ static void io_wqe_dec_running(struct io_worker *worker)
* it's currently on the freelist
*/
static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker)
- __must_hold(wqe->lock)
{
if (worker->flags & IO_WORKER_F_FREE) {
worker->flags &= ~IO_WORKER_F_FREE;
+ raw_spin_lock(&wqe->lock);
hlist_nulls_del_init_rcu(&worker->nulls_node);
+ raw_spin_unlock(&wqe->lock);
}
}
@@ -456,7 +463,7 @@ static bool io_wait_on_hash(struct io_wqe *wqe, unsigned int hash)
static struct io_wq_work *io_get_next_work(struct io_wqe_acct *acct,
struct io_worker *worker)
- __must_hold(wqe->lock)
+ __must_hold(acct->lock)
{
struct io_wq_work_node *node, *prev;
struct io_wq_work *work, *tail;
@@ -498,9 +505,9 @@ static struct io_wq_work *io_get_next_work(struct io_wqe_acct *acct,
* work being added and clearing the stalled bit.
*/
set_bit(IO_ACCT_STALLED_BIT, &acct->flags);
- raw_spin_unlock(&wqe->lock);
+ raw_spin_unlock(&acct->lock);
unstalled = io_wait_on_hash(wqe, stall_hash);
- raw_spin_lock(&wqe->lock);
+ raw_spin_lock(&acct->lock);
if (unstalled) {
clear_bit(IO_ACCT_STALLED_BIT, &acct->flags);
if (wq_has_sleeper(&wqe->wq->hash->wait))
@@ -538,7 +545,6 @@ static void io_assign_current_work(struct io_worker *worker,
static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work);
static void io_worker_handle_work(struct io_worker *worker)
- __releases(wqe->lock)
{
struct io_wqe_acct *acct = io_wqe_get_acct(worker);
struct io_wqe *wqe = worker->wqe;
@@ -555,7 +561,9 @@ static void io_worker_handle_work(struct io_worker *worker)
* can't make progress, any work completion or insertion will
* clear the stalled flag.
*/
+ raw_spin_lock(&acct->lock);
work = io_get_next_work(acct, worker);
+ raw_spin_unlock(&acct->lock);
if (work) {
__io_worker_busy(wqe, worker);
@@ -569,10 +577,9 @@ static void io_worker_handle_work(struct io_worker *worker)
raw_spin_lock(&worker->lock);
worker->next_work = work;
raw_spin_unlock(&worker->lock);
- }
- raw_spin_unlock(&wqe->lock);
- if (!work)
+ } else {
break;
+ }
io_assign_current_work(worker, work);
__set_current_state(TASK_RUNNING);
@@ -608,8 +615,6 @@ static void io_worker_handle_work(struct io_worker *worker)
wake_up(&wq->hash->wait);
}
} while (work);
-
- raw_spin_lock(&wqe->lock);
} while (1);
}
@@ -633,12 +638,10 @@ static int io_wqe_worker(void *data)
long ret;
set_current_state(TASK_INTERRUPTIBLE);
-loop:
- raw_spin_lock(&wqe->lock);
- if (io_acct_run_queue(acct)) {
+ while (io_acct_run_queue(acct))
io_worker_handle_work(worker);
- goto loop;
- }
+
+ raw_spin_lock(&wqe->lock);
/* timed out, exit unless we're the last worker */
if (last_timeout && acct->nr_workers > 1) {
acct->nr_workers--;
@@ -662,10 +665,8 @@ loop:
last_timeout = !ret;
}
- if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
- raw_spin_lock(&wqe->lock);
+ if (test_bit(IO_WQ_BIT_EXIT, &wq->state))
io_worker_handle_work(worker);
- }
audit_free(current);
io_worker_exit(worker);
@@ -705,10 +706,7 @@ void io_wq_worker_sleeping(struct task_struct *tsk)
return;
worker->flags &= ~IO_WORKER_F_RUNNING;
-
- raw_spin_lock(&worker->wqe->lock);
io_wqe_dec_running(worker);
- raw_spin_unlock(&worker->wqe->lock);
}
static void io_init_new_worker(struct io_wqe *wqe, struct io_worker *worker,
@@ -778,10 +776,12 @@ static void create_worker_cont(struct callback_head *cb)
.cancel_all = true,
};
+ raw_spin_unlock(&wqe->lock);
while (io_acct_cancel_pending_work(wqe, acct, &match))
- raw_spin_lock(&wqe->lock);
+ ;
+ } else {
+ raw_spin_unlock(&wqe->lock);
}
- raw_spin_unlock(&wqe->lock);
io_worker_ref_put(wqe->wq);
kfree(worker);
return;
@@ -914,6 +914,7 @@ static bool io_wq_work_match_item(struct io_wq_work *work, void *data)
static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
{
struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
+ struct io_cb_cancel_data match;
unsigned work_flags = work->flags;
bool do_create;
@@ -927,10 +928,12 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
return;
}
- raw_spin_lock(&wqe->lock);
+ raw_spin_lock(&acct->lock);
io_wqe_insert_work(wqe, work);
clear_bit(IO_ACCT_STALLED_BIT, &acct->flags);
+ raw_spin_unlock(&acct->lock);
+ raw_spin_lock(&wqe->lock);
rcu_read_lock();
do_create = !io_wqe_activate_free_worker(wqe, acct);
rcu_read_unlock();
@@ -946,18 +949,18 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
return;
raw_spin_lock(&wqe->lock);
- /* fatal condition, failed to create the first worker */
- if (!acct->nr_workers) {
- struct io_cb_cancel_data match = {
- .fn = io_wq_work_match_item,
- .data = work,
- .cancel_all = false,
- };
-
- if (io_acct_cancel_pending_work(wqe, acct, &match))
- raw_spin_lock(&wqe->lock);
+ if (acct->nr_workers) {
+ raw_spin_unlock(&wqe->lock);
+ return;
}
raw_spin_unlock(&wqe->lock);
+
+ /* fatal condition, failed to create the first worker */
+ match.fn = io_wq_work_match_item;
+ match.data = work;
+ match.cancel_all = false;
+
+ io_acct_cancel_pending_work(wqe, acct, &match);
}
}
@@ -1032,22 +1035,23 @@ static inline void io_wqe_remove_pending(struct io_wqe *wqe,
static bool io_acct_cancel_pending_work(struct io_wqe *wqe,
struct io_wqe_acct *acct,
struct io_cb_cancel_data *match)
- __releases(wqe->lock)
{
struct io_wq_work_node *node, *prev;
struct io_wq_work *work;
+ raw_spin_lock(&acct->lock);
wq_list_for_each(node, prev, &acct->work_list) {
work = container_of(node, struct io_wq_work, list);
if (!match->fn(work, match->data))
continue;
io_wqe_remove_pending(wqe, work, prev);
- raw_spin_unlock(&wqe->lock);
+ raw_spin_unlock(&acct->lock);
io_run_cancel(work, wqe);
match->nr_pending++;
/* not safe to continue after unlock */
return true;
}
+ raw_spin_unlock(&acct->lock);
return false;
}
@@ -1061,7 +1065,6 @@ retry:
struct io_wqe_acct *acct = io_get_acct(wqe, i == 0);
if (io_acct_cancel_pending_work(wqe, acct, match)) {
- raw_spin_lock(&wqe->lock);
if (match->cancel_all)
goto retry;
break;
@@ -1103,13 +1106,11 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
for_each_node(node) {
struct io_wqe *wqe = wq->wqes[node];
- raw_spin_lock(&wqe->lock);
io_wqe_cancel_pending_work(wqe, &match);
- if (match.nr_pending && !match.cancel_all) {
- raw_spin_unlock(&wqe->lock);
+ if (match.nr_pending && !match.cancel_all)
return IO_WQ_CANCEL_OK;
- }
+ raw_spin_lock(&wqe->lock);
io_wqe_cancel_running_work(wqe, &match);
raw_spin_unlock(&wqe->lock);
if (match.nr_running && !match.cancel_all)
@@ -1190,6 +1191,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
acct->index = i;
atomic_set(&acct->nr_running, 0);
INIT_WQ_LIST(&acct->work_list);
+ raw_spin_lock_init(&acct->lock);
}
wqe->wq = wq;
raw_spin_lock_init(&wqe->lock);
@@ -1282,9 +1284,7 @@ static void io_wq_destroy(struct io_wq *wq)
.fn = io_wq_work_match_all,
.cancel_all = true,
};
- raw_spin_lock(&wqe->lock);
io_wqe_cancel_pending_work(wqe, &match);
- raw_spin_unlock(&wqe->lock);
free_cpumask_var(wqe->cpu_mask);
kfree(wqe);
}
@@ -1376,7 +1376,7 @@ int io_wq_max_workers(struct io_wq *wq, int *new_count)
BUILD_BUG_ON((int) IO_WQ_ACCT_UNBOUND != (int) IO_WQ_UNBOUND);
BUILD_BUG_ON((int) IO_WQ_ACCT_NR != 2);
- for (i = 0; i < 2; i++) {
+ for (i = 0; i < IO_WQ_ACCT_NR; i++) {
if (new_count[i] > task_rlimit(current, RLIMIT_NPROC))
new_count[i] = task_rlimit(current, RLIMIT_NPROC);
}
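Taken together, the io-wq hunks above split the work queue off the per-node lock. A summary of the resulting locking rules, as inferred from these hunks rather than stated by them:

/*
 * Inferred locking after this change:
 *   wqe->lock    - worker bookkeeping: free_list, all_list, nr_workers
 *   acct->lock   - acct->work_list and IO_ACCT_STALLED_BIT
 *   worker->lock - worker->next_work / cur_work handoff
 * Enqueue and dequeue now touch only acct->lock, so bounded and
 * unbounded work no longer contend on one per-node lock, and
 * io_wqe_dec_running() can create workers without juggling wqe->lock.
 */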
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 77b9c7e4793b..496a2af7d12c 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -63,6 +63,7 @@
#include <net/sock.h>
#include <net/af_unix.h>
#include <net/scm.h>
+#include <net/busy_poll.h>
#include <linux/anon_inodes.h>
#include <linux/sched/mm.h>
#include <linux/uaccess.h>
@@ -263,11 +264,18 @@ struct io_rsrc_data {
bool quiesce;
};
+struct io_buffer_list {
+ struct list_head list;
+ struct list_head buf_list;
+ __u16 bgid;
+};
+
struct io_buffer {
struct list_head list;
__u64 addr;
__u32 len;
__u16 bid;
+ __u16 bgid;
};
struct io_restriction {
@@ -326,6 +334,14 @@ struct io_submit_state {
struct blk_plug plug;
};
+struct io_ev_fd {
+ struct eventfd_ctx *cq_ev_fd;
+ unsigned int eventfd_async: 1;
+ struct rcu_head rcu;
+};
+
+#define IO_BUFFERS_HASH_BITS 5
+
struct io_ring_ctx {
/* const or read-mostly hot data */
struct {
@@ -335,11 +351,11 @@ struct io_ring_ctx {
unsigned int flags;
unsigned int compat: 1;
unsigned int drain_next: 1;
- unsigned int eventfd_async: 1;
unsigned int restricted: 1;
unsigned int off_timeout_used: 1;
unsigned int drain_active: 1;
unsigned int drain_disabled: 1;
+ unsigned int has_evfd: 1;
} ____cacheline_aligned_in_smp;
/* submission data */
@@ -378,7 +394,9 @@ struct io_ring_ctx {
struct list_head timeout_list;
struct list_head ltimeout_list;
struct list_head cq_overflow_list;
- struct xarray io_buffers;
+ struct list_head *io_buffers;
+ struct list_head io_buffers_cache;
+ struct list_head apoll_cache;
struct xarray personalities;
u32 pers_next;
unsigned sq_thread_idle;
@@ -395,11 +413,16 @@ struct io_ring_ctx {
struct list_head sqd_list;
unsigned long check_cq_overflow;
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ /* used to track busy poll napi_id */
+ struct list_head napi_list;
+ spinlock_t napi_lock; /* napi_list lock */
+#endif
struct {
unsigned cached_cq_tail;
unsigned cq_entries;
- struct eventfd_ctx *cq_ev_fd;
+ struct io_ev_fd __rcu *io_ev_fd;
struct wait_queue_head cq_wait;
unsigned cq_extra;
atomic_t cq_timeouts;
@@ -421,6 +444,8 @@ struct io_ring_ctx {
struct hlist_head *cancel_hash;
unsigned cancel_hash_bits;
bool poll_multi_queue;
+
+ struct list_head io_buffers_comp;
} ____cacheline_aligned_in_smp;
struct io_restriction restrictions;
@@ -436,6 +461,8 @@ struct io_ring_ctx {
struct llist_head rsrc_put_llist;
struct list_head rsrc_ref_list;
spinlock_t rsrc_ref_lock;
+
+ struct list_head io_buffers_pages;
};
/* Keep this last, we don't need it for the fast path */
@@ -461,6 +488,11 @@ struct io_ring_ctx {
};
};
+/*
+ * Arbitrary limit, can be raised if need be
+ */
+#define IO_RINGFD_REG_MAX 16
+
struct io_uring_task {
/* submission side */
int cached_refs;
@@ -476,6 +508,7 @@ struct io_uring_task {
struct io_wq_work_list task_list;
struct io_wq_work_list prior_task_list;
struct callback_head task_work;
+ struct file **registered_rings;
bool task_running;
};
@@ -642,7 +675,7 @@ struct io_statx {
int dfd;
unsigned int mask;
unsigned int flags;
- const char __user *filename;
+ struct filename *filename;
struct statx __user *buffer;
};
@@ -690,6 +723,12 @@ struct io_hardlink {
int flags;
};
+struct io_msg {
+ struct file *file;
+ u64 user_data;
+ u32 len;
+};
+
struct io_async_connect {
struct sockaddr_storage address;
};
@@ -741,6 +780,8 @@ enum {
REQ_F_ARM_LTIMEOUT_BIT,
REQ_F_ASYNC_DATA_BIT,
REQ_F_SKIP_LINK_CQES_BIT,
+ REQ_F_SINGLE_POLL_BIT,
+ REQ_F_DOUBLE_POLL_BIT,
/* keep async read/write and isreg together and in order */
REQ_F_SUPPORT_NOWAIT_BIT,
REQ_F_ISREG_BIT,
@@ -799,6 +840,10 @@ enum {
REQ_F_ASYNC_DATA = BIT(REQ_F_ASYNC_DATA_BIT),
/* don't post CQEs while failing linked requests */
REQ_F_SKIP_LINK_CQES = BIT(REQ_F_SKIP_LINK_CQES_BIT),
+ /* single poll may be active */
+ REQ_F_SINGLE_POLL = BIT(REQ_F_SINGLE_POLL_BIT),
+ /* double poll may be active */
+ REQ_F_DOUBLE_POLL = BIT(REQ_F_DOUBLE_POLL_BIT),
};
struct async_poll {
@@ -825,7 +870,7 @@ enum {
* NOTE! Each of the iocb union members has the file pointer
* as the first entry in their struct definition. So you can
* access the file pointer through any of the sub-structs,
- * or directly as just 'ki_filp' in this struct.
+ * or directly as just 'file' in this struct.
*/
struct io_kiocb {
union {
@@ -855,6 +900,7 @@ struct io_kiocb {
struct io_mkdir mkdir;
struct io_symlink symlink;
struct io_hardlink hardlink;
+ struct io_msg msg;
};
u8 opcode;
@@ -877,6 +923,7 @@ struct io_kiocb {
/* used by request caches, completion batching and iopoll */
struct io_wq_work_node comp_list;
atomic_t refs;
+ atomic_t poll_refs;
struct io_kiocb *link;
struct io_task_work io_task_work;
/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
@@ -885,12 +932,11 @@ struct io_kiocb {
struct async_poll *apoll;
/* opcode allocated if it needs to store data for async defer */
void *async_data;
- struct io_wq_work work;
/* custom credentials, valid IFF REQ_F_CREDS is set */
- const struct cred *creds;
/* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
struct io_buffer *kbuf;
- atomic_t poll_refs;
+ const struct cred *creds;
+ struct io_wq_work work;
};
struct io_tctx_node {
@@ -1105,6 +1151,9 @@ static const struct io_op_def io_op_defs[] = {
[IORING_OP_MKDIRAT] = {},
[IORING_OP_SYMLINKAT] = {},
[IORING_OP_LINKAT] = {},
+ [IORING_OP_MSG_RING] = {
+ .needs_file = 1,
+ },
};
/* requests with any of those set should undergo io_disarm_next() */
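IORING_OP_MSG_RING (backed by struct io_msg above) lets one ring post a completion into another ring's CQ, with len surfacing as the CQE res and user_data carried through. A sketch of the userspace side via the matching liburing helper (assumes liburing 2.2+; the target fd and values are placeholders):

	/* post a CQE with res=42, user_data=0xcafe into the ring
	 * behind target_ring_fd; ring setup elided */
	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);

	io_uring_prep_msg_ring(sqe, target_ring_fd, 42, 0xcafe, 0);
	io_uring_submit(&ring);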
@@ -1141,6 +1190,7 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags);
static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer);
+static void io_eventfd_signal(struct io_ring_ctx *ctx);
static struct kmem_cache *req_cachep;
@@ -1267,36 +1317,88 @@ static inline void io_req_set_rsrc_node(struct io_kiocb *req,
}
}
-static unsigned int __io_put_kbuf(struct io_kiocb *req)
+static unsigned int __io_put_kbuf(struct io_kiocb *req, struct list_head *list)
{
struct io_buffer *kbuf = req->kbuf;
unsigned int cflags;
- cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;
- cflags |= IORING_CQE_F_BUFFER;
+ cflags = IORING_CQE_F_BUFFER | (kbuf->bid << IORING_CQE_BUFFER_SHIFT);
req->flags &= ~REQ_F_BUFFER_SELECTED;
- kfree(kbuf);
+ list_add(&kbuf->list, list);
req->kbuf = NULL;
return cflags;
}
-static inline unsigned int io_put_kbuf(struct io_kiocb *req)
+static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req)
{
if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
return 0;
- return __io_put_kbuf(req);
+ return __io_put_kbuf(req, &req->ctx->io_buffers_comp);
}
-static void io_refs_resurrect(struct percpu_ref *ref, struct completion *compl)
+static inline unsigned int io_put_kbuf(struct io_kiocb *req,
+ unsigned issue_flags)
{
- bool got = percpu_ref_tryget(ref);
+ unsigned int cflags;
- /* already at zero, wait for ->release() */
- if (!got)
- wait_for_completion(compl);
- percpu_ref_resurrect(ref);
- if (got)
- percpu_ref_put(ref);
+ if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
+ return 0;
+
+ /*
+ * We can add this buffer back to two lists:
+ *
+ * 1) The io_buffers_cache list. This one is protected by the
+ * ctx->uring_lock. If we already hold this lock, add back to this
+ * list as we can grab it from issue as well.
+ * 2) The io_buffers_comp list. This one is protected by the
+ * ctx->completion_lock.
+ *
+ * We migrate buffers from the comp_list to the issue cache list
+ * when we need one.
+ */
+ if (issue_flags & IO_URING_F_UNLOCKED) {
+ struct io_ring_ctx *ctx = req->ctx;
+
+ spin_lock(&ctx->completion_lock);
+ cflags = __io_put_kbuf(req, &ctx->io_buffers_comp);
+ spin_unlock(&ctx->completion_lock);
+ } else {
+ cflags = __io_put_kbuf(req, &req->ctx->io_buffers_cache);
+ }
+
+ return cflags;
+}
+
+static struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
+ unsigned int bgid)
+{
+ struct list_head *hash_list;
+ struct io_buffer_list *bl;
+
+ hash_list = &ctx->io_buffers[hash_32(bgid, IO_BUFFERS_HASH_BITS)];
+ list_for_each_entry(bl, hash_list, list)
+ if (bl->bgid == bgid || bgid == -1U)
+ return bl;
+
+ return NULL;
+}
+
+static void io_kbuf_recycle(struct io_kiocb *req)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_buffer_list *bl;
+ struct io_buffer *buf;
+
+ if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
+ return;
+
+ lockdep_assert_held(&ctx->uring_lock);
+
+ buf = req->kbuf;
+ bl = io_buffer_get_list(ctx, buf->bgid);
+ list_add(&buf->list, &bl->buf_list);
+ req->flags &= ~REQ_F_BUFFER_SELECTED;
+ req->kbuf = NULL;
}
static bool io_match_task(struct io_kiocb *head, struct task_struct *task,
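The hashed io_buffer_list groups above back io_uring's provided-buffer feature. As a reminder of the userspace flow they serve, sketched with liburing (buffer geometry and the group id BGID are placeholders; setup and error handling elided):

	/* hand NR_BUFS buffers of BUF_LEN bytes to the kernel under
	 * group BGID, then let it pick one for a read */
	io_uring_prep_provide_buffers(sqe, bufs, BUF_LEN, NR_BUFS, BGID, 0);
	io_uring_submit(&ring);

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_read(sqe, fd, NULL, BUF_LEN, 0);
	sqe->flags |= IOSQE_BUFFER_SELECT;
	sqe->buf_group = BGID;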
@@ -1409,7 +1511,7 @@ static __cold void io_fallback_req_func(struct work_struct *work)
static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
{
struct io_ring_ctx *ctx;
- int hash_bits;
+ int i, hash_bits;
ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
if (!ctx)
@@ -1436,6 +1538,13 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
/* set invalid range, so io_import_fixed() fails meeting it */
ctx->dummy_ubuf->ubuf = -1UL;
+ ctx->io_buffers = kcalloc(1U << IO_BUFFERS_HASH_BITS,
+ sizeof(struct list_head), GFP_KERNEL);
+ if (!ctx->io_buffers)
+ goto err;
+ for (i = 0; i < (1U << IO_BUFFERS_HASH_BITS); i++)
+ INIT_LIST_HEAD(&ctx->io_buffers[i]);
+
if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
goto err;
@@ -1444,14 +1553,17 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
init_waitqueue_head(&ctx->sqo_sq_wait);
INIT_LIST_HEAD(&ctx->sqd_list);
INIT_LIST_HEAD(&ctx->cq_overflow_list);
+ INIT_LIST_HEAD(&ctx->io_buffers_cache);
+ INIT_LIST_HEAD(&ctx->apoll_cache);
init_completion(&ctx->ref_comp);
- xa_init_flags(&ctx->io_buffers, XA_FLAGS_ALLOC1);
xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
mutex_init(&ctx->uring_lock);
init_waitqueue_head(&ctx->cq_wait);
spin_lock_init(&ctx->completion_lock);
spin_lock_init(&ctx->timeout_lock);
INIT_WQ_LIST(&ctx->iopoll_list);
+ INIT_LIST_HEAD(&ctx->io_buffers_pages);
+ INIT_LIST_HEAD(&ctx->io_buffers_comp);
INIT_LIST_HEAD(&ctx->defer_list);
INIT_LIST_HEAD(&ctx->timeout_list);
INIT_LIST_HEAD(&ctx->ltimeout_list);
@@ -1464,10 +1576,15 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_WQ_LIST(&ctx->locked_free_list);
INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ INIT_LIST_HEAD(&ctx->napi_list);
+ spin_lock_init(&ctx->napi_lock);
+#endif
return ctx;
err:
kfree(ctx->dummy_ubuf);
kfree(ctx->cancel_hash);
+ kfree(ctx->io_buffers);
kfree(ctx);
return NULL;
}
@@ -1610,8 +1727,8 @@ static void io_queue_async_work(struct io_kiocb *req, bool *dont_use)
if (WARN_ON_ONCE(!same_thread_group(req->task, current)))
req->work.flags |= IO_WQ_WORK_CANCEL;
- trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
- &req->work, req->flags);
+ trace_io_uring_queue_async_work(ctx, req, req->user_data, req->opcode, req->flags,
+ &req->work, io_wq_is_hashed(&req->work));
io_wq_enqueue(tctx->io_wq, &req->work);
if (link)
io_queue_linked_timeout(link);
@@ -1681,22 +1798,27 @@ static __cold void io_flush_timeouts(struct io_ring_ctx *ctx)
spin_unlock_irq(&ctx->timeout_lock);
}
-static __cold void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
-{
- if (ctx->off_timeout_used)
- io_flush_timeouts(ctx);
- if (ctx->drain_active)
- io_queue_deferred(ctx);
-}
-
static inline void io_commit_cqring(struct io_ring_ctx *ctx)
{
- if (unlikely(ctx->off_timeout_used || ctx->drain_active))
- __io_commit_cqring_flush(ctx);
/* order cqe stores with ring update */
smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);
}
+static void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
+{
+ if (ctx->off_timeout_used || ctx->drain_active) {
+ spin_lock(&ctx->completion_lock);
+ if (ctx->off_timeout_used)
+ io_flush_timeouts(ctx);
+ if (ctx->drain_active)
+ io_queue_deferred(ctx);
+ io_commit_cqring(ctx);
+ spin_unlock(&ctx->completion_lock);
+ }
+ if (ctx->has_evfd)
+ io_eventfd_signal(ctx);
+}
+
static inline bool io_sqring_full(struct io_ring_ctx *ctx)
{
struct io_rings *r = ctx->rings;
@@ -1726,23 +1848,34 @@ static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
return &rings->cqes[tail & mask];
}
-static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
+static void io_eventfd_signal(struct io_ring_ctx *ctx)
{
- if (likely(!ctx->cq_ev_fd))
- return false;
+ struct io_ev_fd *ev_fd;
+
+ rcu_read_lock();
+ /*
+ * rcu_dereference ctx->io_ev_fd once and use it for both the check
+ * and the eventfd_signal
+ */
+ ev_fd = rcu_dereference(ctx->io_ev_fd);
+
+ /*
+ * Check again if ev_fd exists in case an io_eventfd_unregister call
+ * completed between the NULL check of ctx->io_ev_fd at the start of
+ * the function and rcu_read_lock.
+ */
+ if (unlikely(!ev_fd))
+ goto out;
if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
- return false;
- return !ctx->eventfd_async || io_wq_current_is_worker();
+ goto out;
+
+ if (!ev_fd->eventfd_async || io_wq_current_is_worker())
+ eventfd_signal(ev_fd->cq_ev_fd, 1);
+out:
+ rcu_read_unlock();
}
-/*
- * This should only get called when at least one event has been posted.
- * Some applications rely on the eventfd notification count only changing
- * IFF a new CQE has been added to the CQ ring. There's no depedency on
- * 1:1 relationship between how many times this function is called (and
- * hence the eventfd count) and number of CQEs posted to the CQ ring.
- */
-static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
+static inline void io_cqring_wake(struct io_ring_ctx *ctx)
{
/*
* wake_up_all() may seem excessive, but io_wake_function() and
@@ -1751,21 +1884,32 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
*/
if (wq_has_sleeper(&ctx->cq_wait))
wake_up_all(&ctx->cq_wait);
- if (io_should_trigger_evfd(ctx))
- eventfd_signal(ctx->cq_ev_fd, 1);
+}
+
+/*
+ * This should only get called when at least one event has been posted.
+ * Some applications rely on the eventfd notification count only changing
+ * IFF a new CQE has been added to the CQ ring. There's no dependency on
+ * a 1:1 relationship between how many times this function is called (and
+ * hence the eventfd count) and the number of CQEs posted to the CQ ring.
+ */
+static inline void io_cqring_ev_posted(struct io_ring_ctx *ctx)
+{
+ if (unlikely(ctx->off_timeout_used || ctx->drain_active ||
+ ctx->has_evfd))
+ __io_commit_cqring_flush(ctx);
+
+ io_cqring_wake(ctx);
}
static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
{
- /* see waitqueue_active() comment */
- smp_mb();
+ if (unlikely(ctx->off_timeout_used || ctx->drain_active ||
+ ctx->has_evfd))
+ __io_commit_cqring_flush(ctx);
- if (ctx->flags & IORING_SETUP_SQPOLL) {
- if (waitqueue_active(&ctx->cq_wait))
- wake_up_all(&ctx->cq_wait);
- }
- if (io_should_trigger_evfd(ctx))
- eventfd_signal(ctx->cq_ev_fd, 1);
+ if (ctx->flags & IORING_SETUP_SQPOLL)
+ io_cqring_wake(ctx);
}
/* Returns true if there are no backlogged entries after the flush */
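With io_eventfd_signal() reading ctx->io_ev_fd under rcu_read_lock() above, the register/unregister side must publish with rcu_assign_pointer() and defer the free past readers. Those functions are not in this excerpt; a sketch of the registration half under those assumptions (uring_lock held by the caller):

static int io_eventfd_register(struct io_ring_ctx *ctx, int fd, bool async)
{
	struct io_ev_fd *ev_fd;

	if (rcu_dereference_protected(ctx->io_ev_fd,
				      lockdep_is_held(&ctx->uring_lock)))
		return -EBUSY;

	ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
	if (!ev_fd)
		return -ENOMEM;
	ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
	if (IS_ERR(ev_fd->cq_ev_fd)) {
		int ret = PTR_ERR(ev_fd->cq_ev_fd);

		kfree(ev_fd);
		return ret;
	}
	ev_fd->eventfd_async = async;
	ctx->has_evfd = true;
	rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
	return 0;
}

Unregistration would clear has_evfd, RCU-swap the pointer out, and free the io_ev_fd from an RCU callback so in-flight signals stay safe.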
@@ -1905,8 +2049,6 @@ static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data,
{
struct io_uring_cqe *cqe;
- trace_io_uring_complete(ctx, user_data, res, cflags);
-
/*
* If we can't get a cq entry, userspace overflowed the
* submission (by quite a lot). Increment the overflow count in
@@ -1922,16 +2064,23 @@ static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data,
return io_cqring_event_overflow(ctx, user_data, res, cflags);
}
+static inline bool __io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags)
+{
+ trace_io_uring_complete(req->ctx, req, req->user_data, res, cflags);
+ return __io_fill_cqe(req->ctx, req->user_data, res, cflags);
+}
+
static noinline void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags)
{
if (!(req->flags & REQ_F_CQE_SKIP))
- __io_fill_cqe(req->ctx, req->user_data, res, cflags);
+ __io_fill_cqe_req(req, res, cflags);
}
static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data,
s32 res, u32 cflags)
{
ctx->cq_extra++;
+ trace_io_uring_complete(ctx, NULL, user_data, res, cflags);
return __io_fill_cqe(ctx, user_data, res, cflags);
}
@@ -1941,7 +2090,7 @@ static void __io_req_complete_post(struct io_kiocb *req, s32 res,
struct io_ring_ctx *ctx = req->ctx;
if (!(req->flags & REQ_F_CQE_SKIP))
- __io_fill_cqe(ctx, req->user_data, res, cflags);
+ __io_fill_cqe_req(req, res, cflags);
/*
* If we're the last reference to this request, add to our locked
* free_list cache.
@@ -2000,7 +2149,7 @@ static inline void io_req_complete(struct io_kiocb *req, s32 res)
static void io_req_complete_failed(struct io_kiocb *req, s32 res)
{
req_set_fail(req);
- io_req_complete_post(req, res, 0);
+ io_req_complete_post(req, res, io_put_kbuf(req, 0));
}
static void io_req_complete_fail_submit(struct io_kiocb *req)
@@ -2183,7 +2332,9 @@ static void io_fail_links(struct io_kiocb *req)
nxt = link->link;
link->link = NULL;
- trace_io_uring_fail_link(req, link);
+ trace_io_uring_fail_link(req->ctx, req, req->user_data,
+ req->opcode, link);
+
if (!ignore_cqes) {
link->flags &= ~REQ_F_CQE_SKIP;
io_fill_cqe_req(link, res, 0);
@@ -2302,7 +2453,8 @@ static void handle_prev_tw_list(struct io_wq_work_node *node,
if (likely(*uring_locked))
req->io_task_work.func(req, uring_locked);
else
- __io_req_complete_post(req, req->result, io_put_kbuf(req));
+ __io_req_complete_post(req, req->result,
+ io_put_kbuf_comp(req));
node = next;
} while (node);
@@ -2530,8 +2682,16 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
comp_list);
if (!(req->flags & REQ_F_CQE_SKIP))
- __io_fill_cqe(ctx, req->user_data, req->result,
- req->cflags);
+ __io_fill_cqe_req(req, req->result, req->cflags);
+ if ((req->flags & REQ_F_POLLED) && req->apoll) {
+ struct async_poll *apoll = req->apoll;
+
+ if (apoll->double_poll)
+ kfree(apoll->double_poll);
+ list_add(&apoll->poll.wait.entry,
+ &ctx->apoll_cache);
+ req->flags &= ~REQ_F_POLLED;
+ }
}
io_commit_cqring(ctx);
@@ -2653,7 +2813,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
if (unlikely(req->flags & REQ_F_CQE_SKIP))
continue;
- __io_fill_cqe(ctx, req->user_data, req->result, io_put_kbuf(req));
+ __io_fill_cqe_req(req, req->result, io_put_kbuf(req, 0));
nr_events++;
}
@@ -2829,14 +2989,14 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res)
static inline void io_req_task_complete(struct io_kiocb *req, bool *locked)
{
- unsigned int cflags = io_put_kbuf(req);
int res = req->result;
if (*locked) {
- io_req_complete_state(req, res, cflags);
+ io_req_complete_state(req, res, io_put_kbuf(req, 0));
io_req_add_compl_list(req);
} else {
- io_req_complete_post(req, res, cflags);
+ io_req_complete_post(req, res,
+ io_put_kbuf(req, IO_URING_F_UNLOCKED));
}
}
@@ -2845,7 +3005,8 @@ static void __io_complete_rw(struct io_kiocb *req, long res,
{
if (__io_complete_rw_common(req, res))
return;
- __io_req_complete(req, issue_flags, req->result, io_put_kbuf(req));
+ __io_req_complete(req, issue_flags, req->result,
+ io_put_kbuf(req, issue_flags));
}
static void io_complete_rw(struct kiocb *kiocb, long res)
@@ -3000,14 +3161,6 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT;
kiocb->ki_pos = READ_ONCE(sqe->off);
- if (kiocb->ki_pos == -1) {
- if (!(file->f_mode & FMODE_STREAM)) {
- req->flags |= REQ_F_CUR_POS;
- kiocb->ki_pos = file->f_pos;
- } else {
- kiocb->ki_pos = 0;
- }
- }
kiocb->ki_flags = iocb_flags(file);
ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
if (unlikely(ret))
@@ -3074,6 +3227,24 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
}
}
+static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req)
+{
+ struct kiocb *kiocb = &req->rw.kiocb;
+ bool is_stream = req->file->f_mode & FMODE_STREAM;
+
+ if (kiocb->ki_pos == -1) {
+ if (!is_stream) {
+ req->flags |= REQ_F_CUR_POS;
+ kiocb->ki_pos = req->file->f_pos;
+ return &kiocb->ki_pos;
+ } else {
+ kiocb->ki_pos = 0;
+ return NULL;
+ }
+ }
+ return is_stream ? NULL : &kiocb->ki_pos;
+}
+
static void kiocb_done(struct io_kiocb *req, ssize_t ret,
unsigned int issue_flags)
{
@@ -3096,14 +3267,10 @@ static void kiocb_done(struct io_kiocb *req, ssize_t ret,
if (req->flags & REQ_F_REISSUE) {
req->flags &= ~REQ_F_REISSUE;
- if (io_resubmit_prep(req)) {
+ if (io_resubmit_prep(req))
io_req_task_queue_reissue(req);
- } else {
- req_set_fail(req);
- req->result = ret;
- req->io_task_work.func = io_req_task_complete;
- io_req_task_work_add(req, false);
- }
+ else
+ io_req_task_queue_fail(req, ret);
}
}
@@ -3201,30 +3368,36 @@ static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock)
mutex_lock(&ctx->uring_lock);
}
+static void io_buffer_add_list(struct io_ring_ctx *ctx,
+ struct io_buffer_list *bl, unsigned int bgid)
+{
+ struct list_head *list;
+
+ list = &ctx->io_buffers[hash_32(bgid, IO_BUFFERS_HASH_BITS)];
+ INIT_LIST_HEAD(&bl->buf_list);
+ bl->bgid = bgid;
+ list_add(&bl->list, list);
+}
+
static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
int bgid, unsigned int issue_flags)
{
struct io_buffer *kbuf = req->kbuf;
- struct io_buffer *head;
bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_buffer_list *bl;
if (req->flags & REQ_F_BUFFER_SELECTED)
return kbuf;
- io_ring_submit_lock(req->ctx, needs_lock);
+ io_ring_submit_lock(ctx, needs_lock);
- lockdep_assert_held(&req->ctx->uring_lock);
+ lockdep_assert_held(&ctx->uring_lock);
- head = xa_load(&req->ctx->io_buffers, bgid);
- if (head) {
- if (!list_empty(&head->list)) {
- kbuf = list_last_entry(&head->list, struct io_buffer,
- list);
- list_del(&kbuf->list);
- } else {
- kbuf = head;
- xa_erase(&req->ctx->io_buffers, bgid);
- }
+ bl = io_buffer_get_list(ctx, bgid);
+ if (bl && !list_empty(&bl->buf_list)) {
+ kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list);
+ list_del(&kbuf->list);
if (*len > kbuf->len)
*len = kbuf->len;
req->flags |= REQ_F_BUFFER_SELECTED;
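The buffer-group lookup above replaces the old per-ctx xarray with a small hash table: the bgid is hashed with hash_32() into one of 1 << IO_BUFFERS_HASH_BITS bucket lists, and lookup walks that bucket. A userspace sketch of the same scheme, assuming 16 buckets and the multiplicative constant hash_32() is built on:

#include <stddef.h>
#include <stdint.h>

#define HASH_BITS	4
#define NR_BUCKETS	(1U << HASH_BITS)

struct buf_list {
	struct buf_list *next;
	unsigned int bgid;
};

static struct buf_list *buckets[NR_BUCKETS];

static unsigned int hash32(uint32_t v)
{
	/* same GOLDEN_RATIO_32 multiply the kernel's hash_32() uses */
	return (v * 0x61C88647u) >> (32 - HASH_BITS);
}

/* mirrors io_buffer_get_list(): walk the bucket for a bgid match */
static struct buf_list *get_list(unsigned int bgid)
{
	struct buf_list *bl;

	for (bl = buckets[hash32(bgid)]; bl; bl = bl->next)
		if (bl->bgid == bgid)
			return bl;
	return NULL;
}

/* mirrors io_buffer_add_list(): push a new group onto its bucket */
static void add_list(struct buf_list *bl, unsigned int bgid)
{
	unsigned int b = hash32(bgid);

	bl->bgid = bgid;
	bl->next = buckets[b];
	buckets[b] = bl;
}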
@@ -3400,6 +3573,7 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
struct kiocb *kiocb = &req->rw.kiocb;
struct file *file = req->file;
ssize_t ret = 0;
+ loff_t *ppos;
/*
* Don't support polled IO through this interface, and we can't
@@ -3412,6 +3586,8 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
!(kiocb->ki_filp->f_flags & O_NONBLOCK))
return -EAGAIN;
+ ppos = io_kiocb_ppos(kiocb);
+
while (iov_iter_count(iter)) {
struct iovec iovec;
ssize_t nr;
@@ -3425,10 +3601,10 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
if (rw == READ) {
nr = file->f_op->read(file, iovec.iov_base,
- iovec.iov_len, io_kiocb_ppos(kiocb));
+ iovec.iov_len, ppos);
} else {
nr = file->f_op->write(file, iovec.iov_base,
- iovec.iov_len, io_kiocb_ppos(kiocb));
+ iovec.iov_len, ppos);
}
if (nr < 0) {
@@ -3436,13 +3612,15 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
ret = nr;
break;
}
+ ret += nr;
if (!iov_iter_is_bvec(iter)) {
iov_iter_advance(iter, nr);
} else {
- req->rw.len -= nr;
req->rw.addr += nr;
+ req->rw.len -= nr;
+ if (!req->rw.len)
+ break;
}
- ret += nr;
if (nr != iovec.iov_len)
break;
}
@@ -3629,12 +3807,23 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
struct io_async_rw *rw;
ssize_t ret, ret2;
+ loff_t *ppos;
if (!req_has_async_data(req)) {
ret = io_import_iovec(READ, req, &iovec, s, issue_flags);
if (unlikely(ret < 0))
return ret;
} else {
+ /*
+ * Safe and required to re-import if we're using provided
+ * buffers, as we dropped the selected one before retry.
+ */
+ if (req->flags & REQ_F_BUFFER_SELECT) {
+ ret = io_import_iovec(READ, req, &iovec, s, issue_flags);
+ if (unlikely(ret < 0))
+ return ret;
+ }
+
rw = req->async_data;
s = &rw->s;
/*
@@ -3659,7 +3848,9 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
kiocb->ki_flags &= ~IOCB_NOWAIT;
}
- ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), req->result);
+ ppos = io_kiocb_update_pos(req);
+
+ ret = rw_verify_area(READ, req->file, ppos, req->result);
if (unlikely(ret)) {
kfree(iovec);
return ret;
@@ -3669,6 +3860,9 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
req->flags &= ~REQ_F_REISSUE;
+ /* if we can poll, just do that */
+ if (req->opcode == IORING_OP_READ && file_can_poll(req->file))
+ return -EAGAIN;
/* IOPOLL retry should happen for io-wq threads */
if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
goto done;
@@ -3758,6 +3952,7 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)
struct kiocb *kiocb = &req->rw.kiocb;
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
ssize_t ret, ret2;
+ loff_t *ppos;
if (!req_has_async_data(req)) {
ret = io_import_iovec(WRITE, req, &iovec, s, issue_flags);
@@ -3788,7 +3983,9 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)
kiocb->ki_flags &= ~IOCB_NOWAIT;
}
- ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), req->result);
+ ppos = io_kiocb_update_pos(req);
+
+ ret = rw_verify_area(WRITE, req->file, ppos, req->result);
if (unlikely(ret))
goto out_free;
@@ -4235,6 +4432,45 @@ static int io_nop(struct io_kiocb *req, unsigned int issue_flags)
return 0;
}
+static int io_msg_ring_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+ if (unlikely(sqe->addr || sqe->ioprio || sqe->rw_flags ||
+ sqe->splice_fd_in || sqe->buf_index || sqe->personality))
+ return -EINVAL;
+
+ if (req->file->f_op != &io_uring_fops)
+ return -EBADFD;
+
+ req->msg.user_data = READ_ONCE(sqe->off);
+ req->msg.len = READ_ONCE(sqe->len);
+ return 0;
+}
+
+static int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags)
+{
+ struct io_ring_ctx *target_ctx;
+ struct io_msg *msg = &req->msg;
+ int ret = -EOVERFLOW;
+ bool filled;
+
+ target_ctx = req->file->private_data;
+
+ spin_lock(&target_ctx->completion_lock);
+ filled = io_fill_cqe_aux(target_ctx, msg->user_data, msg->len,
+ IORING_CQE_F_MSG);
+ io_commit_cqring(target_ctx);
+ spin_unlock(&target_ctx->completion_lock);
+
+ if (filled) {
+ io_cqring_ev_posted(target_ctx);
+ ret = 0;
+ }
+
+ __io_req_complete(req, issue_flags, ret, 0);
+ return 0;
+}
+
static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_ring_ctx *ctx = req->ctx;
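From userspace the new opcode targets another ring through its file descriptor: sqe->fd names the target ring, sqe->off becomes the posted CQE's user_data, sqe->len its res, and the resulting CQE carries IORING_CQE_F_MSG. A hedged usage sketch, assuming liburing 2.2+ which wraps this as io_uring_prep_msg_ring():

#include <liburing.h>

/* Post a CQE with res=0x10, user_data=0xdeadbeef into the ring
 * behind target_ring_fd. */
static int send_msg(struct io_uring *ring, int target_ring_fd)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

	if (!sqe)
		return -1;
	io_uring_prep_msg_ring(sqe, target_ring_fd, 0x10, 0xdeadbeef, 0);
	return io_uring_submit(ring);
}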
@@ -4458,8 +4694,8 @@ static int io_remove_buffers_prep(struct io_kiocb *req,
return 0;
}
-static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf,
- int bgid, unsigned nbufs)
+static int __io_remove_buffers(struct io_ring_ctx *ctx,
+ struct io_buffer_list *bl, unsigned nbufs)
{
unsigned i = 0;
@@ -4468,19 +4704,16 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf,
return 0;
/* the head kbuf is the list itself */
- while (!list_empty(&buf->list)) {
+ while (!list_empty(&bl->buf_list)) {
struct io_buffer *nxt;
- nxt = list_first_entry(&buf->list, struct io_buffer, list);
+ nxt = list_first_entry(&bl->buf_list, struct io_buffer, list);
list_del(&nxt->list);
- kfree(nxt);
if (++i == nbufs)
return i;
cond_resched();
}
i++;
- kfree(buf);
- xa_erase(&ctx->io_buffers, bgid);
return i;
}
@@ -4489,7 +4722,7 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_provide_buf *p = &req->pbuf;
struct io_ring_ctx *ctx = req->ctx;
- struct io_buffer *head;
+ struct io_buffer_list *bl;
int ret = 0;
bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
@@ -4498,9 +4731,9 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
lockdep_assert_held(&ctx->uring_lock);
ret = -ENOENT;
- head = xa_load(&ctx->io_buffers, p->bgid);
- if (head)
- ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs);
+ bl = io_buffer_get_list(ctx, p->bgid);
+ if (bl)
+ ret = __io_remove_buffers(ctx, bl, p->nbufs);
if (ret < 0)
req_set_fail(req);
@@ -4545,38 +4778,80 @@ static int io_provide_buffers_prep(struct io_kiocb *req,
return 0;
}
-static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head)
+static int io_refill_buffer_cache(struct io_ring_ctx *ctx)
+{
+ struct io_buffer *buf;
+ struct page *page;
+ int bufs_in_page;
+
+ /*
+ * Completions that don't happen inline (e.g. not under uring_lock) will
+ * add to ->io_buffers_comp. If we don't have any free buffers, check
+ * the completion list and splice those entries first.
+ */
+ if (!list_empty_careful(&ctx->io_buffers_comp)) {
+ spin_lock(&ctx->completion_lock);
+ if (!list_empty(&ctx->io_buffers_comp)) {
+ list_splice_init(&ctx->io_buffers_comp,
+ &ctx->io_buffers_cache);
+ spin_unlock(&ctx->completion_lock);
+ return 0;
+ }
+ spin_unlock(&ctx->completion_lock);
+ }
+
+ /*
+ * No free buffers and no completion entries either. Allocate a new
+ * page worth of buffer entries and add those to our freelist.
+ */
+ page = alloc_page(GFP_KERNEL_ACCOUNT);
+ if (!page)
+ return -ENOMEM;
+
+ list_add(&page->lru, &ctx->io_buffers_pages);
+
+ buf = page_address(page);
+ bufs_in_page = PAGE_SIZE / sizeof(*buf);
+ while (bufs_in_page) {
+ list_add_tail(&buf->list, &ctx->io_buffers_cache);
+ buf++;
+ bufs_in_page--;
+ }
+
+ return 0;
+}
+
+static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf,
+ struct io_buffer_list *bl)
{
struct io_buffer *buf;
u64 addr = pbuf->addr;
int i, bid = pbuf->bid;
for (i = 0; i < pbuf->nbufs; i++) {
- buf = kmalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT);
- if (!buf)
+ if (list_empty(&ctx->io_buffers_cache) &&
+ io_refill_buffer_cache(ctx))
break;
-
+ buf = list_first_entry(&ctx->io_buffers_cache, struct io_buffer,
+ list);
+ list_move_tail(&buf->list, &bl->buf_list);
buf->addr = addr;
buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT);
buf->bid = bid;
+ buf->bgid = pbuf->bgid;
addr += pbuf->len;
bid++;
- if (!*head) {
- INIT_LIST_HEAD(&buf->list);
- *head = buf;
- } else {
- list_add_tail(&buf->list, &(*head)->list);
- }
+ cond_resched();
}
- return i ? i : -ENOMEM;
+ return i ? 0 : -ENOMEM;
}
static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_provide_buf *p = &req->pbuf;
struct io_ring_ctx *ctx = req->ctx;
- struct io_buffer *head, *list;
+ struct io_buffer_list *bl;
int ret = 0;
bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
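io_refill_buffer_cache() trades one kmalloc() per provided buffer for one page allocation per PAGE_SIZE / sizeof(struct io_buffer) entries, after first splicing back any buffers parked on ->io_buffers_comp. A self-contained userspace analogue of the page-carving freelist (illustrative names; real code must also remember the page so it can be freed later, as the kernel does via ctx->io_buffers_pages):

#include <stddef.h>
#include <stdlib.h>

#define PAGE_SZ	4096

struct node {
	struct node *next;
	/* payload fields would follow, as in struct io_buffer */
};

/* Carve one "page" into nodes and push them all on the freelist;
 * returns the (possibly unchanged) new freelist head. */
static struct node *refill_cache(struct node *free_head)
{
	char *page = aligned_alloc(PAGE_SZ, PAGE_SZ);
	size_t i, n = PAGE_SZ / sizeof(struct node);

	if (!page)
		return free_head;	/* allocation failed, list unchanged */

	for (i = 0; i < n; i++) {
		struct node *nd = (struct node *)(page + i * sizeof(*nd));

		nd->next = free_head;
		free_head = nd;
	}
	return free_head;
}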
@@ -4584,14 +4859,18 @@ static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
lockdep_assert_held(&ctx->uring_lock);
- list = head = xa_load(&ctx->io_buffers, p->bgid);
-
- ret = io_add_buffers(p, &head);
- if (ret >= 0 && !list) {
- ret = xa_insert(&ctx->io_buffers, p->bgid, head, GFP_KERNEL);
- if (ret < 0)
- __io_remove_buffers(ctx, head, p->bgid, -1U);
+ bl = io_buffer_get_list(ctx, p->bgid);
+ if (unlikely(!bl)) {
+ bl = kmalloc(sizeof(*bl), GFP_KERNEL);
+ if (!bl) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ io_buffer_add_list(ctx, bl, p->bgid);
}
+
+ ret = io_add_buffers(ctx, p, bl);
+err:
if (ret < 0)
req_set_fail(req);
/* complete before unlock, IOPOLL may need the lock */
@@ -4721,6 +5000,8 @@ static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags)
static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
+ const char __user *path;
+
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
@@ -4730,10 +5011,22 @@ static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
req->statx.dfd = READ_ONCE(sqe->fd);
req->statx.mask = READ_ONCE(sqe->len);
- req->statx.filename = u64_to_user_ptr(READ_ONCE(sqe->addr));
+ path = u64_to_user_ptr(READ_ONCE(sqe->addr));
req->statx.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
req->statx.flags = READ_ONCE(sqe->statx_flags);
+ req->statx.filename = getname_flags(path,
+ getname_statx_lookup_flags(req->statx.flags),
+ NULL);
+
+ if (IS_ERR(req->statx.filename)) {
+ int ret = PTR_ERR(req->statx.filename);
+
+ req->statx.filename = NULL;
+ return ret;
+ }
+
+ req->flags |= REQ_F_NEED_CLEANUP;
return 0;
}
@@ -5183,7 +5476,7 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
if (kmsg->free_iov)
kfree(kmsg->free_iov);
req->flags &= ~REQ_F_NEED_CLEANUP;
- __io_req_complete(req, issue_flags, ret, io_put_kbuf(req));
+ __io_req_complete(req, issue_flags, ret, io_put_kbuf(req, issue_flags));
return 0;
}
@@ -5238,7 +5531,8 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
out_free:
req_set_fail(req);
}
- __io_req_complete(req, issue_flags, ret, io_put_kbuf(req));
+
+ __io_req_complete(req, issue_flags, ret, io_put_kbuf(req, issue_flags));
return 0;
}
@@ -5257,8 +5551,7 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
accept->nofile = rlimit(RLIMIT_NOFILE);
accept->file_slot = READ_ONCE(sqe->file_index);
- if (accept->file_slot && ((req->open.how.flags & O_CLOEXEC) ||
- (accept->flags & SOCK_CLOEXEC)))
+ if (accept->file_slot && (accept->flags & SOCK_CLOEXEC))
return -EINVAL;
if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
return -EINVAL;
@@ -5398,6 +5691,108 @@ IO_NETOP_FN(send);
IO_NETOP_FN(recv);
#endif /* CONFIG_NET */
+#ifdef CONFIG_NET_RX_BUSY_POLL
+
+#define NAPI_TIMEOUT (60 * SEC_CONVERSION)
+
+struct napi_entry {
+ struct list_head list;
+ unsigned int napi_id;
+ unsigned long timeout;
+};
+
+/*
+ * Add busy poll NAPI ID from sk.
+ */
+static void io_add_napi(struct file *file, struct io_ring_ctx *ctx)
+{
+ unsigned int napi_id;
+ struct socket *sock;
+ struct sock *sk;
+ struct napi_entry *ne;
+
+ if (!net_busy_loop_on())
+ return;
+
+ sock = sock_from_file(file);
+ if (!sock)
+ return;
+
+ sk = sock->sk;
+ if (!sk)
+ return;
+
+ napi_id = READ_ONCE(sk->sk_napi_id);
+
+ /* Non-NAPI IDs can be rejected */
+ if (napi_id < MIN_NAPI_ID)
+ return;
+
+ spin_lock(&ctx->napi_lock);
+ list_for_each_entry(ne, &ctx->napi_list, list) {
+ if (ne->napi_id == napi_id) {
+ ne->timeout = jiffies + NAPI_TIMEOUT;
+ goto out;
+ }
+ }
+
+ ne = kmalloc(sizeof(*ne), GFP_NOWAIT);
+ if (!ne)
+ goto out;
+
+ ne->napi_id = napi_id;
+ ne->timeout = jiffies + NAPI_TIMEOUT;
+ list_add_tail(&ne->list, &ctx->napi_list);
+out:
+ spin_unlock(&ctx->napi_lock);
+}
+
+static inline void io_check_napi_entry_timeout(struct napi_entry *ne)
+{
+ if (time_after(jiffies, ne->timeout)) {
+ list_del(&ne->list);
+ kfree(ne);
+ }
+}
+
+/*
+ * Busy-poll if globally enabled and supporting sockets are found
+ */
+static bool io_napi_busy_loop(struct list_head *napi_list)
+{
+ struct napi_entry *ne, *n;
+
+ list_for_each_entry_safe(ne, n, napi_list, list) {
+ napi_busy_loop(ne->napi_id, NULL, NULL, true,
+ BUSY_POLL_BUDGET);
+ io_check_napi_entry_timeout(ne);
+ }
+ return !list_empty(napi_list);
+}
+
+static void io_free_napi_list(struct io_ring_ctx *ctx)
+{
+ spin_lock(&ctx->napi_lock);
+ while (!list_empty(&ctx->napi_list)) {
+ struct napi_entry *ne =
+ list_first_entry(&ctx->napi_list, struct napi_entry,
+ list);
+
+ list_del(&ne->list);
+ kfree(ne);
+ }
+ spin_unlock(&ctx->napi_lock);
+}
+#else
+static inline void io_add_napi(struct file *file, struct io_ring_ctx *ctx)
+{
+}
+
+static inline void io_free_napi_list(struct io_ring_ctx *ctx)
+{
+}
+#endif /* CONFIG_NET_RX_BUSY_POLL */
+
struct io_poll_table {
struct poll_table_struct pt;
struct io_kiocb *req;
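io_add_napi() keeps a per-ring list of NAPI IDs, refreshing an entry's expiry on every hit and aging entries out after NAPI_TIMEOUT (60 s). A userspace analogue of the expiring-ID list, with CLOCK_MONOTONIC seconds standing in for jiffies (illustrative names, not kernel API):

#include <stdlib.h>
#include <time.h>

#define ID_TIMEOUT	60	/* seconds, like NAPI_TIMEOUT */

struct id_entry {
	struct id_entry *next;
	unsigned int id;
	time_t expires;
};

static time_t now_sec(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec;
}

/* mirrors io_add_napi(): refresh on match, else best-effort insert */
static void track_id(struct id_entry **head, unsigned int id)
{
	struct id_entry *e;

	for (e = *head; e; e = e->next) {
		if (e->id == id) {
			e->expires = now_sec() + ID_TIMEOUT;
			return;
		}
	}
	e = malloc(sizeof(*e));
	if (!e)
		return;		/* best effort, like GFP_NOWAIT */
	e->id = id;
	e->expires = now_sec() + ID_TIMEOUT;
	e->next = *head;
	*head = e;
}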
@@ -5473,8 +5868,12 @@ static inline void io_poll_remove_entry(struct io_poll_iocb *poll)
static void io_poll_remove_entries(struct io_kiocb *req)
{
- struct io_poll_iocb *poll = io_poll_get_single(req);
- struct io_poll_iocb *poll_double = io_poll_get_double(req);
+ /*
+ * Nothing to do if neither of those flags is set. Avoid dipping
+ * into the poll/apoll/double cachelines if we can.
+ */
+ if (!(req->flags & (REQ_F_SINGLE_POLL | REQ_F_DOUBLE_POLL)))
+ return;
/*
* While we hold the waitqueue lock and the waitqueue is nonempty,
@@ -5492,9 +5891,10 @@ static void io_poll_remove_entries(struct io_kiocb *req)
* In that case, only RCU prevents the queue memory from being freed.
*/
rcu_read_lock();
- io_poll_remove_entry(poll);
- if (poll_double)
- io_poll_remove_entry(poll_double);
+ if (req->flags & REQ_F_SINGLE_POLL)
+ io_poll_remove_entry(io_poll_get_single(req));
+ if (req->flags & REQ_F_DOUBLE_POLL)
+ io_poll_remove_entry(io_poll_get_double(req));
rcu_read_unlock();
}
@@ -5526,13 +5926,13 @@ static int io_poll_check_events(struct io_kiocb *req)
return -ECANCELED;
if (!req->result) {
- struct poll_table_struct pt = { ._key = poll->events };
+ struct poll_table_struct pt = { ._key = req->cflags };
- req->result = vfs_poll(req->file, &pt) & poll->events;
+ req->result = vfs_poll(req->file, &pt) & req->cflags;
}
/* multishot, just fill a CQE and proceed */
- if (req->result && !(poll->events & EPOLLONESHOT)) {
+ if (req->result && !(req->cflags & EPOLLONESHOT)) {
__poll_t mask = mangle_poll(req->result & poll->events);
bool filled;
@@ -5544,6 +5944,7 @@ static int io_poll_check_events(struct io_kiocb *req)
if (unlikely(!filled))
return -ECANCELED;
io_cqring_ev_posted(ctx);
+ io_add_napi(req->file, ctx);
} else if (req->result) {
return 0;
}
@@ -5602,29 +6003,36 @@ static void io_apoll_task_func(struct io_kiocb *req, bool *locked)
io_req_complete_failed(req, ret);
}
-static void __io_poll_execute(struct io_kiocb *req, int mask)
+static void __io_poll_execute(struct io_kiocb *req, int mask, int events)
{
req->result = mask;
+ /*
+ * This is useful for poll that is armed on behalf of another
+ * request, and where the wakeup path could be on a different
+ * CPU. We want to avoid pulling in req->apoll->events for that
+ * case.
+ */
+ req->cflags = events;
if (req->opcode == IORING_OP_POLL_ADD)
req->io_task_work.func = io_poll_task_func;
else
req->io_task_work.func = io_apoll_task_func;
- trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask);
+ trace_io_uring_task_add(req->ctx, req, req->user_data, req->opcode, mask);
io_req_task_work_add(req, false);
}
-static inline void io_poll_execute(struct io_kiocb *req, int res)
+static inline void io_poll_execute(struct io_kiocb *req, int res, int events)
{
if (io_poll_get_ownership(req))
- __io_poll_execute(req, res);
+ __io_poll_execute(req, res, events);
}
static void io_poll_cancel_req(struct io_kiocb *req)
{
io_poll_mark_cancelled(req);
/* kick tw, which should complete the request */
- io_poll_execute(req, 0);
+ io_poll_execute(req, 0, 0);
}
static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
@@ -5638,7 +6046,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
if (unlikely(mask & POLLFREE)) {
io_poll_mark_cancelled(req);
/* we have to kick tw in case it's not already */
- io_poll_execute(req, 0);
+ io_poll_execute(req, 0, poll->events);
/*
* If the waitqueue is being freed early but someone is already
@@ -5668,8 +6076,9 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
if (mask && poll->events & EPOLLONESHOT) {
list_del_init(&poll->wait.entry);
poll->head = NULL;
+ req->flags &= ~REQ_F_SINGLE_POLL;
}
- __io_poll_execute(req, mask);
+ __io_poll_execute(req, mask, poll->events);
}
return 1;
}
@@ -5704,12 +6113,14 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
pt->error = -ENOMEM;
return;
}
+ req->flags |= REQ_F_DOUBLE_POLL;
io_init_poll_iocb(poll, first->events, first->wait.func);
*poll_ptr = poll;
if (req->opcode == IORING_OP_POLL_ADD)
req->flags |= REQ_F_ASYNC_DATA;
}
+ req->flags |= REQ_F_SINGLE_POLL;
pt->nr_entries++;
poll->head = head;
poll->wait.private = req;
@@ -5773,9 +6184,10 @@ static int __io_arm_poll_handler(struct io_kiocb *req,
/* can't multishot if failed, just queue the event we've got */
if (unlikely(ipt->error || !ipt->nr_entries))
poll->events |= EPOLLONESHOT;
- __io_poll_execute(req, mask);
+ __io_poll_execute(req, mask, poll->events);
return 0;
}
+ io_add_napi(req->file, req->ctx);
/*
* Release ownership. If someone tried to queue a tw while it was
@@ -5783,7 +6195,7 @@ static int __io_arm_poll_handler(struct io_kiocb *req,
*/
v = atomic_dec_return(&req->poll_refs);
if (unlikely(v & IO_POLL_REF_MASK))
- __io_poll_execute(req, 0);
+ __io_poll_execute(req, 0, poll->events);
return 0;
}
@@ -5802,7 +6214,7 @@ enum {
IO_APOLL_READY
};
-static int io_arm_poll_handler(struct io_kiocb *req)
+static int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)
{
const struct io_op_def *def = &io_op_defs[req->opcode];
struct io_ring_ctx *ctx = req->ctx;
@@ -5827,9 +6239,16 @@ static int io_arm_poll_handler(struct io_kiocb *req)
mask |= POLLOUT | POLLWRNORM;
}
- apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
- if (unlikely(!apoll))
- return IO_APOLL_ABORTED;
+ if (!(issue_flags & IO_URING_F_UNLOCKED) &&
+ !list_empty(&ctx->apoll_cache)) {
+ apoll = list_first_entry(&ctx->apoll_cache, struct async_poll,
+ poll.wait.entry);
+ list_del_init(&apoll->poll.wait.entry);
+ } else {
+ apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
+ if (unlikely(!apoll))
+ return IO_APOLL_ABORTED;
+ }
apoll->double_poll = NULL;
req->apoll = apoll;
req->flags |= REQ_F_POLLED;
@@ -5839,7 +6258,7 @@ static int io_arm_poll_handler(struct io_kiocb *req)
if (ret || ipt.error)
return ret ? IO_APOLL_READY : IO_APOLL_ABORTED;
- trace_io_uring_poll_arm(ctx, req, req->opcode, req->user_data,
+ trace_io_uring_poll_arm(ctx, req, req->user_data, req->opcode,
mask, apoll->poll.events);
return IO_APOLL_OK;
}
@@ -5974,7 +6393,7 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
return -EINVAL;
io_req_set_refcount(req);
- poll->events = io_poll_parse_events(sqe, flags);
+ req->cflags = poll->events = io_poll_parse_events(sqe, flags);
return 0;
}
@@ -6091,10 +6510,7 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
if (IS_ERR(req))
return PTR_ERR(req);
-
- req_set_fail(req);
- io_fill_cqe_req(req, -ECANCELED, 0);
- io_put_req_deferred(req);
+ io_req_task_queue_fail(req, -ECANCELED);
return 0;
}
@@ -6567,6 +6983,8 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return io_symlinkat_prep(req, sqe);
case IORING_OP_LINKAT:
return io_linkat_prep(req, sqe);
+ case IORING_OP_MSG_RING:
+ return io_msg_ring_prep(req, sqe);
}
printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
@@ -6648,7 +7066,7 @@ fail:
goto queue;
}
- trace_io_uring_defer(ctx, req, req->user_data);
+ trace_io_uring_defer(ctx, req, req->user_data, req->opcode);
de->req = req;
de->seq = seq;
list_add_tail(&de->list, &ctx->defer_list);
@@ -6658,7 +7076,7 @@ fail:
static void io_clean_op(struct io_kiocb *req)
{
if (req->flags & REQ_F_BUFFER_SELECTED)
- io_put_kbuf(req);
+ io_put_kbuf_comp(req);
if (req->flags & REQ_F_NEED_CLEANUP) {
switch (req->opcode) {
@@ -6708,6 +7126,10 @@ static void io_clean_op(struct io_kiocb *req)
putname(req->hardlink.oldpath);
putname(req->hardlink.newpath);
break;
+ case IORING_OP_STATX:
+ if (req->statx.filename)
+ putname(req->statx.filename);
+ break;
}
}
if ((req->flags & REQ_F_POLLED) && req->apoll) {
@@ -6850,6 +7272,9 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
case IORING_OP_LINKAT:
ret = io_linkat(req, issue_flags);
break;
+ case IORING_OP_MSG_RING:
+ ret = io_msg_ring(req, issue_flags);
+ break;
default:
ret = -EINVAL;
break;
@@ -6925,7 +7350,7 @@ static void io_wq_submit_work(struct io_wq_work *work)
continue;
}
- if (io_arm_poll_handler(req) == IO_APOLL_OK)
+ if (io_arm_poll_handler(req, issue_flags) == IO_APOLL_OK)
return;
/* aborted or ready, in either case retry blocking */
needs_poll = false;
@@ -6982,7 +7407,7 @@ static struct file *io_file_get_normal(struct io_ring_ctx *ctx,
{
struct file *file = fget(fd);
- trace_io_uring_file_get(ctx, fd);
+ trace_io_uring_file_get(ctx, req, req->user_data, fd);
/* we don't allow fixed io_uring files */
if (file && unlikely(file->f_op == &io_uring_fops))
@@ -7071,7 +7496,7 @@ static void io_queue_sqe_arm_apoll(struct io_kiocb *req)
{
struct io_kiocb *linked_timeout = io_prep_linked_timeout(req);
- switch (io_arm_poll_handler(req)) {
+ switch (io_arm_poll_handler(req, 0)) {
case IO_APOLL_READY:
io_req_task_queue(req);
break;
@@ -7080,8 +7505,12 @@ static void io_queue_sqe_arm_apoll(struct io_kiocb *req)
* Queued up for async execution, worker will release
* submit reference when the iocb is actually submitted.
*/
+ io_kbuf_recycle(req);
io_queue_async_work(req, NULL);
break;
+ case IO_APOLL_OK:
+ io_kbuf_recycle(req);
+ break;
}
if (linked_timeout)
@@ -7280,7 +7709,7 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
ret = io_init_req(ctx, req, sqe);
if (unlikely(ret)) {
- trace_io_uring_req_failed(sqe, ret);
+ trace_io_uring_req_failed(sqe, ctx, req, ret);
/* fail even hard links since we don't submit */
if (link->head) {
@@ -7307,7 +7736,7 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
}
/* don't need @sqe from now on */
- trace_io_uring_submit_sqe(ctx, req, req->opcode, req->user_data,
+ trace_io_uring_submit_sqe(ctx, req, req->user_data, req->opcode,
req->flags, true,
ctx->flags & IORING_SETUP_SQPOLL);
@@ -7450,8 +7879,14 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
}
/* will complete beyond this point, count as submitted */
submitted++;
- if (io_submit_sqe(ctx, req, sqe))
- break;
+ if (io_submit_sqe(ctx, req, sqe)) {
+ /*
+ * Continue submitting even for sqe failure if the
+ * ring was set up with IORING_SETUP_SUBMIT_ALL
+ */
+ if (!(ctx->flags & IORING_SETUP_SUBMIT_ALL))
+ break;
+ }
} while (submitted < nr);
if (unlikely(submitted != nr)) {
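With IORING_SETUP_SUBMIT_ALL, one SQE failing at submit no longer terminates the rest of the batch; the failure is still reported through that request's CQE. Requesting the behavior at setup time, assuming headers that define the new flag (added by this series):

#include <liburing.h>

static int make_ring(struct io_uring *ring)
{
	/* 64-entry ring; failed SQEs no longer stop the batch */
	return io_uring_queue_init(64, ring, IORING_SETUP_SUBMIT_ALL);
}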
@@ -7518,7 +7953,13 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
!(ctx->flags & IORING_SETUP_R_DISABLED))
ret = io_submit_sqes(ctx, to_submit);
mutex_unlock(&ctx->uring_lock);
-
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ spin_lock(&ctx->napi_lock);
+ if (!list_empty(&ctx->napi_list) &&
+ io_napi_busy_loop(&ctx->napi_list))
+ ++ret;
+ spin_unlock(&ctx->napi_lock);
+#endif
if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait))
wake_up(&ctx->sqo_sq_wait);
if (creds)
@@ -7649,6 +8090,9 @@ struct io_wait_queue {
struct io_ring_ctx *ctx;
unsigned cq_tail;
unsigned nr_timeouts;
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ unsigned busy_poll_to;
+#endif
};
static inline bool io_should_wake(struct io_wait_queue *iowq)
@@ -7683,17 +8127,17 @@ static int io_run_task_work_sig(void)
{
if (io_run_task_work())
return 1;
- if (!signal_pending(current))
- return 0;
if (test_thread_flag(TIF_NOTIFY_SIGNAL))
return -ERESTARTSYS;
- return -EINTR;
+ if (task_sigpending(current))
+ return -EINTR;
+ return 0;
}
/* when returns >0, the caller should retry */
static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
struct io_wait_queue *iowq,
- signed long *timeout)
+ ktime_t timeout)
{
int ret;
@@ -7705,10 +8149,92 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
if (test_bit(0, &ctx->check_cq_overflow))
return 1;
- *timeout = schedule_timeout(*timeout);
- return !*timeout ? -ETIME : 1;
+ if (!schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS))
+ return -ETIME;
+ return 1;
+}
+
+#ifdef CONFIG_NET_RX_BUSY_POLL
+static void io_adjust_busy_loop_timeout(struct timespec64 *ts,
+ struct io_wait_queue *iowq)
+{
+ unsigned busy_poll_to = READ_ONCE(sysctl_net_busy_poll);
+ struct timespec64 pollto = ns_to_timespec64(1000 * (s64)busy_poll_to);
+
+ if (timespec64_compare(ts, &pollto) > 0) {
+ *ts = timespec64_sub(*ts, pollto);
+ iowq->busy_poll_to = busy_poll_to;
+ } else {
+ u64 to = timespec64_to_ns(ts);
+
+ do_div(to, 1000);
+ iowq->busy_poll_to = to;
+ ts->tv_sec = 0;
+ ts->tv_nsec = 0;
+ }
}
+static inline bool io_busy_loop_timeout(unsigned long start_time,
+ unsigned long bp_usec)
+{
+ if (bp_usec) {
+ unsigned long end_time = start_time + bp_usec;
+ unsigned long now = busy_loop_current_time();
+
+ return time_after(now, end_time);
+ }
+ return true;
+}
+
+static bool io_busy_loop_end(void *p, unsigned long start_time)
+{
+ struct io_wait_queue *iowq = p;
+
+ return signal_pending(current) ||
+ io_should_wake(iowq) ||
+ io_busy_loop_timeout(start_time, iowq->busy_poll_to);
+}
+
+static void io_blocking_napi_busy_loop(struct list_head *napi_list,
+ struct io_wait_queue *iowq)
+{
+ unsigned long start_time =
+ list_is_singular(napi_list) ? 0 :
+ busy_loop_current_time();
+
+ do {
+ if (list_is_singular(napi_list)) {
+ struct napi_entry *ne =
+ list_first_entry(napi_list,
+ struct napi_entry, list);
+
+ napi_busy_loop(ne->napi_id, io_busy_loop_end, iowq,
+ true, BUSY_POLL_BUDGET);
+ io_check_napi_entry_timeout(ne);
+ break;
+ }
+ } while (io_napi_busy_loop(napi_list) &&
+ !io_busy_loop_end(iowq, start_time));
+}
+
+static void io_putback_napi_list(struct io_ring_ctx *ctx,
+ struct list_head *napi_list)
+{
+ struct napi_entry *cne, *lne;
+
+ spin_lock(&ctx->napi_lock);
+ list_for_each_entry(cne, &ctx->napi_list, list)
+ list_for_each_entry(lne, napi_list, list)
+ if (cne->napi_id == lne->napi_id) {
+ list_del(&lne->list);
+ kfree(lne);
+ break;
+ }
+ list_splice(napi_list, &ctx->napi_list);
+ spin_unlock(&ctx->napi_lock);
+}
+#endif /* CONFIG_NET_RX_BUSY_POLL */
+
/*
* Wait until events become available, if we don't already have some. The
* application must reap them itself, as they reside on the shared cq ring.
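io_adjust_busy_loop_timeout() above splits a caller-supplied wait into a busy-poll budget (at most sysctl_net_busy_poll microseconds) and a residual sleep; a short timeout is consumed entirely as budget. The same arithmetic as a standalone sketch (illustrative names):

#include <stdint.h>
#include <time.h>

/* Shrink *ts by the busy-poll budget and return the budget in usec;
 * if *ts is shorter than the budget, consume it all. */
static uint64_t split_timeout(struct timespec *ts, uint64_t busy_poll_usec)
{
	uint64_t want_ns = busy_poll_usec * 1000ULL;
	uint64_t have_ns = (uint64_t)ts->tv_sec * 1000000000ULL +
			   ts->tv_nsec;

	if (have_ns > want_ns) {
		have_ns -= want_ns;	/* sleep for the remainder */
		ts->tv_sec = have_ns / 1000000000ULL;
		ts->tv_nsec = have_ns % 1000000000ULL;
		return busy_poll_usec;
	}
	ts->tv_sec = 0;			/* whole timeout spent polling */
	ts->tv_nsec = 0;
	return have_ns / 1000;
}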
@@ -7719,8 +8245,11 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
{
struct io_wait_queue iowq;
struct io_rings *rings = ctx->rings;
- signed long timeout = MAX_SCHEDULE_TIMEOUT;
+ ktime_t timeout = KTIME_MAX;
int ret;
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ LIST_HEAD(local_napi_list);
+#endif
do {
io_cqring_overflow_flush(ctx);
@@ -7730,14 +8259,6 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
break;
} while (1);
- if (uts) {
- struct timespec64 ts;
-
- if (get_timespec64(&ts, uts))
- return -EFAULT;
- timeout = timespec64_to_jiffies(&ts);
- }
-
if (sig) {
#ifdef CONFIG_COMPAT
if (in_compat_syscall())
@@ -7751,6 +8272,30 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
return ret;
}
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ iowq.busy_poll_to = 0;
+ if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
+ spin_lock(&ctx->napi_lock);
+ list_splice_init(&ctx->napi_list, &local_napi_list);
+ spin_unlock(&ctx->napi_lock);
+ }
+#endif
+ if (uts) {
+ struct timespec64 ts;
+
+ if (get_timespec64(&ts, uts))
+ return -EFAULT;
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ if (!list_empty(&local_napi_list))
+ io_adjust_busy_loop_timeout(&ts, &iowq);
+#endif
+ timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns());
+ }
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ else if (!list_empty(&local_napi_list))
+ iowq.busy_poll_to = READ_ONCE(sysctl_net_busy_poll);
+#endif
+
init_waitqueue_func_entry(&iowq.wq, io_wake_function);
iowq.wq.private = current;
INIT_LIST_HEAD(&iowq.wq.entry);
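Note the timeout change feeding this path: the relative timespec is converted once into an absolute ktime deadline via ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns()), so looping back into io_cqring_wait_schedule() after a wakeup cannot stretch the total wait the way re-arming a relative timeout would. The same idea in a userspace sketch using TIMER_ABSTIME (a minimal sketch, not kernel code):

#include <time.h>

/* Compute the deadline once; retries after spurious wakeups reuse it. */
static int wait_until(const struct timespec *rel)
{
	struct timespec deadline;

	clock_gettime(CLOCK_MONOTONIC, &deadline);
	deadline.tv_sec += rel->tv_sec;
	deadline.tv_nsec += rel->tv_nsec;
	if (deadline.tv_nsec >= 1000000000L) {
		deadline.tv_sec++;
		deadline.tv_nsec -= 1000000000L;
	}
	return clock_nanosleep(CLOCK_MONOTONIC, TIMER_ABSTIME,
			       &deadline, NULL);
}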
@@ -7759,6 +8304,12 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;
trace_io_uring_cqring_wait(ctx, min_events);
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ if (iowq.busy_poll_to)
+ io_blocking_napi_busy_loop(&local_napi_list, &iowq);
+ if (!list_empty(&local_napi_list))
+ io_putback_napi_list(ctx, &local_napi_list);
+#endif
do {
/* if we can't even flush overflow, don't wait for more */
if (!io_cqring_overflow_flush(ctx)) {
@@ -7767,7 +8318,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
}
prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
TASK_INTERRUPTIBLE);
- ret = io_cqring_wait_schedule(ctx, &iowq, &timeout);
+ ret = io_cqring_wait_schedule(ctx, &iowq, timeout);
finish_wait(&ctx->cq_wait, &iowq.wq);
cond_resched();
} while (ret > 0);
@@ -7924,7 +8475,15 @@ static __cold int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
ret = wait_for_completion_interruptible(&data->done);
if (!ret) {
mutex_lock(&ctx->uring_lock);
- break;
+ if (atomic_read(&data->refs) > 0) {
+ /*
+ * it has been revived by another thread while
+ * we were unlocked
+ */
+ mutex_unlock(&ctx->uring_lock);
+ } else {
+ break;
+ }
}
atomic_inc(&data->refs);
@@ -8739,8 +9298,16 @@ static __cold int io_uring_alloc_task_context(struct task_struct *task,
if (unlikely(!tctx))
return -ENOMEM;
+ tctx->registered_rings = kcalloc(IO_RINGFD_REG_MAX,
+ sizeof(struct file *), GFP_KERNEL);
+ if (unlikely(!tctx->registered_rings)) {
+ kfree(tctx);
+ return -ENOMEM;
+ }
+
ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL);
if (unlikely(ret)) {
+ kfree(tctx->registered_rings);
kfree(tctx);
return ret;
}
@@ -8749,6 +9316,7 @@ static __cold int io_uring_alloc_task_context(struct task_struct *task,
if (IS_ERR(tctx->io_wq)) {
ret = PTR_ERR(tctx->io_wq);
percpu_counter_destroy(&tctx->inflight);
+ kfree(tctx->registered_rings);
kfree(tctx);
return ret;
}
@@ -8773,6 +9341,7 @@ void __io_uring_free(struct task_struct *tsk)
WARN_ON_ONCE(tctx->io_wq);
WARN_ON_ONCE(tctx->cached_refs);
+ kfree(tctx->registered_rings);
percpu_counter_destroy(&tctx->inflight);
kfree(tctx);
tsk->io_uring = NULL;
@@ -9349,33 +9918,55 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
return done ? done : err;
}
-static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
+static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
+ unsigned int eventfd_async)
{
+ struct io_ev_fd *ev_fd;
__s32 __user *fds = arg;
int fd;
- if (ctx->cq_ev_fd)
+ ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
+ lockdep_is_held(&ctx->uring_lock));
+ if (ev_fd)
return -EBUSY;
if (copy_from_user(&fd, fds, sizeof(*fds)))
return -EFAULT;
- ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
- if (IS_ERR(ctx->cq_ev_fd)) {
- int ret = PTR_ERR(ctx->cq_ev_fd);
+ ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
+ if (!ev_fd)
+ return -ENOMEM;
- ctx->cq_ev_fd = NULL;
+ ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
+ if (IS_ERR(ev_fd->cq_ev_fd)) {
+ int ret = PTR_ERR(ev_fd->cq_ev_fd);
+ kfree(ev_fd);
return ret;
}
-
+ ev_fd->eventfd_async = eventfd_async;
+ ctx->has_evfd = true;
+ rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
return 0;
}
+static void io_eventfd_put(struct rcu_head *rcu)
+{
+ struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);
+
+ eventfd_ctx_put(ev_fd->cq_ev_fd);
+ kfree(ev_fd);
+}
+
static int io_eventfd_unregister(struct io_ring_ctx *ctx)
{
- if (ctx->cq_ev_fd) {
- eventfd_ctx_put(ctx->cq_ev_fd);
- ctx->cq_ev_fd = NULL;
+ struct io_ev_fd *ev_fd;
+
+ ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
+ lockdep_is_held(&ctx->uring_lock));
+ if (ev_fd) {
+ ctx->has_evfd = false;
+ rcu_assign_pointer(ctx->io_ev_fd, NULL);
+ call_rcu(&ev_fd->rcu, io_eventfd_put);
return 0;
}
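The eventfd context now lives behind an RCU-protected pointer: the CQE-posting fast path dereferences it locklessly, while register/unregister publish or retire it under uring_lock and call_rcu() defers the free past in-flight readers. A rough userspace analogue of the publish/retire pattern, assuming liburcu is available (link with -lurcu; each thread must call rcu_register_thread() first):

#include <stdlib.h>
#include <urcu.h>

struct ev { int fd; };

static struct ev *cur_ev;

/* Fast path: lockless read under the RCU read lock. */
static void signal_ev(void)
{
	struct ev *e;

	rcu_read_lock();
	e = rcu_dereference(cur_ev);
	if (e) {
		/* ... write to e->fd ... */
	}
	rcu_read_unlock();
}

/* Slow path: unpublish, wait out readers, then free. The kernel
 * uses call_rcu() instead of blocking in synchronize_rcu(). */
static void unregister_ev(void)
{
	struct ev *e = cur_ev;

	rcu_assign_pointer(cur_ev, NULL);
	synchronize_rcu();
	free(e);
}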
@@ -9384,11 +9975,28 @@ static int io_eventfd_unregister(struct io_ring_ctx *ctx)
static void io_destroy_buffers(struct io_ring_ctx *ctx)
{
- struct io_buffer *buf;
- unsigned long index;
+ int i;
+
+ for (i = 0; i < (1U << IO_BUFFERS_HASH_BITS); i++) {
+ struct list_head *list = &ctx->io_buffers[i];
+
+ while (!list_empty(list)) {
+ struct io_buffer_list *bl;
+
+ bl = list_first_entry(list, struct io_buffer_list, list);
+ __io_remove_buffers(ctx, bl, -1U);
+ list_del(&bl->list);
+ kfree(bl);
+ }
+ }
+
+ while (!list_empty(&ctx->io_buffers_pages)) {
+ struct page *page;
- xa_for_each(&ctx->io_buffers, index, buf)
- __io_remove_buffers(ctx, buf, index, -1U);
+ page = list_first_entry(&ctx->io_buffers_pages, struct page, lru);
+ list_del_init(&page->lru);
+ __free_page(page);
+ }
}
static void io_req_caches_free(struct io_ring_ctx *ctx)
@@ -9419,6 +10027,18 @@ static void io_wait_rsrc_data(struct io_rsrc_data *data)
wait_for_completion(&data->done);
}
+static void io_flush_apoll_cache(struct io_ring_ctx *ctx)
+{
+ struct async_poll *apoll;
+
+ while (!list_empty(&ctx->apoll_cache)) {
+ apoll = list_first_entry(&ctx->apoll_cache, struct async_poll,
+ poll.wait.entry);
+ list_del(&apoll->poll.wait.entry);
+ kfree(apoll);
+ }
+}
+
static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
{
io_sq_thread_finish(ctx);
@@ -9440,8 +10060,9 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
__io_sqe_files_unregister(ctx);
if (ctx->rings)
__io_cqring_overflow_flush(ctx, true);
- mutex_unlock(&ctx->uring_lock);
io_eventfd_unregister(ctx);
+ io_flush_apoll_cache(ctx);
+ mutex_unlock(&ctx->uring_lock);
io_destroy_buffers(ctx);
if (ctx->sq_creds)
put_cred(ctx->sq_creds);
@@ -9473,8 +10094,10 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
io_req_caches_free(ctx);
if (ctx->hash_map)
io_wq_put_hash(ctx->hash_map);
+ io_free_napi_list(ctx);
kfree(ctx->cancel_hash);
kfree(ctx->dummy_ubuf);
+ kfree(ctx->io_buffers);
kfree(ctx);
}
@@ -9973,6 +10596,139 @@ void __io_uring_cancel(bool cancel_all)
io_uring_cancel_generic(cancel_all, NULL);
}
+void io_uring_unreg_ringfd(void)
+{
+ struct io_uring_task *tctx = current->io_uring;
+ int i;
+
+ for (i = 0; i < IO_RINGFD_REG_MAX; i++) {
+ if (tctx->registered_rings[i]) {
+ fput(tctx->registered_rings[i]);
+ tctx->registered_rings[i] = NULL;
+ }
+ }
+}
+
+static int io_ring_add_registered_fd(struct io_uring_task *tctx, int fd,
+ int start, int end)
+{
+ struct file *file;
+ int offset;
+
+ for (offset = start; offset < end; offset++) {
+ offset = array_index_nospec(offset, IO_RINGFD_REG_MAX);
+ if (tctx->registered_rings[offset])
+ continue;
+
+ file = fget(fd);
+ if (!file) {
+ return -EBADF;
+ } else if (file->f_op != &io_uring_fops) {
+ fput(file);
+ return -EOPNOTSUPP;
+ }
+ tctx->registered_rings[offset] = file;
+ return offset;
+ }
+
+ return -EBUSY;
+}
+
+/*
+ * Register a ring fd to avoid fdget/fdput for each io_uring_enter()
+ * invocation. User passes in an array of struct io_uring_rsrc_update
+ * with ->data set to the ring_fd, and ->offset given for the desired
+ * index. If no index is desired, application may set ->offset == -1U
+ * and we'll find an available index. Returns number of entries
+ * successfully processed, or < 0 on error if none were processed.
+ */
+static int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg,
+ unsigned nr_args)
+{
+ struct io_uring_rsrc_update __user *arg = __arg;
+ struct io_uring_rsrc_update reg;
+ struct io_uring_task *tctx;
+ int ret, i;
+
+ if (!nr_args || nr_args > IO_RINGFD_REG_MAX)
+ return -EINVAL;
+
+ mutex_unlock(&ctx->uring_lock);
+ ret = io_uring_add_tctx_node(ctx);
+ mutex_lock(&ctx->uring_lock);
+ if (ret)
+ return ret;
+
+ tctx = current->io_uring;
+ for (i = 0; i < nr_args; i++) {
+ int start, end;
+
+ if (copy_from_user(&reg, &arg[i], sizeof(reg))) {
+ ret = -EFAULT;
+ break;
+ }
+
+ if (reg.offset == -1U) {
+ start = 0;
+ end = IO_RINGFD_REG_MAX;
+ } else {
+ if (reg.offset >= IO_RINGFD_REG_MAX) {
+ ret = -EINVAL;
+ break;
+ }
+ start = reg.offset;
+ end = start + 1;
+ }
+
+ ret = io_ring_add_registered_fd(tctx, reg.data, start, end);
+ if (ret < 0)
+ break;
+
+ reg.offset = ret;
+ if (copy_to_user(&arg[i], &reg, sizeof(reg))) {
+ fput(tctx->registered_rings[reg.offset]);
+ tctx->registered_rings[reg.offset] = NULL;
+ ret = -EFAULT;
+ break;
+ }
+ }
+
+ return i ? i : ret;
+}
+
+static int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg,
+ unsigned nr_args)
+{
+ struct io_uring_rsrc_update __user *arg = __arg;
+ struct io_uring_task *tctx = current->io_uring;
+ struct io_uring_rsrc_update reg;
+ int ret = 0, i;
+
+ if (!nr_args || nr_args > IO_RINGFD_REG_MAX)
+ return -EINVAL;
+ if (!tctx)
+ return 0;
+
+ for (i = 0; i < nr_args; i++) {
+ if (copy_from_user(&reg, &arg[i], sizeof(reg))) {
+ ret = -EFAULT;
+ break;
+ }
+ if (reg.offset >= IO_RINGFD_REG_MAX) {
+ ret = -EINVAL;
+ break;
+ }
+
+ reg.offset = array_index_nospec(reg.offset, IO_RINGFD_REG_MAX);
+ if (tctx->registered_rings[reg.offset]) {
+ fput(tctx->registered_rings[reg.offset]);
+ tctx->registered_rings[reg.offset] = NULL;
+ }
+ }
+
+ return i ? i : ret;
+}
+
static void *io_uring_validate_mmap_request(struct file *file,
loff_t pgoff, size_t sz)
{
@@ -10103,12 +10859,28 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
io_run_task_work();
if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
- IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG)))
+ IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG |
+ IORING_ENTER_REGISTERED_RING)))
return -EINVAL;
- f = fdget(fd);
- if (unlikely(!f.file))
- return -EBADF;
+ /*
+ * Ring fd has been registered via IORING_REGISTER_RING_FDS, so we
+ * need only dereference our task-private array to find it.
+ */
+ if (flags & IORING_ENTER_REGISTERED_RING) {
+ struct io_uring_task *tctx = current->io_uring;
+
+ if (!tctx || fd >= IO_RINGFD_REG_MAX)
+ return -EINVAL;
+ fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
+ f.file = tctx->registered_rings[fd];
+ if (unlikely(!f.file))
+ return -EBADF;
+ } else {
+ f = fdget(fd);
+ if (unlikely(!f.file))
+ return -EBADF;
+ }
ret = -EOPNOTSUPP;
if (unlikely(f.file->f_op != &io_uring_fops))
@@ -10182,7 +10954,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
out:
percpu_ref_put(&ctx->refs);
out_fput:
- fdput(f);
+ if (!(flags & IORING_ENTER_REGISTERED_RING))
+ fdput(f);
return submitted ? submitted : ret;
}
@@ -10600,7 +11373,7 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
- IORING_SETUP_R_DISABLED))
+ IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL))
return -EINVAL;
return io_uring_create(entries, &p, params);
@@ -10950,61 +11723,6 @@ err:
return ret;
}
-static bool io_register_op_must_quiesce(int op)
-{
- switch (op) {
- case IORING_REGISTER_BUFFERS:
- case IORING_UNREGISTER_BUFFERS:
- case IORING_REGISTER_FILES:
- case IORING_UNREGISTER_FILES:
- case IORING_REGISTER_FILES_UPDATE:
- case IORING_REGISTER_PROBE:
- case IORING_REGISTER_PERSONALITY:
- case IORING_UNREGISTER_PERSONALITY:
- case IORING_REGISTER_FILES2:
- case IORING_REGISTER_FILES_UPDATE2:
- case IORING_REGISTER_BUFFERS2:
- case IORING_REGISTER_BUFFERS_UPDATE:
- case IORING_REGISTER_IOWQ_AFF:
- case IORING_UNREGISTER_IOWQ_AFF:
- case IORING_REGISTER_IOWQ_MAX_WORKERS:
- return false;
- default:
- return true;
- }
-}
-
-static __cold int io_ctx_quiesce(struct io_ring_ctx *ctx)
-{
- long ret;
-
- percpu_ref_kill(&ctx->refs);
-
- /*
- * Drop uring mutex before waiting for references to exit. If another
- * thread is currently inside io_uring_enter() it might need to grab the
- * uring_lock to make progress. If we hold it here across the drain
- * wait, then we can deadlock. It's safe to drop the mutex here, since
- * no new references will come in after we've killed the percpu ref.
- */
- mutex_unlock(&ctx->uring_lock);
- do {
- ret = wait_for_completion_interruptible_timeout(&ctx->ref_comp, HZ);
- if (ret) {
- ret = min(0L, ret);
- break;
- }
-
- ret = io_run_task_work_sig();
- io_req_caches_free(ctx);
- } while (ret >= 0);
- mutex_lock(&ctx->uring_lock);
-
- if (ret)
- io_refs_resurrect(&ctx->refs, &ctx->ref_comp);
- return ret;
-}
-
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
void __user *arg, unsigned nr_args)
__releases(ctx->uring_lock)
@@ -11028,12 +11746,6 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
return -EACCES;
}
- if (io_register_op_must_quiesce(opcode)) {
- ret = io_ctx_quiesce(ctx);
- if (ret)
- return ret;
- }
-
switch (opcode) {
case IORING_REGISTER_BUFFERS:
ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
@@ -11057,17 +11769,16 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
ret = io_register_files_update(ctx, arg, nr_args);
break;
case IORING_REGISTER_EVENTFD:
- case IORING_REGISTER_EVENTFD_ASYNC:
ret = -EINVAL;
if (nr_args != 1)
break;
- ret = io_eventfd_register(ctx, arg);
- if (ret)
+ ret = io_eventfd_register(ctx, arg, 0);
+ break;
+ case IORING_REGISTER_EVENTFD_ASYNC:
+ ret = -EINVAL;
+ if (nr_args != 1)
break;
- if (opcode == IORING_REGISTER_EVENTFD_ASYNC)
- ctx->eventfd_async = 1;
- else
- ctx->eventfd_async = 0;
+ ret = io_eventfd_register(ctx, arg, 1);
break;
case IORING_UNREGISTER_EVENTFD:
ret = -EINVAL;
@@ -11134,16 +11845,17 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
break;
ret = io_register_iowq_max_workers(ctx, arg);
break;
+ case IORING_REGISTER_RING_FDS:
+ ret = io_ringfd_register(ctx, arg, nr_args);
+ break;
+ case IORING_UNREGISTER_RING_FDS:
+ ret = io_ringfd_unregister(ctx, arg, nr_args);
+ break;
default:
ret = -EINVAL;
break;
}
- if (io_register_op_must_quiesce(opcode)) {
- /* bring the ctx back to life */
- percpu_ref_reinit(&ctx->refs);
- reinit_completion(&ctx->ref_comp);
- }
return ret;
}
@@ -11169,8 +11881,7 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
mutex_lock(&ctx->uring_lock);
ret = __io_uring_register(ctx, opcode, arg, nr_args);
mutex_unlock(&ctx->uring_lock);
- trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs,
- ctx->cq_ev_fd != NULL, ret);
+ trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
out_fput:
fdput(f);
return ret;
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 6c51a75d0be6..4653f3d07a1d 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -292,19 +292,20 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
if (ctx->rac) /* same as readahead_gfp_mask */
gfp |= __GFP_NORETRY | __GFP_NOWARN;
- ctx->bio = bio_alloc(gfp, bio_max_segs(nr_vecs));
+ ctx->bio = bio_alloc(iomap->bdev, bio_max_segs(nr_vecs),
+ REQ_OP_READ, gfp);
/*
* If the bio_alloc fails, try it again for a single page to
* avoid having to deal with partial page reads. This emulates
* what do_mpage_readpage does.
*/
- if (!ctx->bio)
- ctx->bio = bio_alloc(orig_gfp, 1);
- ctx->bio->bi_opf = REQ_OP_READ;
+ if (!ctx->bio) {
+ ctx->bio = bio_alloc(iomap->bdev, 1, REQ_OP_READ,
+ orig_gfp);
+ }
if (ctx->rac)
ctx->bio->bi_opf |= REQ_RAHEAD;
ctx->bio->bi_iter.bi_sector = sector;
- bio_set_dev(ctx->bio, iomap->bdev);
ctx->bio->bi_end_io = iomap_read_end_io;
bio_add_folio(ctx->bio, folio, plen, poff);
}
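This hunk (and the jfs and mpage conversions below) tracks the block-layer API change that moves the target bdev and the op/flags into bio_alloc()/bio_init() themselves, retiring the separate bio_set_dev() and bi_opf assignments. A minimal kernel-style sketch of the new calling convention (builds only in-kernel; the helper is illustrative, not part of this patch):

#include <linux/bio.h>

static struct bio *read_one_page(struct block_device *bdev,
				 struct page *page, sector_t sector)
{
	/* device and REQ_OP_READ are now fixed at allocation time;
	 * with a sleeping gfp mask this allocation does not fail */
	struct bio *bio = bio_alloc(bdev, 1, REQ_OP_READ, GFP_NOFS);

	bio->bi_iter.bi_sector = sector;
	if (bio_add_page(bio, page, PAGE_SIZE, 0) != PAGE_SIZE) {
		bio_put(bio);
		return NULL;
	}
	return bio;
}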
@@ -550,10 +551,8 @@ static int iomap_read_folio_sync(loff_t block_start, struct folio *folio,
struct bio_vec bvec;
struct bio bio;
- bio_init(&bio, &bvec, 1);
- bio.bi_opf = REQ_OP_READ;
+ bio_init(&bio, iomap->bdev, &bvec, 1, REQ_OP_READ);
bio.bi_iter.bi_sector = iomap_sector(iomap, block_start);
- bio_set_dev(&bio, iomap->bdev);
bio_add_folio(&bio, folio, plen, poff);
return submit_bio_wait(&bio);
}
@@ -1229,10 +1228,10 @@ iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc,
struct iomap_ioend *ioend;
struct bio *bio;
- bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_VECS, &iomap_ioend_bioset);
- bio_set_dev(bio, wpc->iomap.bdev);
+ bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS,
+ REQ_OP_WRITE | wbc_to_write_flags(wbc),
+ GFP_NOFS, &iomap_ioend_bioset);
bio->bi_iter.bi_sector = sector;
- bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
bio->bi_write_hint = inode->i_write_hint;
wbc_init_bio(wbc, bio);
@@ -1261,10 +1260,9 @@ iomap_chain_bio(struct bio *prev)
{
struct bio *new;
- new = bio_alloc(GFP_NOFS, BIO_MAX_VECS);
- bio_copy_dev(new, prev);/* also copies over blkcg information */
+ new = bio_alloc(prev->bi_bdev, BIO_MAX_VECS, prev->bi_opf, GFP_NOFS);
+ bio_clone_blkg_association(new, prev);
new->bi_iter.bi_sector = bio_end_sector(prev);
- new->bi_opf = prev->bi_opf;
new->bi_write_hint = prev->bi_write_hint;
bio_chain(prev, new);
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 03ea367df19a..67cf9c16f80c 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -6,6 +6,7 @@
#include <linux/module.h>
#include <linux/compiler.h>
#include <linux/fs.h>
+#include <linux/fscrypt.h>
#include <linux/pagemap.h>
#include <linux/iomap.h>
#include <linux/backing-dev.h>
@@ -179,19 +180,20 @@ static void iomap_dio_bio_end_io(struct bio *bio)
static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
loff_t pos, unsigned len)
{
+ struct inode *inode = file_inode(dio->iocb->ki_filp);
struct page *page = ZERO_PAGE(0);
int flags = REQ_SYNC | REQ_IDLE;
struct bio *bio;
- bio = bio_alloc(GFP_KERNEL, 1);
- bio_set_dev(bio, iter->iomap.bdev);
+ bio = bio_alloc(iter->iomap.bdev, 1, REQ_OP_WRITE | flags, GFP_KERNEL);
+ fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
+ GFP_KERNEL);
bio->bi_iter.bi_sector = iomap_sector(&iter->iomap, pos);
bio->bi_private = dio;
bio->bi_end_io = iomap_dio_bio_end_io;
get_page(page);
__bio_add_page(bio, page, len, 0);
- bio_set_op_attrs(bio, REQ_OP_WRITE, flags);
iomap_dio_submit_bio(iter, dio, bio, pos);
}
@@ -309,14 +311,14 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
goto out;
}
- bio = bio_alloc(GFP_KERNEL, nr_pages);
- bio_set_dev(bio, iomap->bdev);
+ bio = bio_alloc(iomap->bdev, nr_pages, bio_opf, GFP_KERNEL);
+ fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
+ GFP_KERNEL);
bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
bio->bi_write_hint = dio->iocb->ki_hint;
bio->bi_ioprio = dio->iocb->ki_ioprio;
bio->bi_private = dio;
bio->bi_end_io = iomap_dio_bio_end_io;
- bio->bi_opf = bio_opf;
ret = bio_iov_iter_get_pages(bio, dio->submit.iter);
if (unlikely(ret)) {
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 78fd136ac13b..997c81fcea34 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -1980,17 +1980,13 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp)
bp->l_flag |= lbmREAD;
- bio = bio_alloc(GFP_NOFS, 1);
-
+ bio = bio_alloc(log->bdev, 1, REQ_OP_READ, GFP_NOFS);
bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9);
- bio_set_dev(bio, log->bdev);
-
bio_add_page(bio, bp->l_page, LOGPSIZE, bp->l_offset);
BUG_ON(bio->bi_iter.bi_size != LOGPSIZE);
bio->bi_end_io = lbmIODone;
bio->bi_private = bp;
- bio->bi_opf = REQ_OP_READ;
/* check if journaling to disk has been disabled */
if (log->no_integrity) {
bio->bi_iter.bi_size = 0;
@@ -2125,16 +2121,13 @@ static void lbmStartIO(struct lbuf * bp)
jfs_info("lbmStartIO");
- bio = bio_alloc(GFP_NOFS, 1);
+ bio = bio_alloc(log->bdev, 1, REQ_OP_WRITE | REQ_SYNC, GFP_NOFS);
bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9);
- bio_set_dev(bio, log->bdev);
-
bio_add_page(bio, bp->l_page, LOGPSIZE, bp->l_offset);
BUG_ON(bio->bi_iter.bi_size != LOGPSIZE);
bio->bi_end_io = lbmIODone;
bio->bi_private = bp;
- bio->bi_opf = REQ_OP_WRITE | REQ_SYNC;
/* check if journaling to disk has been disabled */
if (log->no_integrity) {
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 104ae698443e..fde1a9cf902e 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -417,12 +417,10 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc)
}
len = min(xlen, (int)JFS_SBI(inode->i_sb)->nbperpage);
- bio = bio_alloc(GFP_NOFS, 1);
- bio_set_dev(bio, inode->i_sb->s_bdev);
+ bio = bio_alloc(inode->i_sb->s_bdev, 1, REQ_OP_WRITE, GFP_NOFS);
bio->bi_iter.bi_sector = pblock << (inode->i_blkbits - 9);
bio->bi_end_io = metapage_write_end_io;
bio->bi_private = page;
- bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
/* Don't call bio_add_page yet, we may add to this vec */
bio_offset = offset;
@@ -497,13 +495,12 @@ static int metapage_readpage(struct file *fp, struct page *page)
if (bio)
submit_bio(bio);
- bio = bio_alloc(GFP_NOFS, 1);
- bio_set_dev(bio, inode->i_sb->s_bdev);
+ bio = bio_alloc(inode->i_sb->s_bdev, 1, REQ_OP_READ,
+ GFP_NOFS);
bio->bi_iter.bi_sector =
pblock << (inode->i_blkbits - 9);
bio->bi_end_io = metapage_read_end_io;
bio->bi_private = page;
- bio_set_op_attrs(bio, REQ_OP_READ, 0);
len = xlen << inode->i_blkbits;
offset = block_offset << inode->i_blkbits;
if (bio_add_page(bio, page, len, offset) < len)
diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c
index 19d36393974c..9cebb6ba555b 100644
--- a/fs/ksmbd/vfs.c
+++ b/fs/ksmbd/vfs.c
@@ -11,7 +11,6 @@
#include <linux/writeback.h>
#include <linux/xattr.h>
#include <linux/falloc.h>
-#include <linux/genhd.h>
#include <linux/fsnotify.h>
#include <linux/dcache.h>
#include <linux/slab.h>
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 0475c5a5d061..59ef8a1f843f 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -184,8 +184,7 @@ lockd(void *vrqstp)
dprintk("lockd_down: service stopped\n");
svc_exit_thread(rqstp);
-
- module_put_and_kthread_exit(0);
+ return 0;
}
static int create_lockd_listener(struct svc_serv *serv, const char *name,
@@ -197,8 +196,8 @@ static int create_lockd_listener(struct svc_serv *serv, const char *name,
xprt = svc_find_xprt(serv, name, net, family, 0);
if (xprt == NULL)
- return svc_create_xprt(serv, name, net, family, port,
- SVC_SOCK_DEFAULTS, cred);
+ return svc_xprt_create(serv, name, net, family, port,
+ SVC_SOCK_DEFAULTS, cred);
svc_xprt_put(xprt);
return 0;
}
@@ -248,7 +247,8 @@ out_err:
if (warned++ == 0)
printk(KERN_WARNING
"lockd_up: makesock failed, error=%d\n", err);
- svc_shutdown_net(serv, net);
+ svc_xprt_destroy_all(serv, net);
+ svc_rpcb_cleanup(serv, net);
return err;
}
@@ -286,9 +286,8 @@ static void lockd_down_net(struct svc_serv *serv, struct net *net)
nlm_shutdown_hosts_net(net);
cancel_delayed_work_sync(&ln->grace_period_end);
locks_end_grace(&ln->lockd_manager);
- svc_shutdown_net(serv, net);
- dprintk("%s: per-net data destroyed; net=%x\n",
- __func__, net->ns.inum);
+ svc_xprt_destroy_all(serv, net);
+ svc_rpcb_cleanup(serv, net);
}
} else {
pr_err("%s: no users! net=%x\n",
@@ -350,13 +349,6 @@ static struct notifier_block lockd_inet6addr_notifier = {
};
#endif
-static const struct svc_serv_ops lockd_sv_ops = {
- .svo_shutdown = svc_rpcb_cleanup,
- .svo_function = lockd,
- .svo_enqueue_xprt = svc_xprt_do_enqueue,
- .svo_module = THIS_MODULE,
-};
-
static int lockd_get(void)
{
struct svc_serv *serv;
@@ -380,7 +372,7 @@ static int lockd_get(void)
nlm_timeout = LOCKD_DFLT_TIMEO;
nlmsvc_timeout = nlm_timeout * HZ;
- serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, &lockd_sv_ops);
+ serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, lockd);
if (!serv) {
printk(KERN_WARNING "lockd_up: create service failed\n");
return -ENOMEM;
diff --git a/fs/mpage.c b/fs/mpage.c
index 87f5cfef6caa..6c4b810a21d0 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -57,38 +57,14 @@ static void mpage_end_io(struct bio *bio)
bio_put(bio);
}
-static struct bio *mpage_bio_submit(int op, int op_flags, struct bio *bio)
+static struct bio *mpage_bio_submit(struct bio *bio)
{
bio->bi_end_io = mpage_end_io;
- bio_set_op_attrs(bio, op, op_flags);
guard_bio_eod(bio);
submit_bio(bio);
return NULL;
}
-static struct bio *
-mpage_alloc(struct block_device *bdev,
- sector_t first_sector, int nr_vecs,
- gfp_t gfp_flags)
-{
- struct bio *bio;
-
- /* Restrict the given (page cache) mask for slab allocations */
- gfp_flags &= GFP_KERNEL;
- bio = bio_alloc(gfp_flags, nr_vecs);
-
- if (bio == NULL && (current->flags & PF_MEMALLOC)) {
- while (!bio && (nr_vecs /= 2))
- bio = bio_alloc(gfp_flags, nr_vecs);
- }
-
- if (bio) {
- bio_set_dev(bio, bdev);
- bio->bi_iter.bi_sector = first_sector;
- }
- return bio;
-}
-
/*
* support function for mpage_readahead. The fs supplied get_block might
* return an up to date buffer. This is used to map that buffer into
@@ -169,16 +145,15 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
struct block_device *bdev = NULL;
int length;
int fully_mapped = 1;
- int op_flags;
+ int op = REQ_OP_READ;
unsigned nblocks;
unsigned relative_block;
gfp_t gfp;
if (args->is_readahead) {
- op_flags = REQ_RAHEAD;
+ op |= REQ_RAHEAD;
gfp = readahead_gfp_mask(page->mapping);
} else {
- op_flags = 0;
gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL);
}
@@ -287,7 +262,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
* This page will go to BIO. Do we need to send this BIO off first?
*/
if (args->bio && (args->last_block_in_bio != blocks[0] - 1))
- args->bio = mpage_bio_submit(REQ_OP_READ, op_flags, args->bio);
+ args->bio = mpage_bio_submit(args->bio);
alloc_new:
if (args->bio == NULL) {
@@ -296,15 +271,16 @@ alloc_new:
page))
goto out;
}
- args->bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
- bio_max_segs(args->nr_pages), gfp);
+ args->bio = bio_alloc(bdev, bio_max_segs(args->nr_pages), op,
+ gfp);
if (args->bio == NULL)
goto confused;
+ args->bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9);
}
length = first_hole << blkbits;
if (bio_add_page(args->bio, page, length, 0) < length) {
- args->bio = mpage_bio_submit(REQ_OP_READ, op_flags, args->bio);
+ args->bio = mpage_bio_submit(args->bio);
goto alloc_new;
}
@@ -312,7 +288,7 @@ alloc_new:
nblocks = map_bh->b_size >> blkbits;
if ((buffer_boundary(map_bh) && relative_block == nblocks) ||
(first_hole != blocks_per_page))
- args->bio = mpage_bio_submit(REQ_OP_READ, op_flags, args->bio);
+ args->bio = mpage_bio_submit(args->bio);
else
args->last_block_in_bio = blocks[blocks_per_page - 1];
out:
@@ -320,7 +296,7 @@ out:
confused:
if (args->bio)
- args->bio = mpage_bio_submit(REQ_OP_READ, op_flags, args->bio);
+ args->bio = mpage_bio_submit(args->bio);
if (!PageUptodate(page))
block_read_full_page(page, args->get_block);
else
@@ -383,7 +359,7 @@ void mpage_readahead(struct readahead_control *rac, get_block_t get_block)
put_page(page);
}
if (args.bio)
- mpage_bio_submit(REQ_OP_READ, REQ_RAHEAD, args.bio);
+ mpage_bio_submit(args.bio);
}
EXPORT_SYMBOL(mpage_readahead);
@@ -400,7 +376,7 @@ int mpage_readpage(struct page *page, get_block_t get_block)
args.bio = do_mpage_readpage(&args);
if (args.bio)
- mpage_bio_submit(REQ_OP_READ, 0, args.bio);
+ mpage_bio_submit(args.bio);
return 0;
}
EXPORT_SYMBOL(mpage_readpage);
@@ -491,7 +467,6 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
struct buffer_head map_bh;
loff_t i_size = i_size_read(inode);
int ret = 0;
- int op_flags = wbc_to_write_flags(wbc);
if (page_has_buffers(page)) {
struct buffer_head *head = page_buffers(page);
@@ -599,7 +574,7 @@ page_is_mapped:
* This page will go to BIO. Do we need to send this BIO off first?
*/
if (bio && mpd->last_block_in_bio != blocks[0] - 1)
- bio = mpage_bio_submit(REQ_OP_WRITE, op_flags, bio);
+ bio = mpage_bio_submit(bio);
alloc_new:
if (bio == NULL) {
@@ -608,11 +583,10 @@ alloc_new:
page, wbc))
goto out;
}
- bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
- BIO_MAX_VECS, GFP_NOFS|__GFP_HIGH);
- if (bio == NULL)
- goto confused;
-
+ bio = bio_alloc(bdev, BIO_MAX_VECS,
+ REQ_OP_WRITE | wbc_to_write_flags(wbc),
+ GFP_NOFS);
+ bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9);
wbc_init_bio(wbc, bio);
bio->bi_write_hint = inode->i_write_hint;
}
@@ -625,7 +599,7 @@ alloc_new:
wbc_account_cgroup_owner(wbc, page, PAGE_SIZE);
length = first_unmapped << blkbits;
if (bio_add_page(bio, page, length, 0) < length) {
- bio = mpage_bio_submit(REQ_OP_WRITE, op_flags, bio);
+ bio = mpage_bio_submit(bio);
goto alloc_new;
}
@@ -635,7 +609,7 @@ alloc_new:
set_page_writeback(page);
unlock_page(page);
if (boundary || (first_unmapped != blocks_per_page)) {
- bio = mpage_bio_submit(REQ_OP_WRITE, op_flags, bio);
+ bio = mpage_bio_submit(bio);
if (boundary_block) {
write_boundary_block(boundary_bdev,
boundary_block, 1 << blkbits);
@@ -647,7 +621,7 @@ alloc_new:
confused:
if (bio)
- bio = mpage_bio_submit(REQ_OP_WRITE, op_flags, bio);
+ bio = mpage_bio_submit(bio);
if (mpd->use_writepage) {
ret = mapping->a_ops->writepage(page, wbc);
@@ -703,11 +677,8 @@ mpage_writepages(struct address_space *mapping,
};
ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd);
- if (mpd.bio) {
- int op_flags = (wbc->sync_mode == WB_SYNC_ALL ?
- REQ_SYNC : 0);
- mpage_bio_submit(REQ_OP_WRITE, op_flags, mpd.bio);
- }
+ if (mpd.bio)
+ mpage_bio_submit(mpd.bio);
}
blk_finish_plug(&plug);
return ret;
@@ -724,11 +695,8 @@ int mpage_writepage(struct page *page, get_block_t get_block,
.use_writepage = 0,
};
int ret = __mpage_writepage(page, wbc, &mpd);
- if (mpd.bio) {
- int op_flags = (wbc->sync_mode == WB_SYNC_ALL ?
- REQ_SYNC : 0);
- mpage_bio_submit(REQ_OP_WRITE, op_flags, mpd.bio);
- }
+ if (mpd.bio)
+ mpage_bio_submit(mpd.bio);
return ret;
}
EXPORT_SYMBOL(mpage_writepage);
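All of the mpage conversions above follow the same tree-wide calling convention: the block device and request operation are handed to bio_alloc() up front instead of being patched in afterwards with bio_set_dev() and bio_set_op_attrs(). A hedged sketch of that pattern (function name is hypothetical):

#include <linux/bio.h>

/* Sketch: a fully described single-page read bio in one allocation. */
static struct bio *example_read_bio(struct block_device *bdev,
				    sector_t sector, struct page *page)
{
	/* bio_alloc() cannot fail when the gfp mask allows direct reclaim. */
	struct bio *bio = bio_alloc(bdev, 1, REQ_OP_READ, GFP_NOIO);

	bio->bi_iter.bi_sector = sector;
	bio_add_page(bio, page, PAGE_SIZE, 0);	/* always fits: 1 vec, 1 page */
	return bio;
}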
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index fe860c538747..79a8b451791f 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -115,23 +115,6 @@ bl_submit_bio(struct bio *bio)
return NULL;
}
-static struct bio *bl_alloc_init_bio(unsigned int npg,
- struct block_device *bdev, sector_t disk_sector,
- bio_end_io_t end_io, struct parallel_io *par)
-{
- struct bio *bio;
-
- npg = bio_max_segs(npg);
- bio = bio_alloc(GFP_NOIO, npg);
- if (bio) {
- bio->bi_iter.bi_sector = disk_sector;
- bio_set_dev(bio, bdev);
- bio->bi_end_io = end_io;
- bio->bi_private = par;
- }
- return bio;
-}
-
static bool offset_in_map(u64 offset, struct pnfs_block_dev_map *map)
{
return offset >= map->start && offset < map->start + map->len;
@@ -171,11 +154,10 @@ do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect,
retry:
if (!bio) {
- bio = bl_alloc_init_bio(npg, map->bdev,
- disk_addr >> SECTOR_SHIFT, end_io, par);
- if (!bio)
- return ERR_PTR(-ENOMEM);
- bio_set_op_attrs(bio, rw, 0);
+ bio = bio_alloc(map->bdev, bio_max_segs(npg), rw, GFP_NOIO);
+ bio->bi_iter.bi_sector = disk_addr >> SECTOR_SHIFT;
+ bio->bi_end_io = end_io;
+ bio->bi_private = par;
}
if (bio_add_page(bio, page, *len, offset) < *len) {
bio = bl_submit_bio(bio);
diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c
index ef9db135c649..6c977288cc28 100644
--- a/fs/nfs/blocklayout/rpc_pipefs.c
+++ b/fs/nfs/blocklayout/rpc_pipefs.c
@@ -27,7 +27,6 @@
*/
#include <linux/module.h>
-#include <linux/genhd.h>
#include <linux/blkdev.h>
#include "blocklayout.h"
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 054cc1255fac..456af7d230cf 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -17,7 +17,6 @@
#include <linux/errno.h>
#include <linux/mutex.h>
#include <linux/freezer.h>
-#include <linux/kthread.h>
#include <linux/sunrpc/svcauth_gss.h>
#include <linux/sunrpc/bc_xprt.h>
@@ -45,18 +44,18 @@ static int nfs4_callback_up_net(struct svc_serv *serv, struct net *net)
int ret;
struct nfs_net *nn = net_generic(net, nfs_net_id);
- ret = svc_create_xprt(serv, "tcp", net, PF_INET,
- nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS,
- cred);
+ ret = svc_xprt_create(serv, "tcp", net, PF_INET,
+ nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS,
+ cred);
if (ret <= 0)
goto out_err;
nn->nfs_callback_tcpport = ret;
dprintk("NFS: Callback listener port = %u (af %u, net %x)\n",
nn->nfs_callback_tcpport, PF_INET, net->ns.inum);
- ret = svc_create_xprt(serv, "tcp", net, PF_INET6,
- nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS,
- cred);
+ ret = svc_xprt_create(serv, "tcp", net, PF_INET6,
+ nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS,
+ cred);
if (ret > 0) {
nn->nfs_callback_tcpport6 = ret;
dprintk("NFS: Callback listener port = %u (af %u, net %x)\n",
@@ -92,8 +91,8 @@ nfs4_callback_svc(void *vrqstp)
continue;
svc_process(rqstp);
}
+
svc_exit_thread(rqstp);
- module_put_and_kthread_exit(0);
return 0;
}
@@ -136,8 +135,8 @@ nfs41_callback_svc(void *vrqstp)
finish_wait(&serv->sv_cb_waitq, &wq);
}
}
+
svc_exit_thread(rqstp);
- module_put_and_kthread_exit(0);
return 0;
}
@@ -189,7 +188,7 @@ static void nfs_callback_down_net(u32 minorversion, struct svc_serv *serv, struc
return;
dprintk("NFS: destroy per-net callback data; net=%x\n", net->ns.inum);
- svc_shutdown_net(serv, net);
+ svc_xprt_destroy_all(serv, net);
}
static int nfs_callback_up_net(int minorversion, struct svc_serv *serv,
@@ -232,33 +231,10 @@ err_bind:
return ret;
}
-static const struct svc_serv_ops nfs40_cb_sv_ops = {
- .svo_function = nfs4_callback_svc,
- .svo_enqueue_xprt = svc_xprt_do_enqueue,
- .svo_module = THIS_MODULE,
-};
-#if defined(CONFIG_NFS_V4_1)
-static const struct svc_serv_ops nfs41_cb_sv_ops = {
- .svo_function = nfs41_callback_svc,
- .svo_enqueue_xprt = svc_xprt_do_enqueue,
- .svo_module = THIS_MODULE,
-};
-
-static const struct svc_serv_ops *nfs4_cb_sv_ops[] = {
- [0] = &nfs40_cb_sv_ops,
- [1] = &nfs41_cb_sv_ops,
-};
-#else
-static const struct svc_serv_ops *nfs4_cb_sv_ops[] = {
- [0] = &nfs40_cb_sv_ops,
- [1] = NULL,
-};
-#endif
-
static struct svc_serv *nfs_callback_create_svc(int minorversion)
{
struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
- const struct svc_serv_ops *sv_ops;
+ int (*threadfn)(void *data);
struct svc_serv *serv;
/*
@@ -267,17 +243,6 @@ static struct svc_serv *nfs_callback_create_svc(int minorversion)
if (cb_info->serv)
return svc_get(cb_info->serv);
- switch (minorversion) {
- case 0:
- sv_ops = nfs4_cb_sv_ops[0];
- break;
- default:
- sv_ops = nfs4_cb_sv_ops[1];
- }
-
- if (sv_ops == NULL)
- return ERR_PTR(-ENOTSUPP);
-
/*
* Sanity check: if there's no task,
* we should be the first user ...
@@ -286,7 +251,16 @@ static struct svc_serv *nfs_callback_create_svc(int minorversion)
printk(KERN_WARNING "nfs_callback_create_svc: no kthread, %d users??\n",
cb_info->users);
- serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, sv_ops);
+ threadfn = nfs4_callback_svc;
+#if defined(CONFIG_NFS_V4_1)
+ if (minorversion)
+ threadfn = nfs41_callback_svc;
+#else
+ if (minorversion)
+ return ERR_PTR(-ENOTSUPP);
+#endif
+ serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE,
+ threadfn);
if (!serv) {
printk(KERN_ERR "nfs_callback_create_svc: create service failed\n");
return ERR_PTR(-ENOMEM);
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index f5a62c0d999b..02a899e4390f 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -2697,6 +2697,5 @@ static int nfs4_run_state_manager(void *ptr)
allow_signal(SIGKILL);
nfs4_state_manager(clp);
nfs_put_client(clp);
- module_put_and_kthread_exit(0);
return 0;
}
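The dropped module_put_and_kthread_exit() calls reflect the new thread model: svc_create() is handed a plain thread function and the svc core owns kthread lifetime. A simplified sketch of the thread-function contract (the real services add freezer and signal handling on top):

/* Illustrative only: shape of a threadfn passed to svc_create(). */
static int example_svc_thread(void *vrqstp)
{
	struct svc_rqst *rqstp = vrqstp;

	while (!kthread_should_stop()) {
		/* Wait for an RPC request, then dispatch it. */
		if (svc_recv(rqstp, MAX_SCHEDULE_TIMEOUT) <= 0)
			continue;
		svc_process(rqstp);
	}
	svc_exit_thread(rqstp);
	return 0;
}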
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 3d1d17256a91..f6a2fd3015e7 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -35,18 +35,9 @@ config NFSD_V2_ACL
bool
depends on NFSD
-config NFSD_V3
- bool "NFS server support for NFS version 3"
- depends on NFSD
- help
- This option enables support in your system's NFS server for
- version 3 of the NFS protocol (RFC 1813).
-
- If unsure, say Y.
-
config NFSD_V3_ACL
bool "NFS server support for the NFSv3 ACL protocol extension"
- depends on NFSD_V3
+ depends on NFSD
select NFSD_V2_ACL
help
Solaris NFS servers support an auxiliary NFSv3 ACL protocol that
@@ -70,7 +61,6 @@ config NFSD_V3_ACL
config NFSD_V4
bool "NFS server support for NFS version 4"
depends on NFSD && PROC_FS
- select NFSD_V3
select FS_POSIX_ACL
select SUNRPC_GSS
select CRYPTO
diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
index 3f0983e93a99..805c06d5f1b4 100644
--- a/fs/nfsd/Makefile
+++ b/fs/nfsd/Makefile
@@ -12,9 +12,8 @@ nfsd-y += trace.o
nfsd-y += nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \
export.o auth.o lockd.o nfscache.o nfsxdr.o \
- stats.o filecache.o
+ stats.o filecache.o nfs3proc.o nfs3xdr.o
nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o
-nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs3xdr.o
nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o
nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \
nfs4acl.o nfs4callback.o nfs4recover.o
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index e5c0982a381d..b6d01d51a746 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -4,7 +4,6 @@
*/
#include <linux/exportfs.h>
#include <linux/iomap.h>
-#include <linux/genhd.h>
#include <linux/slab.h>
#include <linux/pr.h>
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index 8bc807c5fea4..cc2831cec669 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -632,7 +632,7 @@ nfsd_file_cache_init(void)
if (!nfsd_filecache_wq)
goto out;
- nfsd_file_hashtbl = kcalloc(NFSD_FILE_HASH_SIZE,
+ nfsd_file_hashtbl = kvcalloc(NFSD_FILE_HASH_SIZE,
sizeof(*nfsd_file_hashtbl), GFP_KERNEL);
if (!nfsd_file_hashtbl) {
pr_err("nfsd: unable to allocate nfsd_file_hashtbl\n");
@@ -700,7 +700,7 @@ out_err:
nfsd_file_slab = NULL;
kmem_cache_destroy(nfsd_file_mark_slab);
nfsd_file_mark_slab = NULL;
- kfree(nfsd_file_hashtbl);
+ kvfree(nfsd_file_hashtbl);
nfsd_file_hashtbl = NULL;
destroy_workqueue(nfsd_filecache_wq);
nfsd_filecache_wq = NULL;
@@ -811,7 +811,7 @@ nfsd_file_cache_shutdown(void)
fsnotify_wait_marks_destroyed();
kmem_cache_destroy(nfsd_file_mark_slab);
nfsd_file_mark_slab = NULL;
- kfree(nfsd_file_hashtbl);
+ kvfree(nfsd_file_hashtbl);
nfsd_file_hashtbl = NULL;
destroy_workqueue(nfsd_filecache_wq);
nfsd_filecache_wq = NULL;
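The kcalloc()-to-kvcalloc() switch lets the file hash table fall back to vmalloc() when a large physically contiguous allocation is unavailable; the price is that every free site must use kvfree(), as the two hunks above do. Sketch of the pairing, with hypothetical names:

#include <linux/mm.h>

/* kvmalloc-family memory may be vmalloc-backed; kvfree() handles both. */
static struct hlist_head *example_alloc_table(unsigned int nbuckets)
{
	return kvcalloc(nbuckets, sizeof(struct hlist_head), GFP_KERNEL);
}

static void example_free_table(struct hlist_head *table)
{
	kvfree(table);
}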
diff --git a/fs/nfsd/flexfilelayout.c b/fs/nfsd/flexfilelayout.c
index 2e2f1d5e9f62..070f90ed09b6 100644
--- a/fs/nfsd/flexfilelayout.c
+++ b/fs/nfsd/flexfilelayout.c
@@ -117,7 +117,7 @@ nfsd4_ff_proc_getdeviceinfo(struct super_block *sb, struct svc_rqst *rqstp,
da->netaddr.addr_len =
snprintf(da->netaddr.addr, FF_ADDR_LEN + 1,
- "%s.%hhu.%hhu", addr, port >> 8, port & 0xff);
+ "%s.%d.%d", addr, port >> 8, port & 0xff);
da->tightly_coupled = false;
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index 6d1b5bb051c5..2c05692a9abf 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -422,7 +422,7 @@ nfsd4_insert_layout(struct nfsd4_layoutget *lgp, struct nfs4_layout_stateid *ls)
new = kmem_cache_alloc(nfs4_layout_cache, GFP_KERNEL);
if (!new)
return nfserr_jukebox;
- memcpy(&new->lo_seg, seg, sizeof(lp->lo_seg));
+ memcpy(&new->lo_seg, seg, sizeof(new->lo_seg));
new->lo_state = ls;
spin_lock(&fp->fi_lock);
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 32063733443d..234e852fcdfa 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -4711,6 +4711,14 @@ nfsd_break_deleg_cb(struct file_lock *fl)
return ret;
}
+/**
+ * nfsd_breaker_owns_lease - Check if lease conflict was resolved
+ * @fl: Lock state to check
+ *
+ * Return values:
+ * %true: Lease conflict was resolved
+ * %false: Lease conflict was not resolved.
+ */
static bool nfsd_breaker_owns_lease(struct file_lock *fl)
{
struct nfs4_delegation *dl = fl->fl_owner;
@@ -4718,11 +4726,11 @@ static bool nfsd_breaker_owns_lease(struct file_lock *fl)
struct nfs4_client *clp;
if (!i_am_nfsd())
- return NULL;
+ return false;
rqst = kthread_data(current);
/* Note rq_prog == NFS_ACL_PROGRAM is also possible: */
if (rqst->rq_prog != NFS_PROGRAM || rqst->rq_vers < 4)
- return NULL;
+ return false;
clp = *(rqst->rq_lease_breaker);
return dl->dl_stid.sc_client == clp;
}
@@ -6526,7 +6534,7 @@ nfs4_transform_lock_offset(struct file_lock *lock)
}
static fl_owner_t
-nfsd4_fl_get_owner(fl_owner_t owner)
+nfsd4_lm_get_owner(fl_owner_t owner)
{
struct nfs4_lockowner *lo = (struct nfs4_lockowner *)owner;
@@ -6535,7 +6543,7 @@ nfsd4_fl_get_owner(fl_owner_t owner)
}
static void
-nfsd4_fl_put_owner(fl_owner_t owner)
+nfsd4_lm_put_owner(fl_owner_t owner)
{
struct nfs4_lockowner *lo = (struct nfs4_lockowner *)owner;
@@ -6570,8 +6578,8 @@ nfsd4_lm_notify(struct file_lock *fl)
static const struct lock_manager_operations nfsd_posix_mng_ops = {
.lm_notify = nfsd4_lm_notify,
- .lm_get_owner = nfsd4_fl_get_owner,
- .lm_put_owner = nfsd4_fl_put_owner,
+ .lm_get_owner = nfsd4_lm_get_owner,
+ .lm_put_owner = nfsd4_lm_put_owner,
};
static inline void
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 714a3a3bd50c..da92e7d2ab6a 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2854,6 +2854,9 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
-	err = vfs_getattr(&path, &stat, STATX_BASIC_STATS, AT_STATX_SYNC_AS_STAT);
+	err = vfs_getattr(&path, &stat, STATX_BASIC_STATS | STATX_BTIME,
+			  AT_STATX_SYNC_AS_STAT);
if (err)
goto out_nfserr;
+ if (!(stat.result_mask & STATX_BTIME))
+ /* underlying FS does not offer btime so we can't share it */
+ bmval1 &= ~FATTR4_WORD1_TIME_CREATE;
if ((bmval0 & (FATTR4_WORD0_FILES_AVAIL | FATTR4_WORD0_FILES_FREE |
FATTR4_WORD0_FILES_TOTAL | FATTR4_WORD0_MAXNAME)) ||
(bmval1 & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE |
@@ -3254,6 +3257,13 @@ out_acl:
p = xdr_encode_hyper(p, (s64)stat.mtime.tv_sec);
*p++ = cpu_to_be32(stat.mtime.tv_nsec);
}
+ if (bmval1 & FATTR4_WORD1_TIME_CREATE) {
+ p = xdr_reserve_space(xdr, 12);
+ if (!p)
+ goto out_resource;
+ p = xdr_encode_hyper(p, (s64)stat.btime.tv_sec);
+ *p++ = cpu_to_be32(stat.btime.tv_nsec);
+ }
if (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID) {
struct kstat parent_stat;
u64 ino = stat.ino;
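The TIME_CREATE encoding above leans on the statx convention that a struct kstat field is only valid when the filesystem sets the matching bit in result_mask. Illustrative helper (hypothetical name):

/* Sketch: consume stat->btime only when the fs actually reported it. */
static bool example_get_btime(const struct kstat *stat,
			      struct timespec64 *btime)
{
	if (!(stat->result_mask & STATX_BTIME))
		return false;	/* underlying fs cannot supply btime */
	*btime = stat->btime;
	return true;
}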
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index a4a69ab6ab28..0b3f12aa37ff 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -84,12 +84,6 @@ nfsd_hashsize(unsigned int limit)
return roundup_pow_of_two(limit / TARGET_BUCKET_SIZE);
}
-static u32
-nfsd_cache_hash(__be32 xid, struct nfsd_net *nn)
-{
- return hash_32((__force u32)xid, nn->maskbits);
-}
-
static struct svc_cacherep *
nfsd_reply_cache_alloc(struct svc_rqst *rqstp, __wsum csum,
struct nfsd_net *nn)
@@ -241,6 +235,14 @@ lru_put_end(struct nfsd_drc_bucket *b, struct svc_cacherep *rp)
list_move_tail(&rp->c_lru, &b->lru_head);
}
+static noinline struct nfsd_drc_bucket *
+nfsd_cache_bucket_find(__be32 xid, struct nfsd_net *nn)
+{
+ unsigned int hash = hash_32((__force u32)xid, nn->maskbits);
+
+ return &nn->drc_hashtbl[hash];
+}
+
static long prune_bucket(struct nfsd_drc_bucket *b, struct nfsd_net *nn,
unsigned int max)
{
@@ -419,12 +421,10 @@ out:
*/
int nfsd_cache_lookup(struct svc_rqst *rqstp)
{
- struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+ struct nfsd_net *nn;
struct svc_cacherep *rp, *found;
- __be32 xid = rqstp->rq_xid;
__wsum csum;
- u32 hash = nfsd_cache_hash(xid, nn);
- struct nfsd_drc_bucket *b = &nn->drc_hashtbl[hash];
+ struct nfsd_drc_bucket *b;
int type = rqstp->rq_cachetype;
int rtn = RC_DOIT;
@@ -440,17 +440,16 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp)
* Since the common case is a cache miss followed by an insert,
* preallocate an entry.
*/
+ nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
rp = nfsd_reply_cache_alloc(rqstp, csum, nn);
if (!rp)
goto out;
+ b = nfsd_cache_bucket_find(rqstp->rq_xid, nn);
spin_lock(&b->cache_lock);
found = nfsd_cache_insert(b, rp, nn);
- if (found != rp) {
- nfsd_reply_cache_free_locked(NULL, rp, nn);
- rp = found;
+ if (found != rp)
goto found_entry;
- }
nfsd_stats_rc_misses_inc();
rqstp->rq_cacherep = rp;
@@ -468,8 +467,10 @@ out:
found_entry:
/* We found a matching entry which is either in progress or done. */
+ nfsd_reply_cache_free_locked(NULL, rp, nn);
nfsd_stats_rc_hits_inc();
rtn = RC_DROPIT;
+ rp = found;
/* Request being processed */
if (rp->c_state == RC_INPROG)
@@ -528,7 +529,6 @@ void nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)
struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
struct svc_cacherep *rp = rqstp->rq_cacherep;
struct kvec *resv = &rqstp->rq_res.head[0], *cachv;
- u32 hash;
struct nfsd_drc_bucket *b;
int len;
size_t bufsize = 0;
@@ -536,8 +536,7 @@ void nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)
if (!rp)
return;
- hash = nfsd_cache_hash(rp->c_key.k_xid, nn);
- b = &nn->drc_hashtbl[hash];
+ b = nfsd_cache_bucket_find(rp->c_key.k_xid, nn);
len = resv->iov_len - ((char*)statp - (char*)resv->iov_base);
len >>= 2;
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 68b020f2002b..16920e4512bd 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -772,13 +772,13 @@ static ssize_t __write_ports_addxprt(char *buf, struct net *net, const struct cr
if (err != 0)
return err;
- err = svc_create_xprt(nn->nfsd_serv, transport, net,
- PF_INET, port, SVC_SOCK_ANONYMOUS, cred);
+ err = svc_xprt_create(nn->nfsd_serv, transport, net,
+ PF_INET, port, SVC_SOCK_ANONYMOUS, cred);
if (err < 0)
goto out_err;
- err = svc_create_xprt(nn->nfsd_serv, transport, net,
- PF_INET6, port, SVC_SOCK_ANONYMOUS, cred);
+ err = svc_xprt_create(nn->nfsd_serv, transport, net,
+ PF_INET6, port, SVC_SOCK_ANONYMOUS, cred);
if (err < 0 && err != -EAFNOSUPPORT)
goto out_close;
@@ -790,7 +790,7 @@ static ssize_t __write_ports_addxprt(char *buf, struct net *net, const struct cr
out_close:
xprt = svc_find_xprt(nn->nfsd_serv, transport, net, PF_INET, port);
if (xprt != NULL) {
- svc_close_xprt(xprt);
+ svc_xprt_close(xprt);
svc_xprt_put(xprt);
}
out_err:
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 3e5008b475ff..4fc1fd639527 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -364,7 +364,7 @@ void nfsd_lockd_shutdown(void);
| FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP | FATTR4_WORD1_RAWDEV \
| FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE | FATTR4_WORD1_SPACE_TOTAL \
| FATTR4_WORD1_SPACE_USED | FATTR4_WORD1_TIME_ACCESS | FATTR4_WORD1_TIME_ACCESS_SET \
- | FATTR4_WORD1_TIME_DELTA | FATTR4_WORD1_TIME_METADATA \
+ | FATTR4_WORD1_TIME_DELTA | FATTR4_WORD1_TIME_METADATA | FATTR4_WORD1_TIME_CREATE \
| FATTR4_WORD1_TIME_MODIFY | FATTR4_WORD1_TIME_MODIFY_SET | FATTR4_WORD1_MOUNTED_ON_FILEID)
#define NFSD4_SUPPORTED_ATTRS_WORD2 0
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 145208bcb9bd..c29baa03dfaf 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -611,8 +611,6 @@ out_negative:
return nfserr_serverfault;
}
-#ifdef CONFIG_NFSD_V3
-
/**
* fh_fill_pre_attrs - Fill in pre-op attributes
* @fhp: file handle to be updated
@@ -673,8 +671,6 @@ void fh_fill_post_attrs(struct svc_fh *fhp)
nfsd4_change_attribute(&fhp->fh_post_attr, inode);
}
-#endif /* CONFIG_NFSD_V3 */
-
/*
* Release a file handle.
*/
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index 434930d8a946..fb9d358a267e 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -90,7 +90,6 @@ typedef struct svc_fh {
* operation
*/
int fh_flags; /* FH flags */
-#ifdef CONFIG_NFSD_V3
bool fh_post_saved; /* post-op attrs saved */
bool fh_pre_saved; /* pre-op attrs saved */
@@ -107,7 +106,6 @@ typedef struct svc_fh {
/* Post-op attributes saved in fh_unlock */
struct kstat fh_post_attr; /* full attrs after operation */
u64 fh_post_change; /* nfsv4 change; see above */
-#endif /* CONFIG_NFSD_V3 */
} svc_fh;
#define NFSD4_FH_FOREIGN (1<<0)
#define SET_FH_FLAG(c, f) ((c)->fh_flags |= (f))
@@ -283,8 +281,6 @@ static inline u32 knfsd_fh_hash(const struct knfsd_fh *fh)
}
#endif
-#ifdef CONFIG_NFSD_V3
-
/**
* fh_clear_pre_post_attrs - Reset pre/post attributes
* @fhp: file handle to be updated
@@ -327,22 +323,6 @@ static inline u64 nfsd4_change_attribute(struct kstat *stat,
extern void fh_fill_pre_attrs(struct svc_fh *fhp);
extern void fh_fill_post_attrs(struct svc_fh *fhp);
-#else /* !CONFIG_NFSD_V3 */
-
-static inline void fh_clear_pre_post_attrs(struct svc_fh *fhp)
-{
-}
-
-static inline void fh_fill_pre_attrs(struct svc_fh *fhp)
-{
-}
-
-static inline void fh_fill_post_attrs(struct svc_fh *fhp)
-{
-}
-
-#endif /* !CONFIG_NFSD_V3 */
-
/*
* Lock a file handle/inode
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 18b8eb43a19b..fcdab8a8a41f 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -230,7 +230,7 @@ nfsd_proc_write(struct svc_rqst *rqstp)
unsigned long cnt = argp->len;
unsigned int nvecs;
- dprintk("nfsd: WRITE %s %d bytes at %d\n",
+ dprintk("nfsd: WRITE %s %u bytes at %d\n",
SVCFH_fmt(&argp->fh),
argp->len, argp->offset);
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index b8c682b62d29..4bb5baa17040 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -117,9 +117,7 @@ static struct svc_stat nfsd_acl_svcstats = {
static const struct svc_version *nfsd_version[] = {
[2] = &nfsd_version2,
-#if defined(CONFIG_NFSD_V3)
[3] = &nfsd_version3,
-#endif
#if defined(CONFIG_NFSD_V4)
[4] = &nfsd_version4,
#endif
@@ -293,13 +291,13 @@ static int nfsd_init_socks(struct net *net, const struct cred *cred)
if (!list_empty(&nn->nfsd_serv->sv_permsocks))
return 0;
- error = svc_create_xprt(nn->nfsd_serv, "udp", net, PF_INET, NFS_PORT,
- SVC_SOCK_DEFAULTS, cred);
+ error = svc_xprt_create(nn->nfsd_serv, "udp", net, PF_INET, NFS_PORT,
+ SVC_SOCK_DEFAULTS, cred);
if (error < 0)
return error;
- error = svc_create_xprt(nn->nfsd_serv, "tcp", net, PF_INET, NFS_PORT,
- SVC_SOCK_DEFAULTS, cred);
+ error = svc_xprt_create(nn->nfsd_serv, "tcp", net, PF_INET, NFS_PORT,
+ SVC_SOCK_DEFAULTS, cred);
if (error < 0)
return error;
@@ -612,13 +610,6 @@ static int nfsd_get_default_max_blksize(void)
return ret;
}
-static const struct svc_serv_ops nfsd_thread_sv_ops = {
- .svo_shutdown = nfsd_last_thread,
- .svo_function = nfsd,
- .svo_enqueue_xprt = svc_xprt_do_enqueue,
- .svo_module = THIS_MODULE,
-};
-
void nfsd_shutdown_threads(struct net *net)
{
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
@@ -657,8 +648,7 @@ int nfsd_create_serv(struct net *net)
if (nfsd_max_blksize == 0)
nfsd_max_blksize = nfsd_get_default_max_blksize();
nfsd_reset_versions(nn);
- serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize,
- &nfsd_thread_sv_ops);
+ serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, nfsd);
if (serv == NULL)
return -ENOMEM;
@@ -724,7 +714,8 @@ void nfsd_put(struct net *net)
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
if (kref_put(&nn->nfsd_serv->sv_refcnt, nfsd_noop)) {
- svc_shutdown_net(nn->nfsd_serv, net);
+ svc_xprt_destroy_all(nn->nfsd_serv, net);
+ nfsd_last_thread(nn->nfsd_serv, net);
svc_destroy(&nn->nfsd_serv->sv_refcnt);
spin_lock(&nfsd_notifier_lock);
nn->nfsd_serv = NULL;
@@ -1019,8 +1010,6 @@ out:
msleep(20);
}
- /* Release module */
- module_put_and_kthread_exit(0);
return 0;
}
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index 5889db66409d..242fa123e0e9 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -13,22 +13,6 @@
#include "export.h"
#include "nfsfh.h"
-#define NFSD_TRACE_PROC_ARG_FIELDS \
- __field(unsigned int, netns_ino) \
- __field(u32, xid) \
- __array(unsigned char, server, sizeof(struct sockaddr_in6)) \
- __array(unsigned char, client, sizeof(struct sockaddr_in6))
-
-#define NFSD_TRACE_PROC_ARG_ASSIGNMENTS \
- do { \
- __entry->netns_ino = SVC_NET(rqstp)->ns.inum; \
- __entry->xid = be32_to_cpu(rqstp->rq_xid); \
- memcpy(__entry->server, &rqstp->rq_xprt->xpt_local, \
- rqstp->rq_xprt->xpt_locallen); \
- memcpy(__entry->client, &rqstp->rq_xprt->xpt_remote, \
- rqstp->rq_xprt->xpt_remotelen); \
- } while (0);
-
#define NFSD_TRACE_PROC_RES_FIELDS \
__field(unsigned int, netns_ino) \
__field(u32, xid) \
@@ -53,16 +37,22 @@ DECLARE_EVENT_CLASS(nfsd_xdr_err_class,
),
TP_ARGS(rqstp),
TP_STRUCT__entry(
- NFSD_TRACE_PROC_ARG_FIELDS
-
+ __field(unsigned int, netns_ino)
+ __field(u32, xid)
__field(u32, vers)
__field(u32, proc)
+ __sockaddr(server, rqstp->rq_xprt->xpt_locallen)
+ __sockaddr(client, rqstp->rq_xprt->xpt_remotelen)
),
TP_fast_assign(
- NFSD_TRACE_PROC_ARG_ASSIGNMENTS
+ const struct svc_xprt *xprt = rqstp->rq_xprt;
+ __entry->netns_ino = xprt->xpt_net->ns.inum;
+ __entry->xid = be32_to_cpu(rqstp->rq_xid);
__entry->vers = rqstp->rq_vers;
__entry->proc = rqstp->rq_proc;
+ __assign_sockaddr(server, &xprt->xpt_local, xprt->xpt_locallen);
+ __assign_sockaddr(client, &xprt->xpt_remote, xprt->xpt_remotelen);
),
TP_printk("xid=0x%08x vers=%u proc=%u",
__entry->xid, __entry->vers, __entry->proc
@@ -613,20 +603,21 @@ TRACE_EVENT(nfsd_clid_cred_mismatch,
__field(u32, cl_id)
__field(unsigned long, cl_flavor)
__field(unsigned long, new_flavor)
- __array(unsigned char, addr, sizeof(struct sockaddr_in6))
+ __sockaddr(addr, rqstp->rq_xprt->xpt_remotelen)
),
TP_fast_assign(
__entry->cl_boot = clp->cl_clientid.cl_boot;
__entry->cl_id = clp->cl_clientid.cl_id;
__entry->cl_flavor = clp->cl_cred.cr_flavor;
__entry->new_flavor = rqstp->rq_cred.cr_flavor;
- memcpy(__entry->addr, &rqstp->rq_xprt->xpt_remote,
- sizeof(struct sockaddr_in6));
+ __assign_sockaddr(addr, &rqstp->rq_xprt->xpt_remote,
+ rqstp->rq_xprt->xpt_remotelen);
),
TP_printk("client %08x:%08x flavor=%s, conflict=%s from addr=%pISpc",
__entry->cl_boot, __entry->cl_id,
show_nfsd_authflavor(__entry->cl_flavor),
- show_nfsd_authflavor(__entry->new_flavor), __entry->addr
+ show_nfsd_authflavor(__entry->new_flavor),
+ __get_sockaddr(addr)
)
)
@@ -642,7 +633,7 @@ TRACE_EVENT(nfsd_clid_verf_mismatch,
__field(u32, cl_id)
__array(unsigned char, cl_verifier, NFS4_VERIFIER_SIZE)
__array(unsigned char, new_verifier, NFS4_VERIFIER_SIZE)
- __array(unsigned char, addr, sizeof(struct sockaddr_in6))
+ __sockaddr(addr, rqstp->rq_xprt->xpt_remotelen)
),
TP_fast_assign(
__entry->cl_boot = clp->cl_clientid.cl_boot;
@@ -651,14 +642,14 @@ TRACE_EVENT(nfsd_clid_verf_mismatch,
NFS4_VERIFIER_SIZE);
memcpy(__entry->new_verifier, (void *)verf,
NFS4_VERIFIER_SIZE);
- memcpy(__entry->addr, &rqstp->rq_xprt->xpt_remote,
- sizeof(struct sockaddr_in6));
+ __assign_sockaddr(addr, &rqstp->rq_xprt->xpt_remote,
+ rqstp->rq_xprt->xpt_remotelen);
),
TP_printk("client %08x:%08x verf=0x%s, updated=0x%s from addr=%pISpc",
__entry->cl_boot, __entry->cl_id,
__print_hex_str(__entry->cl_verifier, NFS4_VERIFIER_SIZE),
__print_hex_str(__entry->new_verifier, NFS4_VERIFIER_SIZE),
- __entry->addr
+ __get_sockaddr(addr)
)
);
@@ -908,18 +899,17 @@ TRACE_EVENT(nfsd_cb_args,
__field(u32, cl_id)
__field(u32, prog)
__field(u32, ident)
- __array(unsigned char, addr, sizeof(struct sockaddr_in6))
+ __sockaddr(addr, conn->cb_addrlen)
),
TP_fast_assign(
__entry->cl_boot = clp->cl_clientid.cl_boot;
__entry->cl_id = clp->cl_clientid.cl_id;
__entry->prog = conn->cb_prog;
__entry->ident = conn->cb_ident;
- memcpy(__entry->addr, &conn->cb_addr,
- sizeof(struct sockaddr_in6));
+ __assign_sockaddr(addr, &conn->cb_addr, conn->cb_addrlen);
),
TP_printk("addr=%pISpc client %08x:%08x prog=%u ident=%u",
- __entry->addr, __entry->cl_boot, __entry->cl_id,
+ __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id,
__entry->prog, __entry->ident)
);
@@ -951,17 +941,17 @@ DECLARE_EVENT_CLASS(nfsd_cb_class,
__field(unsigned long, state)
__field(u32, cl_boot)
__field(u32, cl_id)
- __array(unsigned char, addr, sizeof(struct sockaddr_in6))
+ __sockaddr(addr, clp->cl_cb_conn.cb_addrlen)
),
TP_fast_assign(
__entry->state = clp->cl_cb_state;
__entry->cl_boot = clp->cl_clientid.cl_boot;
__entry->cl_id = clp->cl_clientid.cl_id;
- memcpy(__entry->addr, &clp->cl_cb_conn.cb_addr,
- sizeof(struct sockaddr_in6));
+ __assign_sockaddr(addr, &clp->cl_cb_conn.cb_addr,
clp->cl_cb_conn.cb_addrlen);
),
TP_printk("addr=%pISpc client %08x:%08x state=%s",
- __entry->addr, __entry->cl_boot, __entry->cl_id,
+ __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id,
show_cb_state(__entry->state))
);
@@ -1001,7 +991,7 @@ TRACE_EVENT(nfsd_cb_setup,
__field(u32, cl_boot)
__field(u32, cl_id)
__field(unsigned long, authflavor)
- __array(unsigned char, addr, sizeof(struct sockaddr_in6))
+ __sockaddr(addr, clp->cl_cb_conn.cb_addrlen)
__array(unsigned char, netid, 8)
),
TP_fast_assign(
@@ -1009,11 +999,11 @@ TRACE_EVENT(nfsd_cb_setup,
__entry->cl_id = clp->cl_clientid.cl_id;
strlcpy(__entry->netid, netid, sizeof(__entry->netid));
__entry->authflavor = authflavor;
- memcpy(__entry->addr, &clp->cl_cb_conn.cb_addr,
- sizeof(struct sockaddr_in6));
+ __assign_sockaddr(addr, &clp->cl_cb_conn.cb_addr,
clp->cl_cb_conn.cb_addrlen);
),
TP_printk("addr=%pISpc client %08x:%08x proto=%s flavor=%s",
- __entry->addr, __entry->cl_boot, __entry->cl_id,
+ __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id,
__entry->netid, show_nfsd_authflavor(__entry->authflavor))
);
@@ -1027,30 +1017,32 @@ TRACE_EVENT(nfsd_cb_setup_err,
__field(long, error)
__field(u32, cl_boot)
__field(u32, cl_id)
- __array(unsigned char, addr, sizeof(struct sockaddr_in6))
+ __sockaddr(addr, clp->cl_cb_conn.cb_addrlen)
),
TP_fast_assign(
__entry->error = error;
__entry->cl_boot = clp->cl_clientid.cl_boot;
__entry->cl_id = clp->cl_clientid.cl_id;
- memcpy(__entry->addr, &clp->cl_cb_conn.cb_addr,
- sizeof(struct sockaddr_in6));
+ __assign_sockaddr(addr, &clp->cl_cb_conn.cb_addr,
clp->cl_cb_conn.cb_addrlen);
),
TP_printk("addr=%pISpc client %08x:%08x error=%ld",
- __entry->addr, __entry->cl_boot, __entry->cl_id, __entry->error)
+ __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id,
+ __entry->error)
);
-TRACE_EVENT(nfsd_cb_recall,
+TRACE_EVENT_CONDITION(nfsd_cb_recall,
TP_PROTO(
const struct nfs4_stid *stid
),
TP_ARGS(stid),
+ TP_CONDITION(stid->sc_client),
TP_STRUCT__entry(
__field(u32, cl_boot)
__field(u32, cl_id)
__field(u32, si_id)
__field(u32, si_generation)
- __array(unsigned char, addr, sizeof(struct sockaddr_in6))
+ __sockaddr(addr, stid->sc_client->cl_cb_conn.cb_addrlen)
),
TP_fast_assign(
const stateid_t *stp = &stid->sc_stateid;
@@ -1060,14 +1052,11 @@ TRACE_EVENT(nfsd_cb_recall,
__entry->cl_id = stp->si_opaque.so_clid.cl_id;
__entry->si_id = stp->si_opaque.so_id;
__entry->si_generation = stp->si_generation;
- if (clp)
- memcpy(__entry->addr, &clp->cl_cb_conn.cb_addr,
- sizeof(struct sockaddr_in6));
- else
- memset(__entry->addr, 0, sizeof(struct sockaddr_in6));
+ __assign_sockaddr(addr, &clp->cl_cb_conn.cb_addr,
clp->cl_cb_conn.cb_addrlen);
),
TP_printk("addr=%pISpc client %08x:%08x stateid %08x:%08x",
- __entry->addr, __entry->cl_boot, __entry->cl_id,
+ __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id,
__entry->si_id, __entry->si_generation)
);
@@ -1081,7 +1070,7 @@ TRACE_EVENT(nfsd_cb_notify_lock,
__field(u32, cl_boot)
__field(u32, cl_id)
__field(u32, fh_hash)
- __array(unsigned char, addr, sizeof(struct sockaddr_in6))
+ __sockaddr(addr, lo->lo_owner.so_client->cl_cb_conn.cb_addrlen)
),
TP_fast_assign(
const struct nfs4_client *clp = lo->lo_owner.so_client;
@@ -1089,11 +1078,11 @@ TRACE_EVENT(nfsd_cb_notify_lock,
__entry->cl_boot = clp->cl_clientid.cl_boot;
__entry->cl_id = clp->cl_clientid.cl_id;
__entry->fh_hash = knfsd_fh_hash(&nbl->nbl_fh);
- memcpy(__entry->addr, &clp->cl_cb_conn.cb_addr,
- sizeof(struct sockaddr_in6));
+ __assign_sockaddr(addr, &clp->cl_cb_conn.cb_addr,
clp->cl_cb_conn.cb_addrlen);
),
TP_printk("addr=%pISpc client %08x:%08x fh_hash=0x%08x",
- __entry->addr, __entry->cl_boot, __entry->cl_id,
+ __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id,
__entry->fh_hash)
);
@@ -1114,7 +1103,7 @@ TRACE_EVENT(nfsd_cb_offload,
__field(u32, fh_hash)
__field(int, status)
__field(u64, count)
- __array(unsigned char, addr, sizeof(struct sockaddr_in6))
+ __sockaddr(addr, clp->cl_cb_conn.cb_addrlen)
),
TP_fast_assign(
__entry->cl_boot = stp->si_opaque.so_clid.cl_boot;
@@ -1124,11 +1113,11 @@ TRACE_EVENT(nfsd_cb_offload,
__entry->fh_hash = knfsd_fh_hash(fh);
__entry->status = be32_to_cpu(status);
__entry->count = count;
- memcpy(__entry->addr, &clp->cl_cb_conn.cb_addr,
- sizeof(struct sockaddr_in6));
+ __assign_sockaddr(addr, &clp->cl_cb_conn.cb_addr,
clp->cl_cb_conn.cb_addrlen);
),
TP_printk("addr=%pISpc client %08x:%08x stateid %08x:%08x fh_hash=0x%08x count=%llu status=%d",
- __entry->addr, __entry->cl_boot, __entry->cl_id,
+ __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id,
__entry->si_id, __entry->si_generation,
__entry->fh_hash, __entry->count, __entry->status)
);
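All of the trace.h hunks replace fixed sizeof(struct sockaddr_in6) arrays with the dynamically sized __sockaddr() field, which stores only the bytes the transport filled in and still prints via %pISpc. A minimal sketch of the three helpers working together (event name is hypothetical):

TRACE_EVENT(example_peer,
	TP_PROTO(const struct svc_xprt *xprt),
	TP_ARGS(xprt),
	TP_STRUCT__entry(
		__sockaddr(addr, xprt->xpt_remotelen)
	),
	TP_fast_assign(
		__assign_sockaddr(addr, &xprt->xpt_remote,
				  xprt->xpt_remotelen);
	),
	TP_printk("peer=%pISpc", __get_sockaddr(addr))
);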
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 91600e71be19..166eb0ba3e71 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -32,9 +32,7 @@
#include <linux/writeback.h>
#include <linux/security.h>
-#ifdef CONFIG_NFSD_V3
#include "xdr3.h"
-#endif /* CONFIG_NFSD_V3 */
#ifdef CONFIG_NFSD_V4
#include "../internal.h"
@@ -608,7 +606,6 @@ __be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp,
}
#endif /* defined(CONFIG_NFSD_V4) */
-#ifdef CONFIG_NFSD_V3
/*
* Check server access rights to a file system object
*/
@@ -720,7 +717,6 @@ nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *suppor
out:
return error;
}
-#endif /* CONFIG_NFSD_V3 */
int nfsd_open_break_lease(struct inode *inode, int access)
{
@@ -1113,7 +1109,6 @@ out:
return err;
}
-#ifdef CONFIG_NFSD_V3
/**
* nfsd_commit - Commit pending writes to stable storage
* @rqstp: RPC request being processed
@@ -1190,7 +1185,6 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, u64 offset,
out:
return err;
}
-#endif /* CONFIG_NFSD_V3 */
static __be32
nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *resfhp,
@@ -1380,8 +1374,6 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
rdev, resfhp);
}
-#ifdef CONFIG_NFSD_V3
-
/*
* NFSv3 and NFSv4 version of nfsd_create
*/
@@ -1547,7 +1539,6 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
err = nfserrno(host_err);
goto out;
}
-#endif /* CONFIG_NFSD_V3 */
/*
* Read a symlink. On entry, *lenp must contain the maximum path length that
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index 2c43d10e3cab..ccb87b2864f6 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -68,7 +68,6 @@ __be32 nfsd_create_locked(struct svc_rqst *, struct svc_fh *,
__be32 nfsd_create(struct svc_rqst *, struct svc_fh *,
char *name, int len, struct iattr *attrs,
int type, dev_t rdev, struct svc_fh *res);
-#ifdef CONFIG_NFSD_V3
__be32 nfsd_access(struct svc_rqst *, struct svc_fh *, u32 *, u32 *);
__be32 do_nfsd_create(struct svc_rqst *, struct svc_fh *,
char *name, int len, struct iattr *attrs,
@@ -76,7 +75,6 @@ __be32 do_nfsd_create(struct svc_rqst *, struct svc_fh *,
u32 *verifier, bool *truncp, bool *created);
__be32 nfsd_commit(struct svc_rqst *rqst, struct svc_fh *fhp,
u64 offset, u32 count, __be32 *verf);
-#endif /* CONFIG_NFSD_V3 */
#ifdef CONFIG_NFSD_V4
__be32 nfsd_getxattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
char *name, void **bufp, int *lenp);
diff --git a/fs/nfsd/xdr.h b/fs/nfsd/xdr.h
index 528fb299430e..852f71580bd0 100644
--- a/fs/nfsd/xdr.h
+++ b/fs/nfsd/xdr.h
@@ -32,7 +32,7 @@ struct nfsd_readargs {
struct nfsd_writeargs {
svc_fh fh;
__u32 offset;
- int len;
+ __u32 len;
struct xdr_buf payload;
};
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 43287b0d3e9b..a3bb0c856ec8 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -337,8 +337,7 @@ static void nilfs_end_bio_write(struct bio *bio)
}
static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf,
- struct nilfs_write_info *wi, int mode,
- int mode_flags)
+ struct nilfs_write_info *wi)
{
struct bio *bio = wi->bio;
int err;
@@ -356,7 +355,6 @@ static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf,
bio->bi_end_io = nilfs_end_bio_write;
bio->bi_private = segbuf;
- bio_set_op_attrs(bio, mode, mode_flags);
submit_bio(bio);
segbuf->sb_nbio++;
@@ -371,29 +369,6 @@ static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf,
return err;
}
-/**
- * nilfs_alloc_seg_bio - allocate a new bio for writing log
- * @nilfs: nilfs object
- * @start: start block number of the bio
- * @nr_vecs: request size of page vector.
- *
- * Return Value: On success, pointer to the struct bio is returned.
- * On error, NULL is returned.
- */
-static struct bio *nilfs_alloc_seg_bio(struct the_nilfs *nilfs, sector_t start,
- int nr_vecs)
-{
- struct bio *bio;
-
- bio = bio_alloc(GFP_NOIO, nr_vecs);
- if (likely(bio)) {
- bio_set_dev(bio, nilfs->ns_bdev);
- bio->bi_iter.bi_sector =
- start << (nilfs->ns_blocksize_bits - 9);
- }
- return bio;
-}
-
static void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *segbuf,
struct nilfs_write_info *wi)
{
@@ -407,17 +382,17 @@ static void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *segbuf,
static int nilfs_segbuf_submit_bh(struct nilfs_segment_buffer *segbuf,
struct nilfs_write_info *wi,
- struct buffer_head *bh, int mode)
+ struct buffer_head *bh)
{
int len, err;
BUG_ON(wi->nr_vecs <= 0);
repeat:
if (!wi->bio) {
- wi->bio = nilfs_alloc_seg_bio(wi->nilfs, wi->blocknr + wi->end,
- wi->nr_vecs);
- if (unlikely(!wi->bio))
- return -ENOMEM;
+ wi->bio = bio_alloc(wi->nilfs->ns_bdev, wi->nr_vecs,
+ REQ_OP_WRITE, GFP_NOIO);
+ wi->bio->bi_iter.bi_sector = (wi->blocknr + wi->end) <<
+ (wi->nilfs->ns_blocksize_bits - 9);
}
len = bio_add_page(wi->bio, bh->b_page, bh->b_size, bh_offset(bh));
@@ -426,7 +401,7 @@ static int nilfs_segbuf_submit_bh(struct nilfs_segment_buffer *segbuf,
return 0;
}
/* bio is FULL */
- err = nilfs_segbuf_submit_bio(segbuf, wi, mode, 0);
+ err = nilfs_segbuf_submit_bio(segbuf, wi);
/* never submit current bh */
if (likely(!err))
goto repeat;
@@ -456,13 +431,13 @@ static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
nilfs_segbuf_prepare_write(segbuf, &wi);
list_for_each_entry(bh, &segbuf->sb_segsum_buffers, b_assoc_buffers) {
- res = nilfs_segbuf_submit_bh(segbuf, &wi, bh, REQ_OP_WRITE);
+ res = nilfs_segbuf_submit_bh(segbuf, &wi, bh);
if (unlikely(res))
goto failed_bio;
}
list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) {
- res = nilfs_segbuf_submit_bh(segbuf, &wi, bh, REQ_OP_WRITE);
+ res = nilfs_segbuf_submit_bh(segbuf, &wi, bh);
if (unlikely(res))
goto failed_bio;
}
@@ -472,8 +447,8 @@ static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
* Last BIO is always sent through the following
* submission.
*/
- res = nilfs_segbuf_submit_bio(segbuf, &wi, REQ_OP_WRITE,
- REQ_SYNC);
+ wi.bio->bi_opf |= REQ_SYNC;
+ res = nilfs_segbuf_submit_bio(segbuf, &wi);
}
failed_bio:
diff --git a/fs/ntfs3/fsntfs.c b/fs/ntfs3/fsntfs.c
index 4de9acb16968..3de5700a9b83 100644
--- a/fs/ntfs3/fsntfs.c
+++ b/fs/ntfs3/fsntfs.c
@@ -1443,17 +1443,6 @@ int ntfs_write_bh(struct ntfs_sb_info *sbi, struct NTFS_RECORD_HEADER *rhdr,
return err;
}
-static inline struct bio *ntfs_alloc_bio(u32 nr_vecs)
-{
- struct bio *bio = bio_alloc(GFP_NOFS | __GFP_HIGH, nr_vecs);
-
- if (!bio && (current->flags & PF_MEMALLOC)) {
- while (!bio && (nr_vecs /= 2))
- bio = bio_alloc(GFP_NOFS | __GFP_HIGH, nr_vecs);
- }
- return bio;
-}
-
/*
* ntfs_bio_pages - Read/write pages from/to disk.
*/
@@ -1496,19 +1485,13 @@ int ntfs_bio_pages(struct ntfs_sb_info *sbi, const struct runs_tree *run,
lbo = ((u64)lcn << cluster_bits) + off;
len = ((u64)clen << cluster_bits) - off;
new_bio:
- new = ntfs_alloc_bio(nr_pages - page_idx);
- if (!new) {
- err = -ENOMEM;
- goto out;
- }
+ new = bio_alloc(bdev, nr_pages - page_idx, op, GFP_NOFS);
if (bio) {
bio_chain(bio, new);
submit_bio(bio);
}
bio = new;
- bio_set_dev(bio, bdev);
bio->bi_iter.bi_sector = lbo >> 9;
- bio->bi_opf = op;
while (len) {
off = vbo & (PAGE_SIZE - 1);
@@ -1599,18 +1582,12 @@ int ntfs_bio_fill_1(struct ntfs_sb_info *sbi, const struct runs_tree *run)
lbo = (u64)lcn << cluster_bits;
len = (u64)clen << cluster_bits;
new_bio:
- new = ntfs_alloc_bio(BIO_MAX_VECS);
- if (!new) {
- err = -ENOMEM;
- break;
- }
+ new = bio_alloc(bdev, BIO_MAX_VECS, REQ_OP_WRITE, GFP_NOFS);
if (bio) {
bio_chain(bio, new);
submit_bio(bio);
}
bio = new;
- bio_set_dev(bio, bdev);
- bio->bi_opf = REQ_OP_WRITE;
bio->bi_iter.bi_sector = lbo >> 9;
for (;;) {
@@ -1626,11 +1603,10 @@ new_bio:
}
} while (run_get_entry(run, ++run_idx, NULL, &lcn, &clen));
- if (bio) {
- if (!err)
- err = submit_bio_wait(bio);
- bio_put(bio);
- }
+ if (!err)
+ err = submit_bio_wait(bio);
+ bio_put(bio);
+
blk_finish_plug(&plug);
out:
unlock_page(fill);
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index a17be1618bf7..ea0e70c0fce0 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -518,7 +518,7 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
* GFP_KERNEL that the local node can get fenced. It would be
* nicer if we could pre-allocate these bios and avoid this
* altogether. */
- bio = bio_alloc(GFP_ATOMIC, 16);
+ bio = bio_alloc(reg->hr_bdev, 16, op | op_flags, GFP_ATOMIC);
if (!bio) {
mlog(ML_ERROR, "Could not alloc slots BIO!\n");
bio = ERR_PTR(-ENOMEM);
@@ -527,10 +527,8 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
/* Must put everything in 512 byte sectors for the bio... */
bio->bi_iter.bi_sector = (reg->hr_start_block + cs) << (bits - 9);
- bio_set_dev(bio, reg->hr_bdev);
bio->bi_private = wc;
bio->bi_end_io = o2hb_bio_end_io;
- bio_set_op_attrs(bio, op, op_flags);
vec_start = (cs << bits) % PAGE_SIZE;
while(cs < max_slots) {
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 2772dec9dcea..8bde30fa5387 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1105,17 +1105,6 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
goto read_super_error;
}
- root = d_make_root(inode);
- if (!root) {
- status = -ENOMEM;
- mlog_errno(status);
- goto read_super_error;
- }
-
- sb->s_root = root;
-
- ocfs2_complete_mount_recovery(osb);
-
osb->osb_dev_kset = kset_create_and_add(sb->s_id, NULL,
&ocfs2_kset->kobj);
if (!osb->osb_dev_kset) {
@@ -1133,6 +1122,17 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
goto read_super_error;
}
+ root = d_make_root(inode);
+ if (!root) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto read_super_error;
+ }
+
+ sb->s_root = root;
+
+ ocfs2_complete_mount_recovery(osb);
+
if (ocfs2_mount_local(osb))
snprintf(nodestr, sizeof(nodestr), "local");
else
diff --git a/fs/pipe.c b/fs/pipe.c
index cc28623a67b6..2667db9506e2 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -253,7 +253,8 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
*/
was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
for (;;) {
- unsigned int head = pipe->head;
+ /* Read ->head with a barrier vs post_one_notification() */
+ unsigned int head = smp_load_acquire(&pipe->head);
unsigned int tail = pipe->tail;
unsigned int mask = pipe->ring_size - 1;
@@ -831,10 +832,8 @@ void free_pipe_info(struct pipe_inode_info *pipe)
int i;
#ifdef CONFIG_WATCH_QUEUE
- if (pipe->watch_queue) {
+ if (pipe->watch_queue)
watch_queue_clear(pipe->watch_queue);
- put_watch_queue(pipe->watch_queue);
- }
#endif
(void) account_pipe_buffers(pipe->user, pipe->nr_accounted, 0);
@@ -844,6 +843,10 @@ void free_pipe_info(struct pipe_inode_info *pipe)
if (buf->ops)
pipe_buf_release(pipe, buf);
}
+#ifdef CONFIG_WATCH_QUEUE
+ if (pipe->watch_queue)
+ put_watch_queue(pipe->watch_queue);
+#endif
if (pipe->tmp_page)
__free_page(pipe->tmp_page);
kfree(pipe->bufs);
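The smp_load_acquire() added to pipe_read() pairs with a release-ordered publication of pipe->head on the notification side, so a reader that observes the advanced head is guaranteed to observe the buffer contents written before it. Illustrative fragment of the pairing (not the actual post_one_notification() body):

/* Writer: fill the slot, then advance head with release semantics. */
buf->page = page;
smp_store_release(&pipe->head, head + 1);

/* Reader: the acquire load orders all later reads of the ring after
 * the writer's stores above. */
head = smp_load_acquire(&pipe->head);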
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 6e97ed775074..f46060eb91b5 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -309,7 +309,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
name = arch_vma_name(vma);
if (!name) {
- const char *anon_name;
+ struct anon_vma_name *anon_name;
if (!mm) {
name = "[vdso]";
@@ -327,10 +327,10 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
goto done;
}
- anon_name = vma_anon_name(vma);
+ anon_name = anon_vma_name(vma);
if (anon_name) {
seq_pad(m, ' ');
- seq_printf(m, "[anon:%s]", anon_name);
+ seq_printf(m, "[anon:%s]", anon_name->name);
}
}
@@ -1597,7 +1597,8 @@ static const struct mm_walk_ops pagemap_ops = {
* Bits 5-54 swap offset if swapped
* Bit 55 pte is soft-dirty (see Documentation/admin-guide/mm/soft-dirty.rst)
* Bit 56 page exclusively mapped
- * Bits 57-60 zero
+ * Bit 57 pte is uffd-wp write-protected
+ * Bits 58-60 zero
* Bit 61 page is file-page or shared-anon
* Bit 62 page swapped
* Bit 63 page present
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index f243cb5e6a4f..e26162f102ff 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -143,21 +143,22 @@ static void pstore_timer_kick(void)
mod_timer(&pstore_timer, jiffies + msecs_to_jiffies(pstore_update_ms));
}
-/*
- * Should pstore_dump() wait for a concurrent pstore_dump()? If
- * not, the current pstore_dump() will report a failure to dump
- * and return.
- */
-static bool pstore_cannot_wait(enum kmsg_dump_reason reason)
+static bool pstore_cannot_block_path(enum kmsg_dump_reason reason)
{
- /* In NMI path, pstore shouldn't block regardless of reason. */
+ /*
+ * In case of NMI path, pstore shouldn't be blocked
+ * regardless of reason.
+ */
if (in_nmi())
return true;
switch (reason) {
/* In panic case, other cpus are stopped by smp_send_stop(). */
case KMSG_DUMP_PANIC:
- /* Emergency restart shouldn't be blocked. */
+ /*
+ * Emergency restart shouldn't be blocked by spinning on
+ * pstore_info::buf_lock.
+ */
case KMSG_DUMP_EMERG:
return true;
default:
@@ -389,21 +390,19 @@ static void pstore_dump(struct kmsg_dumper *dumper,
unsigned long total = 0;
const char *why;
unsigned int part = 1;
+ unsigned long flags = 0;
int ret;
why = kmsg_dump_reason_str(reason);
- if (down_trylock(&psinfo->buf_lock)) {
- /* Failed to acquire lock: give up if we cannot wait. */
- if (pstore_cannot_wait(reason)) {
- pr_err("dump skipped in %s path: may corrupt error record\n",
- in_nmi() ? "NMI" : why);
- return;
- }
- if (down_interruptible(&psinfo->buf_lock)) {
- pr_err("could not grab semaphore?!\n");
+ if (pstore_cannot_block_path(reason)) {
+ if (!spin_trylock_irqsave(&psinfo->buf_lock, flags)) {
+ pr_err("dump skipped in %s path because of concurrent dump\n",
+ in_nmi() ? "NMI" : why);
return;
}
+ } else {
+ spin_lock_irqsave(&psinfo->buf_lock, flags);
}
kmsg_dump_rewind(&iter);
@@ -467,8 +466,7 @@ static void pstore_dump(struct kmsg_dumper *dumper,
total += record.size;
part++;
}
-
- up(&psinfo->buf_lock);
+ spin_unlock_irqrestore(&psinfo->buf_lock, flags);
}
static struct kmsg_dumper pstore_dumper = {
@@ -594,7 +592,7 @@ int pstore_register(struct pstore_info *psi)
psi->write_user = pstore_write_user_compat;
psinfo = psi;
mutex_init(&psinfo->read_mutex);
- sema_init(&psinfo->buf_lock, 1);
+ spin_lock_init(&psinfo->buf_lock);
if (psi->flags & PSTORE_FLAGS_DMESG)
allocate_buf_for_compression();
diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c
index fe5305028c6e..a89e33719fcf 100644
--- a/fs/pstore/ram_core.c
+++ b/fs/pstore/ram_core.c
@@ -263,10 +263,10 @@ ssize_t persistent_ram_ecc_string(struct persistent_ram_zone *prz,
if (prz->corrected_bytes || prz->bad_blocks)
ret = snprintf(str, len, ""
- "\n%d Corrected bytes, %d unrecoverable blocks\n",
+ "\nECC: %d Corrected bytes, %d unrecoverable blocks\n",
prz->corrected_bytes, prz->bad_blocks);
else
- ret = snprintf(str, len, "\nNo errors detected\n");
+ ret = snprintf(str, len, "\nECC: No errors detected\n");
return ret;
}
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 2db8bcf7ff85..622c844f6d11 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -86,16 +86,17 @@ static int squashfs_bio_read(struct super_block *sb, u64 index, int length,
int error, i;
struct bio *bio;
- if (page_count <= BIO_MAX_VECS)
- bio = bio_alloc(GFP_NOIO, page_count);
- else
+ if (page_count <= BIO_MAX_VECS) {
+ bio = bio_alloc(sb->s_bdev, page_count, REQ_OP_READ, GFP_NOIO);
+ } else {
bio = bio_kmalloc(GFP_NOIO, page_count);
+ if (!bio)
+ return -ENOMEM;
+ bio_set_dev(bio, sb->s_bdev);
+ bio->bi_opf = REQ_OP_READ;
+ }
if (!bio)
return -ENOMEM;
- bio_set_dev(bio, sb->s_bdev);
- bio->bi_opf = READ;
bio->bi_iter.bi_sector = block * (msblk->devblksize >> SECTOR_SHIFT);
for (i = 0; i < page_count; ++i) {
diff --git a/fs/stat.c b/fs/stat.c
index 28d2020ba1f4..7f734be0e57e 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -184,6 +184,20 @@ int vfs_fstat(int fd, struct kstat *stat)
return error;
}
+int getname_statx_lookup_flags(int flags)
+{
+ int lookup_flags = 0;
+
+ if (!(flags & AT_SYMLINK_NOFOLLOW))
+ lookup_flags |= LOOKUP_FOLLOW;
+ if (!(flags & AT_NO_AUTOMOUNT))
+ lookup_flags |= LOOKUP_AUTOMOUNT;
+ if (flags & AT_EMPTY_PATH)
+ lookup_flags |= LOOKUP_EMPTY;
+
+ return lookup_flags;
+}
+
/**
* vfs_statx - Get basic and extra attributes by filename
* @dfd: A file descriptor representing the base dir for a relative filename
@@ -199,26 +213,19 @@ int vfs_fstat(int fd, struct kstat *stat)
*
* 0 will be returned on success, and a -ve error code if unsuccessful.
*/
-static int vfs_statx(int dfd, const char __user *filename, int flags,
+static int vfs_statx(int dfd, struct filename *filename, int flags,
struct kstat *stat, u32 request_mask)
{
struct path path;
- unsigned lookup_flags = 0;
+ unsigned int lookup_flags = getname_statx_lookup_flags(flags);
int error;
if (flags & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT | AT_EMPTY_PATH |
AT_STATX_SYNC_TYPE))
return -EINVAL;
- if (!(flags & AT_SYMLINK_NOFOLLOW))
- lookup_flags |= LOOKUP_FOLLOW;
- if (!(flags & AT_NO_AUTOMOUNT))
- lookup_flags |= LOOKUP_AUTOMOUNT;
- if (flags & AT_EMPTY_PATH)
- lookup_flags |= LOOKUP_EMPTY;
-
retry:
- error = user_path_at(dfd, filename, lookup_flags, &path);
+ error = filename_lookup(dfd, filename, lookup_flags, &path, NULL);
if (error)
goto out;
@@ -240,8 +247,15 @@ out:
int vfs_fstatat(int dfd, const char __user *filename,
struct kstat *stat, int flags)
{
- return vfs_statx(dfd, filename, flags | AT_NO_AUTOMOUNT,
- stat, STATX_BASIC_STATS);
+ int ret;
+ int statx_flags = flags | AT_NO_AUTOMOUNT;
+ struct filename *name;
+
+ name = getname_flags(filename, getname_statx_lookup_flags(statx_flags), NULL);
+ ret = vfs_statx(dfd, name, statx_flags, stat, STATX_BASIC_STATS);
+ putname(name);
+
+ return ret;
}
#ifdef __ARCH_WANT_OLD_STAT
@@ -602,7 +616,7 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer)
return copy_to_user(buffer, &tmp, sizeof(tmp)) ? -EFAULT : 0;
}
-int do_statx(int dfd, const char __user *filename, unsigned flags,
+int do_statx(int dfd, struct filename *filename, unsigned int flags,
unsigned int mask, struct statx __user *buffer)
{
struct kstat stat;
@@ -636,7 +650,14 @@ SYSCALL_DEFINE5(statx,
unsigned int, mask,
struct statx __user *, buffer)
{
- return do_statx(dfd, filename, flags, mask, buffer);
+ int ret;
+ struct filename *name;
+
+ name = getname_flags(filename, getname_statx_lookup_flags(flags), NULL);
+ ret = do_statx(dfd, name, flags, mask, buffer);
+ putname(name);
+
+ return ret;
}
#ifdef CONFIG_COMPAT
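Pulling name resolution out of vfs_statx() lets callers that already hold a struct filename (io_uring's statx path, for one) avoid a second copy from userspace. A hedged sketch of the resulting pattern; the explicit IS_ERR() check is part of the sketch, while the patch itself relies on the lookup path rejecting ERR_PTR names:

/* Sketch: resolve the pathname once, use it, release it. */
static int example_statx(int dfd, const char __user *pathname, int flags,
			 unsigned int mask, struct statx __user *buffer)
{
	struct filename *name;
	int ret;

	name = getname_flags(pathname, getname_statx_lookup_flags(flags),
			     NULL);
	if (IS_ERR(name))
		return PTR_ERR(name);
	ret = do_statx(dfd, name, flags, mask, buffer);
	putname(name);
	return ret;
}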
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index bafc02bf8220..de7252715b12 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -264,7 +264,6 @@ static int tracefs_parse_options(char *data, struct tracefs_mount_opts *opts)
if (!gid_valid(gid))
return -EINVAL;
opts->gid = gid;
- set_gid(tracefs_mount->mnt_root, gid);
break;
case Opt_mode:
if (match_octal(&args[0], &option))
@@ -291,7 +290,9 @@ static int tracefs_apply_options(struct super_block *sb)
inode->i_mode |= opts->mode;
inode->i_uid = opts->uid;
- inode->i_gid = opts->gid;
+
+ /* Set all the group ids to the mount option */
+ set_gid(sb->s_root, opts->gid);
return 0;
}
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index e26b10132d47..8e03b3d3f5fa 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -878,7 +878,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
new_flags, vma->anon_vma,
vma->vm_file, vma->vm_pgoff,
vma_policy(vma),
- NULL_VM_UFFD_CTX, vma_anon_name(vma));
+ NULL_VM_UFFD_CTX, anon_vma_name(vma));
if (prev)
vma = prev;
else
@@ -1438,7 +1438,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
vma->anon_vma, vma->vm_file, vma->vm_pgoff,
vma_policy(vma),
((struct vm_userfaultfd_ctx){ ctx }),
- vma_anon_name(vma));
+ anon_vma_name(vma));
if (prev) {
vma = prev;
goto next;
@@ -1615,7 +1615,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
prev = vma_merge(mm, prev, start, vma_end, new_flags,
vma->anon_vma, vma->vm_file, vma->vm_pgoff,
vma_policy(vma),
- NULL_VM_UFFD_CTX, vma_anon_name(vma));
+ NULL_VM_UFFD_CTX, anon_vma_name(vma));
if (prev) {
vma = prev;
goto next;
diff --git a/fs/xfs/xfs_bio_io.c b/fs/xfs/xfs_bio_io.c
index 667e297f59b1..32fa02945f73 100644
--- a/fs/xfs/xfs_bio_io.c
+++ b/fs/xfs/xfs_bio_io.c
@@ -36,9 +36,7 @@ xfs_flush_bdev_async(
return;
}
- bio_init(bio, NULL, 0);
- bio_set_dev(bio, bdev);
- bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
+ bio_init(bio, bdev, NULL, 0, REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC);
bio->bi_private = done;
bio->bi_end_io = xfs_flush_bdev_async_endio;
@@ -61,10 +59,9 @@ xfs_rw_bdev(
if (is_vmalloc && op == REQ_OP_WRITE)
flush_kernel_vmap_range(data, count);
- bio = bio_alloc(GFP_KERNEL, bio_max_vecs(left));
- bio_set_dev(bio, bdev);
+ bio = bio_alloc(bdev, bio_max_vecs(left), op | REQ_META | REQ_SYNC,
+ GFP_KERNEL);
bio->bi_iter.bi_sector = sector;
- bio->bi_opf = op | REQ_META | REQ_SYNC;
do {
struct page *page = kmem_to_page(data);
@@ -74,10 +71,9 @@ xfs_rw_bdev(
while (bio_add_page(bio, page, len, off) != len) {
struct bio *prev = bio;
- bio = bio_alloc(GFP_KERNEL, bio_max_vecs(left));
- bio_copy_dev(bio, prev);
+ bio = bio_alloc(prev->bi_bdev, bio_max_vecs(left),
+ prev->bi_opf, GFP_KERNEL);
bio->bi_iter.bi_sector = bio_end_sector(prev);
- bio->bi_opf = prev->bi_opf;
bio_chain(prev, bio);
submit_bio(prev);
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index b45e0d50a405..ae87fd95b17e 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1440,12 +1440,10 @@ next_chunk:
atomic_inc(&bp->b_io_remaining);
nr_pages = bio_max_segs(total_nr_pages);
- bio = bio_alloc(GFP_NOIO, nr_pages);
- bio_set_dev(bio, bp->b_target->bt_bdev);
+ bio = bio_alloc(bp->b_target->bt_bdev, nr_pages, op, GFP_NOIO);
bio->bi_iter.bi_sector = sector;
bio->bi_end_io = xfs_buf_bio_end_io;
bio->bi_private = bp;
- bio->bi_opf = op;
for (; size && nr_pages; nr_pages--, page_index++) {
int rbytes, nbytes = PAGE_SIZE - offset;
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 89fec9a18c34..16f9edbda4eb 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1883,19 +1883,19 @@ xlog_write_iclog(
return;
}
- bio_init(&iclog->ic_bio, iclog->ic_bvec, howmany(count, PAGE_SIZE));
- bio_set_dev(&iclog->ic_bio, log->l_targ->bt_bdev);
- iclog->ic_bio.bi_iter.bi_sector = log->l_logBBstart + bno;
- iclog->ic_bio.bi_end_io = xlog_bio_end_io;
- iclog->ic_bio.bi_private = iclog;
-
/*
* We use REQ_SYNC | REQ_IDLE here to tell the block layer there are more
* IOs coming immediately after this one. This prevents the block layer
* writeback throttle from throttling log writes behind background
* metadata writeback and causing priority inversions.
*/
- iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC | REQ_IDLE;
+ bio_init(&iclog->ic_bio, log->l_targ->bt_bdev, iclog->ic_bvec,
+ howmany(count, PAGE_SIZE),
+ REQ_OP_WRITE | REQ_META | REQ_SYNC | REQ_IDLE);
+ iclog->ic_bio.bi_iter.bi_sector = log->l_logBBstart + bno;
+ iclog->ic_bio.bi_end_io = xlog_bio_end_io;
+ iclog->ic_bio.bi_private = iclog;
+
if (iclog->ic_flags & XLOG_ICL_NEED_FLUSH) {
iclog->ic_bio.bi_opf |= REQ_PREFLUSH;
/*
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 4c0dee78b2f8..d84714e4e46a 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1753,6 +1753,11 @@ xfs_remount_ro(
};
int error;
+ /* Flush all the dirty data to disk. */
+ error = sync_filesystem(mp->m_super);
+ if (error)
+ return error;
+
/*
* Cancel background eofb scanning so it cannot race with the final
* log force+buftarg wait and deadlock the remount.
@@ -1831,8 +1836,6 @@ xfs_fs_reconfigure(
if (error)
return error;
- sync_filesystem(mp->m_super);
-
/* inode32 -> inode64 */
if (xfs_has_small_inums(mp) && !xfs_has_small_inums(new_mp)) {
mp->m_features &= ~XFS_FEAT_SMALL_INUMS;
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index b76dfb310ab6..d331b52592a0 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -692,12 +692,11 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from)
if (!nr_pages)
return 0;
- bio = bio_alloc(GFP_NOFS, nr_pages);
- bio_set_dev(bio, bdev);
+ bio = bio_alloc(bdev, nr_pages,
+ REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE, GFP_NOFS);
bio->bi_iter.bi_sector = zi->i_zsector;
bio->bi_write_hint = iocb->ki_hint;
bio->bi_ioprio = iocb->ki_ioprio;
- bio->bi_opf = REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE;
if (iocb->ki_flags & IOCB_DSYNC)
bio->bi_opf |= REQ_FUA;
@@ -1541,10 +1540,8 @@ static int zonefs_read_super(struct super_block *sb)
if (!page)
return -ENOMEM;
- bio_init(&bio, &bio_vec, 1);
+ bio_init(&bio, sb->s_bdev, &bio_vec, 1, REQ_OP_READ);
bio.bi_iter.bi_sector = 0;
- bio.bi_opf = REQ_OP_READ;
- bio_set_dev(&bio, sb->s_bdev);
bio_add_page(&bio, page, PAGE_SIZE, 0);
ret = submit_bio_wait(&bio);
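The zonefs hunk shows the on-stack variant of the same interface change: bio_init() now also takes the device and opf at initialization time. Consolidated sketch of a short synchronous read built that way:

/* Illustrative only: one-page synchronous read with an on-stack bio. */
static int example_sync_read(struct block_device *bdev, struct page *page)
{
	struct bio_vec bvec;
	struct bio bio;

	bio_init(&bio, bdev, &bvec, 1, REQ_OP_READ);
	bio.bi_iter.bi_sector = 0;		/* first sector of the device */
	bio_add_page(&bio, page, PAGE_SIZE, 0);
	return submit_bio_wait(&bio);		/* waits for completion */
}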