aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/Makefile3
-rw-r--r--fs/afs/cell.c1
-rw-r--r--fs/aio.c94
-rw-r--r--fs/autofs/autofs_i.h3
-rw-r--r--fs/autofs/inode.c19
-rw-r--r--fs/binfmt_aout.c83
-rw-r--r--fs/binfmt_elf.c32
-rw-r--r--fs/binfmt_script.c57
-rw-r--r--fs/btrfs/acl.c9
-rw-r--r--fs/btrfs/async-thread.c10
-rw-r--r--fs/btrfs/backref.c22
-rw-r--r--fs/btrfs/compression.c253
-rw-r--r--fs/btrfs/compression.h52
-rw-r--r--fs/btrfs/ctree.c74
-rw-r--r--fs/btrfs/ctree.h95
-rw-r--r--fs/btrfs/delayed-ref.c15
-rw-r--r--fs/btrfs/delayed-ref.h11
-rw-r--r--fs/btrfs/dev-replace.c9
-rw-r--r--fs/btrfs/disk-io.c39
-rw-r--r--fs/btrfs/extent-tree.c292
-rw-r--r--fs/btrfs/extent_io.c89
-rw-r--r--fs/btrfs/extent_io.h15
-rw-r--r--fs/btrfs/extent_map.c5
-rw-r--r--fs/btrfs/extent_map.h1
-rw-r--r--fs/btrfs/file.c3
-rw-r--r--fs/btrfs/inode.c207
-rw-r--r--fs/btrfs/ioctl.c60
-rw-r--r--fs/btrfs/locking.c108
-rw-r--r--fs/btrfs/locking.h15
-rw-r--r--fs/btrfs/lzo.c31
-rw-r--r--fs/btrfs/qgroup.c372
-rw-r--r--fs/btrfs/qgroup.h120
-rw-r--r--fs/btrfs/ref-verify.c4
-rw-r--r--fs/btrfs/relocation.c119
-rw-r--r--fs/btrfs/root-tree.c4
-rw-r--r--fs/btrfs/scrub.c49
-rw-r--r--fs/btrfs/super.c13
-rw-r--r--fs/btrfs/transaction.c9
-rw-r--r--fs/btrfs/tree-defrag.c2
-rw-r--r--fs/btrfs/tree-log.c282
-rw-r--r--fs/btrfs/volumes.c202
-rw-r--r--fs/btrfs/volumes.h5
-rw-r--r--fs/btrfs/zlib.c45
-rw-r--r--fs/btrfs/zstd.c316
-rw-r--r--fs/ceph/snap.c3
-rw-r--r--fs/crypto/keyinfo.c4
-rw-r--r--fs/debugfs/inode.c4
-rw-r--r--fs/dlm/lowcomms.c4
-rw-r--r--fs/ecryptfs/crypto.c5
-rw-r--r--fs/eventpoll.c173
-rw-r--r--fs/exec.c15
-rw-r--r--fs/ext2/dir.c35
-rw-r--r--fs/ext2/ext2.h17
-rw-r--r--fs/ext2/file.c1
-rw-r--r--fs/ext2/ialloc.c2
-rw-r--r--fs/ext2/inode.c30
-rw-r--r--fs/ext2/namei.c2
-rw-r--r--fs/ext2/super.c44
-rw-r--r--fs/ext2/symlink.c2
-rw-r--r--fs/ext2/xattr.c1
-rw-r--r--fs/ext4/fsync.c13
-rw-r--r--fs/f2fs/debug.c20
-rw-r--r--fs/f2fs/f2fs.h4
-rw-r--r--fs/f2fs/super.c5
-rw-r--r--fs/fat/file.c1
-rw-r--r--fs/file.c1
-rw-r--r--fs/fs_types.c105
-rw-r--r--fs/gfs2/glops.c1
-rw-r--r--fs/gfs2/log.c4
-rw-r--r--fs/gfs2/lops.c191
-rw-r--r--fs/gfs2/lops.h4
-rw-r--r--fs/gfs2/ops_fstype.c1
-rw-r--r--fs/gfs2/recovery.c123
-rw-r--r--fs/gfs2/recovery.h2
-rw-r--r--fs/gfs2/super.c1
-rw-r--r--fs/hugetlbfs/inode.c14
-rw-r--r--fs/inode.c15
-rw-r--r--fs/kernfs/dir.c2
-rw-r--r--fs/kernfs/file.c31
-rw-r--r--fs/kernfs/inode.c2
-rw-r--r--fs/kernfs/kernfs-internal.h2
-rw-r--r--fs/kernfs/mount.c15
-rw-r--r--fs/locks.c32
-rw-r--r--fs/namei.c4
-rw-r--r--fs/namespace.c6
-rw-r--r--fs/nfs/nfs4idmap.c31
-rw-r--r--fs/nfs/write.c11
-rw-r--r--fs/nfsd/nfsctl.c4
-rw-r--r--fs/notify/fanotify/Kconfig1
-rw-r--r--fs/notify/fanotify/fanotify.c267
-rw-r--r--fs/notify/fanotify/fanotify.h116
-rw-r--r--fs/notify/fanotify/fanotify_user.c373
-rw-r--r--fs/notify/fsnotify.c15
-rw-r--r--fs/notify/inotify/inotify.h1
-rw-r--r--fs/notify/inotify/inotify_fsnotify.c18
-rw-r--r--fs/notify/inotify/inotify_user.c5
-rw-r--r--fs/notify/mark.c42
-rw-r--r--fs/notify/notification.c42
-rw-r--r--fs/ocfs2/alloc.c159
-rw-r--r--fs/ocfs2/cluster/nodemanager.c14
-rw-r--r--fs/ocfs2/dlmglue.c5
-rw-r--r--fs/ocfs2/ocfs2.h1
-rw-r--r--fs/ocfs2/ocfs2_trace.h2
-rw-r--r--fs/ocfs2/slot_map.c8
-rw-r--r--fs/ocfs2/super.c2
-rw-r--r--fs/orangefs/file.c4
-rw-r--r--fs/pipe.c3
-rw-r--r--fs/proc/array.c16
-rw-r--r--fs/proc/base.c78
-rw-r--r--fs/proc/internal.h3
-rw-r--r--fs/proc/page.c4
-rw-r--r--fs/proc/root.c2
-rw-r--r--fs/proc/self.c16
-rw-r--r--fs/proc/stat.c89
-rw-r--r--fs/proc/task_mmu.c30
-rw-r--r--fs/proc/task_nommu.c4
-rw-r--r--fs/proc/thread_self.c16
-rw-r--r--fs/read_write.c6
-rw-r--r--fs/select.c4
-rw-r--r--fs/splice.c14
-rw-r--r--fs/statfs.c14
-rw-r--r--fs/sysfs/file.c2
-rw-r--r--fs/timerfd.c4
-rw-r--r--fs/udf/super.c51
-rw-r--r--fs/utimes.c10
-rw-r--r--fs/xfs/libxfs/xfs_ag.c6
-rw-r--r--fs/xfs/libxfs/xfs_ag_resv.c2
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c12
-rw-r--r--fs/xfs/libxfs/xfs_alloc_btree.c74
-rw-r--r--fs/xfs/libxfs/xfs_attr.c17
-rw-r--r--fs/xfs/libxfs/xfs_attr.h2
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c21
-rw-r--r--fs/xfs/libxfs/xfs_attr_remote.c8
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c302
-rw-r--r--fs/xfs/libxfs/xfs_bmap.h16
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.c13
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.c49
-rw-r--r--fs/xfs/libxfs/xfs_da_format.h3
-rw-r--r--fs/xfs/libxfs/xfs_dir2.c17
-rw-r--r--fs/xfs/libxfs/xfs_dir2.h1
-rw-r--r--fs/xfs/libxfs/xfs_dir2_block.c10
-rw-r--r--fs/xfs/libxfs/xfs_dir2_data.c12
-rw-r--r--fs/xfs/libxfs/xfs_dir2_leaf.c100
-rw-r--r--fs/xfs/libxfs/xfs_dir2_node.c10
-rw-r--r--fs/xfs/libxfs/xfs_dquot_buf.c4
-rw-r--r--fs/xfs/libxfs/xfs_errortag.h4
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c3
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.c29
-rw-r--r--fs/xfs/libxfs/xfs_iext_tree.c13
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c11
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.h2
-rw-r--r--fs/xfs/libxfs/xfs_refcount_btree.c3
-rw-r--r--fs/xfs/libxfs/xfs_rmap_btree.c3
-rw-r--r--fs/xfs/libxfs/xfs_sb.c7
-rw-r--r--fs/xfs/libxfs/xfs_shared.h4
-rw-r--r--fs/xfs/libxfs/xfs_symlink_remote.c3
-rw-r--r--fs/xfs/libxfs/xfs_types.c24
-rw-r--r--fs/xfs/libxfs/xfs_types.h3
-rw-r--r--fs/xfs/scrub/agheader.c10
-rw-r--r--fs/xfs/scrub/agheader_repair.c12
-rw-r--r--fs/xfs/scrub/attr.c11
-rw-r--r--fs/xfs/scrub/bmap.c27
-rw-r--r--fs/xfs/scrub/dir.c6
-rw-r--r--fs/xfs/scrub/ialloc.c330
-rw-r--r--fs/xfs/scrub/repair.c3
-rw-r--r--fs/xfs/scrub/repair.h3
-rw-r--r--fs/xfs/scrub/rtbitmap.c5
-rw-r--r--fs/xfs/scrub/trace.h45
-rw-r--r--fs/xfs/xfs_aops.c266
-rw-r--r--fs/xfs/xfs_aops.h24
-rw-r--r--fs/xfs/xfs_attr_list.c1
-rw-r--r--fs/xfs/xfs_bmap_util.c9
-rw-r--r--fs/xfs/xfs_buf.c72
-rw-r--r--fs/xfs/xfs_buf.h8
-rw-r--r--fs/xfs/xfs_error.c6
-rw-r--r--fs/xfs/xfs_error.h1
-rw-r--r--fs/xfs/xfs_file.c31
-rw-r--r--fs/xfs/xfs_fsops.c1
-rw-r--r--fs/xfs/xfs_globals.c2
-rw-r--r--fs/xfs/xfs_inode.c769
-rw-r--r--fs/xfs/xfs_inode.h3
-rw-r--r--fs/xfs/xfs_iomap.c518
-rw-r--r--fs/xfs/xfs_iomap.h7
-rw-r--r--fs/xfs/xfs_iops.c21
-rw-r--r--fs/xfs/xfs_log_recover.c14
-rw-r--r--fs/xfs/xfs_mount.c5
-rw-r--r--fs/xfs/xfs_mount.h10
-rw-r--r--fs/xfs/xfs_ondisk.h21
-rw-r--r--fs/xfs/xfs_pnfs.c2
-rw-r--r--fs/xfs/xfs_reflink.c150
-rw-r--r--fs/xfs/xfs_reflink.h18
-rw-r--r--fs/xfs/xfs_super.c22
-rw-r--r--fs/xfs/xfs_sysctl.h1
-rw-r--r--fs/xfs/xfs_sysfs.c24
-rw-r--r--fs/xfs/xfs_trace.h115
-rw-r--r--fs/xfs/xfs_trans_bmap.c1
-rw-r--r--fs/xfs/xfs_trans_buf.c2
-rw-r--r--fs/xfs/xfs_trans_extfree.c1
-rw-r--r--fs/xfs/xfs_trans_refcount.c1
-rw-r--r--fs/xfs/xfs_trans_rmap.c1
-rw-r--r--fs/xfs/xfs_xattr.c3
201 files changed, 5828 insertions, 3219 deletions
diff --git a/fs/Makefile b/fs/Makefile
index 293733f61594..23fcd8c164a3 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -12,7 +12,8 @@ obj-y := open.o read_write.o file_table.o super.o \
attr.o bad_inode.o file.o filesystems.o namespace.o \
seq_file.o xattr.o libfs.o fs-writeback.o \
pnode.o splice.o sync.o utimes.o d_path.o \
- stack.o fs_struct.o statfs.o fs_pin.o nsfs.o
+ stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \
+ fs_types.o
ifeq ($(CONFIG_BLOCK),y)
obj-y += buffer.o block_dev.o direct-io.o mpage.o
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index cf445dbd5f2e..9de46116c749 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -173,6 +173,7 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net,
rcu_assign_pointer(cell->vl_servers, vllist);
cell->dns_expiry = TIME64_MAX;
+ __clear_bit(AFS_CELL_FL_NO_LOOKUP_YET, &cell->flags);
} else {
cell->dns_expiry = ktime_get_real_seconds();
}
diff --git a/fs/aio.c b/fs/aio.c
index aaaaf4d12c73..38b741aef0bf 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -167,9 +167,13 @@ struct kioctx {
unsigned id;
};
+/*
+ * First field must be the file pointer in all the
+ * iocb unions! See also 'struct kiocb' in <linux/fs.h>
+ */
struct fsync_iocb {
- struct work_struct work;
struct file *file;
+ struct work_struct work;
bool datasync;
};
@@ -183,8 +187,15 @@ struct poll_iocb {
struct work_struct work;
};
+/*
+ * NOTE! Each of the iocb union members has the file pointer
+ * as the first entry in their struct definition. So you can
+ * access the file pointer through any of the sub-structs,
+ * or directly as just 'ki_filp' in this struct.
+ */
struct aio_kiocb {
union {
+ struct file *ki_filp;
struct kiocb rw;
struct fsync_iocb fsync;
struct poll_iocb poll;
@@ -1060,6 +1071,8 @@ static inline void iocb_put(struct aio_kiocb *iocb)
{
if (refcount_read(&iocb->ki_refcnt) == 0 ||
refcount_dec_and_test(&iocb->ki_refcnt)) {
+ if (iocb->ki_filp)
+ fput(iocb->ki_filp);
percpu_ref_put(&iocb->ki_ctx->reqs);
kmem_cache_free(kiocb_cachep, iocb);
}
@@ -1424,7 +1437,6 @@ static void aio_complete_rw(struct kiocb *kiocb, long res, long res2)
file_end_write(kiocb->ki_filp);
}
- fput(kiocb->ki_filp);
aio_complete(iocb, res, res2);
}
@@ -1432,9 +1444,6 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
{
int ret;
- req->ki_filp = fget(iocb->aio_fildes);
- if (unlikely(!req->ki_filp))
- return -EBADF;
req->ki_complete = aio_complete_rw;
req->private = NULL;
req->ki_pos = iocb->aio_offset;
@@ -1451,7 +1460,7 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
ret = ioprio_check_cap(iocb->aio_reqprio);
if (ret) {
pr_debug("aio ioprio check cap error: %d\n", ret);
- goto out_fput;
+ return ret;
}
req->ki_ioprio = iocb->aio_reqprio;
@@ -1460,14 +1469,10 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags);
if (unlikely(ret))
- goto out_fput;
+ return ret;
req->ki_flags &= ~IOCB_HIPRI; /* no one is going to poll for this I/O */
return 0;
-
-out_fput:
- fput(req->ki_filp);
- return ret;
}
static int aio_setup_rw(int rw, const struct iocb *iocb, struct iovec **iovec,
@@ -1521,24 +1526,19 @@ static ssize_t aio_read(struct kiocb *req, const struct iocb *iocb,
if (ret)
return ret;
file = req->ki_filp;
-
- ret = -EBADF;
if (unlikely(!(file->f_mode & FMODE_READ)))
- goto out_fput;
+ return -EBADF;
ret = -EINVAL;
if (unlikely(!file->f_op->read_iter))
- goto out_fput;
+ return -EINVAL;
ret = aio_setup_rw(READ, iocb, &iovec, vectored, compat, &iter);
if (ret)
- goto out_fput;
+ return ret;
ret = rw_verify_area(READ, file, &req->ki_pos, iov_iter_count(&iter));
if (!ret)
aio_rw_done(req, call_read_iter(file, req, &iter));
kfree(iovec);
-out_fput:
- if (unlikely(ret))
- fput(file);
return ret;
}
@@ -1555,16 +1555,14 @@ static ssize_t aio_write(struct kiocb *req, const struct iocb *iocb,
return ret;
file = req->ki_filp;
- ret = -EBADF;
if (unlikely(!(file->f_mode & FMODE_WRITE)))
- goto out_fput;
- ret = -EINVAL;
+ return -EBADF;
if (unlikely(!file->f_op->write_iter))
- goto out_fput;
+ return -EINVAL;
ret = aio_setup_rw(WRITE, iocb, &iovec, vectored, compat, &iter);
if (ret)
- goto out_fput;
+ return ret;
ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter));
if (!ret) {
/*
@@ -1582,9 +1580,6 @@ static ssize_t aio_write(struct kiocb *req, const struct iocb *iocb,
aio_rw_done(req, call_write_iter(file, req, &iter));
}
kfree(iovec);
-out_fput:
- if (unlikely(ret))
- fput(file);
return ret;
}
@@ -1594,7 +1589,6 @@ static void aio_fsync_work(struct work_struct *work)
int ret;
ret = vfs_fsync(req->file, req->datasync);
- fput(req->file);
aio_complete(container_of(req, struct aio_kiocb, fsync), ret, 0);
}
@@ -1605,13 +1599,8 @@ static int aio_fsync(struct fsync_iocb *req, const struct iocb *iocb,
iocb->aio_rw_flags))
return -EINVAL;
- req->file = fget(iocb->aio_fildes);
- if (unlikely(!req->file))
- return -EBADF;
- if (unlikely(!req->file->f_op->fsync)) {
- fput(req->file);
+ if (unlikely(!req->file->f_op->fsync))
return -EINVAL;
- }
req->datasync = datasync;
INIT_WORK(&req->work, aio_fsync_work);
@@ -1621,10 +1610,7 @@ static int aio_fsync(struct fsync_iocb *req, const struct iocb *iocb,
static inline void aio_poll_complete(struct aio_kiocb *iocb, __poll_t mask)
{
- struct file *file = iocb->poll.file;
-
aio_complete(iocb, mangle_poll(mask), 0);
- fput(file);
}
static void aio_poll_complete_work(struct work_struct *work)
@@ -1680,6 +1666,7 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
struct poll_iocb *req = container_of(wait, struct poll_iocb, wait);
struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll);
__poll_t mask = key_to_poll(key);
+ unsigned long flags;
req->woken = true;
@@ -1688,10 +1675,15 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
if (!(mask & req->events))
return 0;
- /* try to complete the iocb inline if we can: */
- if (spin_trylock(&iocb->ki_ctx->ctx_lock)) {
+ /*
+ * Try to complete the iocb inline if we can. Use
+ * irqsave/irqrestore because not all filesystems (e.g. fuse)
+ * call this function with IRQs disabled and because IRQs
+ * have to be disabled before ctx_lock is obtained.
+ */
+ if (spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) {
list_del(&iocb->ki_list);
- spin_unlock(&iocb->ki_ctx->ctx_lock);
+ spin_unlock_irqrestore(&iocb->ki_ctx->ctx_lock, flags);
list_del_init(&req->wait.entry);
aio_poll_complete(iocb, mask);
@@ -1743,9 +1735,6 @@ static ssize_t aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
INIT_WORK(&req->work, aio_poll_complete_work);
req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP;
- req->file = fget(iocb->aio_fildes);
- if (unlikely(!req->file))
- return -EBADF;
req->head = NULL;
req->woken = false;
@@ -1788,10 +1777,8 @@ static ssize_t aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
spin_unlock_irq(&ctx->ctx_lock);
out:
- if (unlikely(apt.error)) {
- fput(req->file);
+ if (unlikely(apt.error))
return apt.error;
- }
if (mask)
aio_poll_complete(aiocb, mask);
@@ -1829,6 +1816,11 @@ static int __io_submit_one(struct kioctx *ctx, const struct iocb *iocb,
if (unlikely(!req))
goto out_put_reqs_available;
+ req->ki_filp = fget(iocb->aio_fildes);
+ ret = -EBADF;
+ if (unlikely(!req->ki_filp))
+ goto out_put_req;
+
if (iocb->aio_flags & IOCB_FLAG_RESFD) {
/*
* If the IOCB_FLAG_RESFD flag of aio_flags is set, get an
@@ -2199,11 +2191,11 @@ SYSCALL_DEFINE6(io_pgetevents_time32,
#if defined(CONFIG_COMPAT_32BIT_TIME)
-COMPAT_SYSCALL_DEFINE5(io_getevents, compat_aio_context_t, ctx_id,
- compat_long_t, min_nr,
- compat_long_t, nr,
- struct io_event __user *, events,
- struct old_timespec32 __user *, timeout)
+SYSCALL_DEFINE5(io_getevents_time32, __u32, ctx_id,
+ __s32, min_nr,
+ __s32, nr,
+ struct io_event __user *, events,
+ struct old_timespec32 __user *, timeout)
{
struct timespec64 t;
int ret;
diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h
index 3e59f0ed777b..70c132acdab1 100644
--- a/fs/autofs/autofs_i.h
+++ b/fs/autofs/autofs_i.h
@@ -105,6 +105,7 @@ struct autofs_wait_queue {
#define AUTOFS_SBI_CATATONIC 0x0001
#define AUTOFS_SBI_STRICTEXPIRE 0x0002
+#define AUTOFS_SBI_IGNORE 0x0004
struct autofs_sb_info {
u32 magic;
@@ -215,6 +216,8 @@ static inline int autofs_prepare_pipe(struct file *pipe)
return -EINVAL;
/* We want a packet pipe */
pipe->f_flags |= O_DIRECT;
+ /* We don't expect -EAGAIN */
+ pipe->f_flags &= ~O_NONBLOCK;
return 0;
}
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
index 078992eee299..80597b88718b 100644
--- a/fs/autofs/inode.c
+++ b/fs/autofs/inode.c
@@ -82,18 +82,20 @@ static int autofs_show_options(struct seq_file *m, struct dentry *root)
seq_printf(m, ",maxproto=%d", sbi->max_proto);
if (autofs_type_offset(sbi->type))
- seq_printf(m, ",offset");
+ seq_puts(m, ",offset");
else if (autofs_type_direct(sbi->type))
- seq_printf(m, ",direct");
+ seq_puts(m, ",direct");
else
- seq_printf(m, ",indirect");
+ seq_puts(m, ",indirect");
if (sbi->flags & AUTOFS_SBI_STRICTEXPIRE)
- seq_printf(m, ",strictexpire");
+ seq_puts(m, ",strictexpire");
+ if (sbi->flags & AUTOFS_SBI_IGNORE)
+ seq_puts(m, ",ignore");
#ifdef CONFIG_CHECKPOINT_RESTORE
if (sbi->pipe)
seq_printf(m, ",pipe_ino=%ld", file_inode(sbi->pipe)->i_ino);
else
- seq_printf(m, ",pipe_ino=-1");
+ seq_puts(m, ",pipe_ino=-1");
#endif
return 0;
}
@@ -111,7 +113,8 @@ static const struct super_operations autofs_sops = {
};
enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto,
- Opt_indirect, Opt_direct, Opt_offset, Opt_strictexpire};
+ Opt_indirect, Opt_direct, Opt_offset, Opt_strictexpire,
+ Opt_ignore};
static const match_table_t tokens = {
{Opt_fd, "fd=%u"},
@@ -124,6 +127,7 @@ static const match_table_t tokens = {
{Opt_direct, "direct"},
{Opt_offset, "offset"},
{Opt_strictexpire, "strictexpire"},
+ {Opt_ignore, "ignore"},
{Opt_err, NULL}
};
@@ -206,6 +210,9 @@ static int parse_options(char *options,
case Opt_strictexpire:
sbi->flags |= AUTOFS_SBI_STRICTEXPIRE;
break;
+ case Opt_ignore:
+ sbi->flags |= AUTOFS_SBI_IGNORE;
+ break;
default:
return 1;
}
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index ca9725f18e00..1fefd87eb4b4 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -29,97 +29,14 @@
#include <linux/uaccess.h>
#include <asm/cacheflush.h>
-#include <asm/a.out-core.h>
static int load_aout_binary(struct linux_binprm *);
static int load_aout_library(struct file*);
-#ifdef CONFIG_COREDUMP
-/*
- * Routine writes a core dump image in the current directory.
- * Currently only a stub-function.
- *
- * Note that setuid/setgid files won't make a core-dump if the uid/gid
- * changed due to the set[u|g]id. It's enforced by the "current->mm->dumpable"
- * field, which also makes sure the core-dumps won't be recursive if the
- * dumping of the process results in another error..
- */
-static int aout_core_dump(struct coredump_params *cprm)
-{
- mm_segment_t fs;
- int has_dumped = 0;
- void __user *dump_start;
- int dump_size;
- struct user dump;
-#ifdef __alpha__
-# define START_DATA(u) ((void __user *)u.start_data)
-#else
-# define START_DATA(u) ((void __user *)((u.u_tsize << PAGE_SHIFT) + \
- u.start_code))
-#endif
-# define START_STACK(u) ((void __user *)u.start_stack)
-
- fs = get_fs();
- set_fs(KERNEL_DS);
- has_dumped = 1;
- strncpy(dump.u_comm, current->comm, sizeof(dump.u_comm));
- dump.u_ar0 = offsetof(struct user, regs);
- dump.signal = cprm->siginfo->si_signo;
- aout_dump_thread(cprm->regs, &dump);
-
-/* If the size of the dump file exceeds the rlimit, then see what would happen
- if we wrote the stack, but not the data area. */
- if ((dump.u_dsize + dump.u_ssize+1) * PAGE_SIZE > cprm->limit)
- dump.u_dsize = 0;
-
-/* Make sure we have enough room to write the stack and data areas. */
- if ((dump.u_ssize + 1) * PAGE_SIZE > cprm->limit)
- dump.u_ssize = 0;
-
-/* make sure we actually have a data and stack area to dump */
- set_fs(USER_DS);
- if (!access_ok(START_DATA(dump), dump.u_dsize << PAGE_SHIFT))
- dump.u_dsize = 0;
- if (!access_ok(START_STACK(dump), dump.u_ssize << PAGE_SHIFT))
- dump.u_ssize = 0;
-
- set_fs(KERNEL_DS);
-/* struct user */
- if (!dump_emit(cprm, &dump, sizeof(dump)))
- goto end_coredump;
-/* Now dump all of the user data. Include malloced stuff as well */
- if (!dump_skip(cprm, PAGE_SIZE - sizeof(dump)))
- goto end_coredump;
-/* now we start writing out the user space info */
- set_fs(USER_DS);
-/* Dump the data area */
- if (dump.u_dsize != 0) {
- dump_start = START_DATA(dump);
- dump_size = dump.u_dsize << PAGE_SHIFT;
- if (!dump_emit(cprm, dump_start, dump_size))
- goto end_coredump;
- }
-/* Now prepare to dump the stack area */
- if (dump.u_ssize != 0) {
- dump_start = START_STACK(dump);
- dump_size = dump.u_ssize << PAGE_SHIFT;
- if (!dump_emit(cprm, dump_start, dump_size))
- goto end_coredump;
- }
-end_coredump:
- set_fs(fs);
- return has_dumped;
-}
-#else
-#define aout_core_dump NULL
-#endif
-
static struct linux_binfmt aout_format = {
.module = THIS_MODULE,
.load_binary = load_aout_binary,
.load_shlib = load_aout_library,
- .core_dump = aout_core_dump,
- .min_coredump = PAGE_SIZE
};
#define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE)
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 54207327f98f..7d09d125f148 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -57,8 +57,6 @@
#endif
static int load_elf_binary(struct linux_binprm *bprm);
-static unsigned long elf_map(struct file *, unsigned long, struct elf_phdr *,
- int, int, unsigned long);
#ifdef CONFIG_USELIB
static int load_elf_library(struct file *);
@@ -347,7 +345,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
#ifndef elf_map
static unsigned long elf_map(struct file *filep, unsigned long addr,
- struct elf_phdr *eppnt, int prot, int type,
+ const struct elf_phdr *eppnt, int prot, int type,
unsigned long total_size)
{
unsigned long map_addr;
@@ -387,7 +385,7 @@ static unsigned long elf_map(struct file *filep, unsigned long addr,
#endif /* !elf_map */
-static unsigned long total_mapping_size(struct elf_phdr *cmds, int nr)
+static unsigned long total_mapping_size(const struct elf_phdr *cmds, int nr)
{
int i, first_idx = -1, last_idx = -1;
@@ -414,12 +412,13 @@ static unsigned long total_mapping_size(struct elf_phdr *cmds, int nr)
* header pointed to by elf_ex, into a newly allocated array. The caller is
* responsible for freeing the allocated data. Returns an ERR_PTR upon failure.
*/
-static struct elf_phdr *load_elf_phdrs(struct elfhdr *elf_ex,
+static struct elf_phdr *load_elf_phdrs(const struct elfhdr *elf_ex,
struct file *elf_file)
{
struct elf_phdr *elf_phdata = NULL;
- int retval, size, err = -1;
+ int retval, err = -1;
loff_t pos = elf_ex->e_phoff;
+ unsigned int size;
/*
* If the size of this structure has changed, then punt, since
@@ -429,13 +428,9 @@ static struct elf_phdr *load_elf_phdrs(struct elfhdr *elf_ex,
goto out;
/* Sanity check the number of program headers... */
- if (elf_ex->e_phnum < 1 ||
- elf_ex->e_phnum > 65536U / sizeof(struct elf_phdr))
- goto out;
-
/* ...and their total size. */
size = sizeof(struct elf_phdr) * elf_ex->e_phnum;
- if (size > ELF_MIN_ALIGN)
+ if (size == 0 || size > 65536 || size > ELF_MIN_ALIGN)
goto out;
elf_phdata = kmalloc(size, GFP_KERNEL);
@@ -2033,7 +2028,6 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
struct elf_note_info *info,
const kernel_siginfo_t *siginfo, struct pt_regs *regs)
{
- struct list_head *t;
struct core_thread *ct;
struct elf_thread_status *ets;
@@ -2050,10 +2044,9 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
list_add(&ets->list, &info->thread_list);
}
- list_for_each(t, &info->thread_list) {
+ list_for_each_entry(ets, &info->thread_list, list) {
int sz;
- ets = list_entry(t, struct elf_thread_status, list);
sz = elf_dump_thread_status(siginfo->si_signo, ets);
info->thread_status_size += sz;
}
@@ -2117,20 +2110,17 @@ static size_t get_note_info_size(struct elf_note_info *info)
static int write_note_info(struct elf_note_info *info,
struct coredump_params *cprm)
{
+ struct elf_thread_status *ets;
int i;
- struct list_head *t;
for (i = 0; i < info->numnote; i++)
if (!writenote(info->notes + i, cprm))
return 0;
/* write out the thread status notes section */
- list_for_each(t, &info->thread_list) {
- struct elf_thread_status *tmp =
- list_entry(t, struct elf_thread_status, list);
-
- for (i = 0; i < tmp->num_notes; i++)
- if (!writenote(&tmp->notes[i], cprm))
+ list_for_each_entry(ets, &info->thread_list, list) {
+ for (i = 0; i < ets->num_notes; i++)
+ if (!writenote(&ets->notes[i], cprm))
return 0;
}
diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c
index d0078cbb718b..e996174cbfc0 100644
--- a/fs/binfmt_script.c
+++ b/fs/binfmt_script.c
@@ -14,13 +14,30 @@
#include <linux/err.h>
#include <linux/fs.h>
+static inline bool spacetab(char c) { return c == ' ' || c == '\t'; }
+static inline char *next_non_spacetab(char *first, const char *last)
+{
+ for (; first <= last; first++)
+ if (!spacetab(*first))
+ return first;
+ return NULL;
+}
+static inline char *next_terminator(char *first, const char *last)
+{
+ for (; first <= last; first++)
+ if (spacetab(*first) || !*first)
+ return first;
+ return NULL;
+}
+
static int load_script(struct linux_binprm *bprm)
{
const char *i_arg, *i_name;
- char *cp;
+ char *cp, *buf_end;
struct file *file;
int retval;
+ /* Not ours to exec if we don't start with "#!". */
if ((bprm->buf[0] != '#') || (bprm->buf[1] != '!'))
return -ENOEXEC;
@@ -33,23 +50,41 @@ static int load_script(struct linux_binprm *bprm)
if (bprm->interp_flags & BINPRM_FLAGS_PATH_INACCESSIBLE)
return -ENOENT;
- /*
- * This section does the #! interpretation.
- * Sorta complicated, but hopefully it will work. -TYT
- */
-
+ /* Release since we are not mapping a binary into memory. */
allow_write_access(bprm->file);
fput(bprm->file);
bprm->file = NULL;
- for (cp = bprm->buf+2;; cp++) {
- if (cp >= bprm->buf + BINPRM_BUF_SIZE)
+ /*
+ * This section handles parsing the #! line into separate
+ * interpreter path and argument strings. We must be careful
+ * because bprm->buf is not yet guaranteed to be NUL-terminated
+ * (though the buffer will have trailing NUL padding when the
+ * file size was smaller than the buffer size).
+ *
+ * We do not want to exec a truncated interpreter path, so either
+ * we find a newline (which indicates nothing is truncated), or
+ * we find a space/tab/NUL after the interpreter path (which
+ * itself may be preceded by spaces/tabs). Truncating the
+ * arguments is fine: the interpreter can re-read the script to
+ * parse them on its own.
+ */
+ buf_end = bprm->buf + sizeof(bprm->buf) - 1;
+ cp = strnchr(bprm->buf, sizeof(bprm->buf), '\n');
+ if (!cp) {
+ cp = next_non_spacetab(bprm->buf + 2, buf_end);
+ if (!cp)
+ return -ENOEXEC; /* Entire buf is spaces/tabs */
+ /*
+ * If there is no later space/tab/NUL we must assume the
+ * interpreter path is truncated.
+ */
+ if (!next_terminator(cp, buf_end))
return -ENOEXEC;
- if (!*cp || (*cp == '\n'))
- break;
+ cp = buf_end;
}
+ /* NUL-terminate the buffer and any trailing spaces/tabs. */
*cp = '\0';
-
while (cp > bprm->buf) {
cp--;
if ((*cp == ' ') || (*cp == '\t'))
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 3b66c957ea6f..5810463dc6d2 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -9,6 +9,7 @@
#include <linux/posix_acl_xattr.h>
#include <linux/posix_acl.h>
#include <linux/sched.h>
+#include <linux/sched/mm.h>
#include <linux/slab.h>
#include "ctree.h"
@@ -72,8 +73,16 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans,
}
if (acl) {
+ unsigned int nofs_flag;
+
size = posix_acl_xattr_size(acl->a_count);
+ /*
+ * We're holding a transaction handle, so use a NOFS memory
+ * allocation context to avoid deadlock if reclaim happens.
+ */
+ nofs_flag = memalloc_nofs_save();
value = kmalloc(size, GFP_KERNEL);
+ memalloc_nofs_restore(nofs_flag);
if (!value) {
ret = -ENOMEM;
goto out;
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index d522494698fa..122cb97c7909 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -139,13 +139,11 @@ __btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name,
}
if (flags & WQ_HIGHPRI)
- ret->normal_wq = alloc_workqueue("%s-%s-high", flags,
- ret->current_active, "btrfs",
- name);
+ ret->normal_wq = alloc_workqueue("btrfs-%s-high", flags,
+ ret->current_active, name);
else
- ret->normal_wq = alloc_workqueue("%s-%s", flags,
- ret->current_active, "btrfs",
- name);
+ ret->normal_wq = alloc_workqueue("btrfs-%s", flags,
+ ret->current_active, name);
if (!ret->normal_wq) {
kfree(ret);
return NULL;
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 78556447e1d5..11459fe84a29 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -712,7 +712,7 @@ out:
* read tree blocks and add keys where required.
*/
static int add_missing_keys(struct btrfs_fs_info *fs_info,
- struct preftrees *preftrees)
+ struct preftrees *preftrees, bool lock)
{
struct prelim_ref *ref;
struct extent_buffer *eb;
@@ -737,12 +737,14 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info,
free_extent_buffer(eb);
return -EIO;
}
- btrfs_tree_read_lock(eb);
+ if (lock)
+ btrfs_tree_read_lock(eb);
if (btrfs_header_level(eb) == 0)
btrfs_item_key_to_cpu(eb, &ref->key_for_search, 0);
else
btrfs_node_key_to_cpu(eb, &ref->key_for_search, 0);
- btrfs_tree_read_unlock(eb);
+ if (lock)
+ btrfs_tree_read_unlock(eb);
free_extent_buffer(eb);
prelim_ref_insert(fs_info, &preftrees->indirect, ref, NULL);
cond_resched();
@@ -1227,7 +1229,7 @@ again:
btrfs_release_path(path);
- ret = add_missing_keys(fs_info, &preftrees);
+ ret = add_missing_keys(fs_info, &preftrees, path->skip_locking == 0);
if (ret)
goto out;
@@ -1288,11 +1290,15 @@ again:
ret = -EIO;
goto out;
}
- btrfs_tree_read_lock(eb);
- btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+
+ if (!path->skip_locking) {
+ btrfs_tree_read_lock(eb);
+ btrfs_set_lock_blocking_read(eb);
+ }
ret = find_extent_in_eb(eb, bytenr,
*extent_item_pos, &eie, ignore_offset);
- btrfs_tree_read_unlock_blocking(eb);
+ if (!path->skip_locking)
+ btrfs_tree_read_unlock_blocking(eb);
free_extent_buffer(eb);
if (ret < 0)
goto out;
@@ -1650,7 +1656,7 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
/* make sure we can use eb after releasing the path */
if (eb != eb_in) {
if (!path->skip_locking)
- btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ btrfs_set_lock_blocking_read(eb);
path->nodes[0] = NULL;
path->locks[0] = 0;
}
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 6896ea60c843..4f2a8ae0aa42 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -731,6 +731,28 @@ struct heuristic_ws {
struct list_head list;
};
+static struct workspace_manager heuristic_wsm;
+
+static void heuristic_init_workspace_manager(void)
+{
+ btrfs_init_workspace_manager(&heuristic_wsm, &btrfs_heuristic_compress);
+}
+
+static void heuristic_cleanup_workspace_manager(void)
+{
+ btrfs_cleanup_workspace_manager(&heuristic_wsm);
+}
+
+static struct list_head *heuristic_get_workspace(unsigned int level)
+{
+ return btrfs_get_workspace(&heuristic_wsm, level);
+}
+
+static void heuristic_put_workspace(struct list_head *ws)
+{
+ btrfs_put_workspace(&heuristic_wsm, ws);
+}
+
static void free_heuristic_ws(struct list_head *ws)
{
struct heuristic_ws *workspace;
@@ -743,7 +765,7 @@ static void free_heuristic_ws(struct list_head *ws)
kfree(workspace);
}
-static struct list_head *alloc_heuristic_ws(void)
+static struct list_head *alloc_heuristic_ws(unsigned int level)
{
struct heuristic_ws *ws;
@@ -770,65 +792,59 @@ fail:
return ERR_PTR(-ENOMEM);
}
-struct workspaces_list {
- struct list_head idle_ws;
- spinlock_t ws_lock;
- /* Number of free workspaces */
- int free_ws;
- /* Total number of allocated workspaces */
- atomic_t total_ws;
- /* Waiters for a free workspace */
- wait_queue_head_t ws_wait;
+const struct btrfs_compress_op btrfs_heuristic_compress = {
+ .init_workspace_manager = heuristic_init_workspace_manager,
+ .cleanup_workspace_manager = heuristic_cleanup_workspace_manager,
+ .get_workspace = heuristic_get_workspace,
+ .put_workspace = heuristic_put_workspace,
+ .alloc_workspace = alloc_heuristic_ws,
+ .free_workspace = free_heuristic_ws,
};
-static struct workspaces_list btrfs_comp_ws[BTRFS_COMPRESS_TYPES];
-
-static struct workspaces_list btrfs_heuristic_ws;
-
static const struct btrfs_compress_op * const btrfs_compress_op[] = {
+ /* The heuristic is represented as compression type 0 */
+ &btrfs_heuristic_compress,
&btrfs_zlib_compress,
&btrfs_lzo_compress,
&btrfs_zstd_compress,
};
-void __init btrfs_init_compress(void)
+void btrfs_init_workspace_manager(struct workspace_manager *wsm,
+ const struct btrfs_compress_op *ops)
{
struct list_head *workspace;
- int i;
- INIT_LIST_HEAD(&btrfs_heuristic_ws.idle_ws);
- spin_lock_init(&btrfs_heuristic_ws.ws_lock);
- atomic_set(&btrfs_heuristic_ws.total_ws, 0);
- init_waitqueue_head(&btrfs_heuristic_ws.ws_wait);
+ wsm->ops = ops;
- workspace = alloc_heuristic_ws();
+ INIT_LIST_HEAD(&wsm->idle_ws);
+ spin_lock_init(&wsm->ws_lock);
+ atomic_set(&wsm->total_ws, 0);
+ init_waitqueue_head(&wsm->ws_wait);
+
+ /*
+ * Preallocate one workspace for each compression type so we can
+ * guarantee forward progress in the worst case
+ */
+ workspace = wsm->ops->alloc_workspace(0);
if (IS_ERR(workspace)) {
pr_warn(
- "BTRFS: cannot preallocate heuristic workspace, will try later\n");
+ "BTRFS: cannot preallocate compression workspace, will try later\n");
} else {
- atomic_set(&btrfs_heuristic_ws.total_ws, 1);
- btrfs_heuristic_ws.free_ws = 1;
- list_add(workspace, &btrfs_heuristic_ws.idle_ws);
+ atomic_set(&wsm->total_ws, 1);
+ wsm->free_ws = 1;
+ list_add(workspace, &wsm->idle_ws);
}
+}
- for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
- INIT_LIST_HEAD(&btrfs_comp_ws[i].idle_ws);
- spin_lock_init(&btrfs_comp_ws[i].ws_lock);
- atomic_set(&btrfs_comp_ws[i].total_ws, 0);
- init_waitqueue_head(&btrfs_comp_ws[i].ws_wait);
+void btrfs_cleanup_workspace_manager(struct workspace_manager *wsman)
+{
+ struct list_head *ws;
- /*
- * Preallocate one workspace for each compression type so
- * we can guarantee forward progress in the worst case
- */
- workspace = btrfs_compress_op[i]->alloc_workspace();
- if (IS_ERR(workspace)) {
- pr_warn("BTRFS: cannot preallocate compression workspace, will try later\n");
- } else {
- atomic_set(&btrfs_comp_ws[i].total_ws, 1);
- btrfs_comp_ws[i].free_ws = 1;
- list_add(workspace, &btrfs_comp_ws[i].idle_ws);
- }
+ while (!list_empty(&wsman->idle_ws)) {
+ ws = wsman->idle_ws.next;
+ list_del(ws);
+ wsman->ops->free_workspace(ws);
+ atomic_dec(&wsman->total_ws);
}
}
@@ -838,11 +854,11 @@ void __init btrfs_init_compress(void)
* Preallocation makes a forward progress guarantees and we do not return
* errors.
*/
-static struct list_head *__find_workspace(int type, bool heuristic)
+struct list_head *btrfs_get_workspace(struct workspace_manager *wsm,
+ unsigned int level)
{
struct list_head *workspace;
int cpus = num_online_cpus();
- int idx = type - 1;
unsigned nofs_flag;
struct list_head *idle_ws;
spinlock_t *ws_lock;
@@ -850,19 +866,11 @@ static struct list_head *__find_workspace(int type, bool heuristic)
wait_queue_head_t *ws_wait;
int *free_ws;
- if (heuristic) {
- idle_ws = &btrfs_heuristic_ws.idle_ws;
- ws_lock = &btrfs_heuristic_ws.ws_lock;
- total_ws = &btrfs_heuristic_ws.total_ws;
- ws_wait = &btrfs_heuristic_ws.ws_wait;
- free_ws = &btrfs_heuristic_ws.free_ws;
- } else {
- idle_ws = &btrfs_comp_ws[idx].idle_ws;
- ws_lock = &btrfs_comp_ws[idx].ws_lock;
- total_ws = &btrfs_comp_ws[idx].total_ws;
- ws_wait = &btrfs_comp_ws[idx].ws_wait;
- free_ws = &btrfs_comp_ws[idx].free_ws;
- }
+ idle_ws = &wsm->idle_ws;
+ ws_lock = &wsm->ws_lock;
+ total_ws = &wsm->total_ws;
+ ws_wait = &wsm->ws_wait;
+ free_ws = &wsm->free_ws;
again:
spin_lock(ws_lock);
@@ -893,10 +901,7 @@ again:
* context of btrfs_compress_bio/btrfs_compress_pages
*/
nofs_flag = memalloc_nofs_save();
- if (heuristic)
- workspace = alloc_heuristic_ws();
- else
- workspace = btrfs_compress_op[idx]->alloc_workspace();
+ workspace = wsm->ops->alloc_workspace(level);
memalloc_nofs_restore(nofs_flag);
if (IS_ERR(workspace)) {
@@ -927,85 +932,47 @@ again:
return workspace;
}
-static struct list_head *find_workspace(int type)
+static struct list_head *get_workspace(int type, int level)
{
- return __find_workspace(type, false);
+ return btrfs_compress_op[type]->get_workspace(level);
}
/*
* put a workspace struct back on the list or free it if we have enough
* idle ones sitting around
*/
-static void __free_workspace(int type, struct list_head *workspace,
- bool heuristic)
+void btrfs_put_workspace(struct workspace_manager *wsm, struct list_head *ws)
{
- int idx = type - 1;
struct list_head *idle_ws;
spinlock_t *ws_lock;
atomic_t *total_ws;
wait_queue_head_t *ws_wait;
int *free_ws;
- if (heuristic) {
- idle_ws = &btrfs_heuristic_ws.idle_ws;
- ws_lock = &btrfs_heuristic_ws.ws_lock;
- total_ws = &btrfs_heuristic_ws.total_ws;
- ws_wait = &btrfs_heuristic_ws.ws_wait;
- free_ws = &btrfs_heuristic_ws.free_ws;
- } else {
- idle_ws = &btrfs_comp_ws[idx].idle_ws;
- ws_lock = &btrfs_comp_ws[idx].ws_lock;
- total_ws = &btrfs_comp_ws[idx].total_ws;
- ws_wait = &btrfs_comp_ws[idx].ws_wait;
- free_ws = &btrfs_comp_ws[idx].free_ws;
- }
+ idle_ws = &wsm->idle_ws;
+ ws_lock = &wsm->ws_lock;
+ total_ws = &wsm->total_ws;
+ ws_wait = &wsm->ws_wait;
+ free_ws = &wsm->free_ws;
spin_lock(ws_lock);
if (*free_ws <= num_online_cpus()) {
- list_add(workspace, idle_ws);
+ list_add(ws, idle_ws);
(*free_ws)++;
spin_unlock(ws_lock);
goto wake;
}
spin_unlock(ws_lock);
- if (heuristic)
- free_heuristic_ws(workspace);
- else
- btrfs_compress_op[idx]->free_workspace(workspace);
+ wsm->ops->free_workspace(ws);
atomic_dec(total_ws);
wake:
cond_wake_up(ws_wait);
}
-static void free_workspace(int type, struct list_head *ws)
+static void put_workspace(int type, struct list_head *ws)
{
- return __free_workspace(type, ws, false);
-}
-
-/*
- * cleanup function for module exit
- */
-static void free_workspaces(void)
-{
- struct list_head *workspace;
- int i;
-
- while (!list_empty(&btrfs_heuristic_ws.idle_ws)) {
- workspace = btrfs_heuristic_ws.idle_ws.next;
- list_del(workspace);
- free_heuristic_ws(workspace);
- atomic_dec(&btrfs_heuristic_ws.total_ws);
- }
-
- for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
- while (!list_empty(&btrfs_comp_ws[i].idle_ws)) {
- workspace = btrfs_comp_ws[i].idle_ws.next;
- list_del(workspace);
- btrfs_compress_op[i]->free_workspace(workspace);
- atomic_dec(&btrfs_comp_ws[i].total_ws);
- }
- }
+ return btrfs_compress_op[type]->put_workspace(ws);
}
/*
@@ -1037,18 +1004,17 @@ int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping,
unsigned long *total_in,
unsigned long *total_out)
{
+ int type = btrfs_compress_type(type_level);
+ int level = btrfs_compress_level(type_level);
struct list_head *workspace;
int ret;
- int type = type_level & 0xF;
-
- workspace = find_workspace(type);
- btrfs_compress_op[type - 1]->set_level(workspace, type_level);
- ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping,
+ workspace = get_workspace(type, level);
+ ret = btrfs_compress_op[type]->compress_pages(workspace, mapping,
start, pages,
out_pages,
total_in, total_out);
- free_workspace(type, workspace);
+ put_workspace(type, workspace);
return ret;
}
@@ -1072,9 +1038,9 @@ static int btrfs_decompress_bio(struct compressed_bio *cb)
int ret;
int type = cb->compress_type;
- workspace = find_workspace(type);
- ret = btrfs_compress_op[type - 1]->decompress_bio(workspace, cb);
- free_workspace(type, workspace);
+ workspace = get_workspace(type, 0);
+ ret = btrfs_compress_op[type]->decompress_bio(workspace, cb);
+ put_workspace(type, workspace);
return ret;
}
@@ -1090,19 +1056,29 @@ int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
struct list_head *workspace;
int ret;
- workspace = find_workspace(type);
-
- ret = btrfs_compress_op[type-1]->decompress(workspace, data_in,
+ workspace = get_workspace(type, 0);
+ ret = btrfs_compress_op[type]->decompress(workspace, data_in,
dest_page, start_byte,
srclen, destlen);
+ put_workspace(type, workspace);
- free_workspace(type, workspace);
return ret;
}
+void __init btrfs_init_compress(void)
+{
+ int i;
+
+ for (i = 0; i < BTRFS_NR_WORKSPACE_MANAGERS; i++)
+ btrfs_compress_op[i]->init_workspace_manager();
+}
+
void __cold btrfs_exit_compress(void)
{
- free_workspaces();
+ int i;
+
+ for (i = 0; i < BTRFS_NR_WORKSPACE_MANAGERS; i++)
+ btrfs_compress_op[i]->cleanup_workspace_manager();
}
/*
@@ -1513,7 +1489,7 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end,
*/
int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end)
{
- struct list_head *ws_list = __find_workspace(0, true);
+ struct list_head *ws_list = get_workspace(0, 0);
struct heuristic_ws *ws;
u32 i;
u8 byte;
@@ -1582,18 +1558,29 @@ int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end)
}
out:
- __free_workspace(0, ws_list, true);
+ put_workspace(0, ws_list);
return ret;
}
-unsigned int btrfs_compress_str2level(const char *str)
+/*
+ * Convert the compression suffix (eg. after "zlib" starting with ":") to
+ * level, unrecognized string will set the default level
+ */
+unsigned int btrfs_compress_str2level(unsigned int type, const char *str)
{
- if (strncmp(str, "zlib", 4) != 0)
+ unsigned int level = 0;
+ int ret;
+
+ if (!type)
return 0;
- /* Accepted form: zlib:1 up to zlib:9 and nothing left after the number */
- if (str[4] == ':' && '1' <= str[5] && str[5] <= '9' && str[6] == 0)
- return str[5] - '0';
+ if (str[0] == ':') {
+ ret = kstrtouint(str + 1, 10, &level);
+ if (ret)
+ level = 0;
+ }
+
+ level = btrfs_compress_op[type]->set_level(level);
- return BTRFS_ZLIB_DEFAULT_LEVEL;
+ return level;
}
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index ddda9b80bf20..9976fe0f7526 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -64,6 +64,16 @@ struct compressed_bio {
u32 sums;
};
+static inline unsigned int btrfs_compress_type(unsigned int type_level)
+{
+ return (type_level & 0xF);
+}
+
+static inline unsigned int btrfs_compress_level(unsigned int type_level)
+{
+ return ((type_level & 0xF0) >> 4);
+}
+
void __init btrfs_init_compress(void);
void __cold btrfs_exit_compress(void);
@@ -87,7 +97,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
int mirror_num, unsigned long bio_flags);
-unsigned btrfs_compress_str2level(const char *str);
+unsigned int btrfs_compress_str2level(unsigned int type, const char *str);
enum btrfs_compression_type {
BTRFS_COMPRESS_NONE = 0,
@@ -97,8 +107,35 @@ enum btrfs_compression_type {
BTRFS_COMPRESS_TYPES = 3,
};
+struct workspace_manager {
+ const struct btrfs_compress_op *ops;
+ struct list_head idle_ws;
+ spinlock_t ws_lock;
+ /* Number of free workspaces */
+ int free_ws;
+ /* Total number of allocated workspaces */
+ atomic_t total_ws;
+ /* Waiters for a free workspace */
+ wait_queue_head_t ws_wait;
+};
+
+void btrfs_init_workspace_manager(struct workspace_manager *wsm,
+ const struct btrfs_compress_op *ops);
+struct list_head *btrfs_get_workspace(struct workspace_manager *wsm,
+ unsigned int level);
+void btrfs_put_workspace(struct workspace_manager *wsm, struct list_head *ws);
+void btrfs_cleanup_workspace_manager(struct workspace_manager *wsm);
+
struct btrfs_compress_op {
- struct list_head *(*alloc_workspace)(void);
+ void (*init_workspace_manager)(void);
+
+ void (*cleanup_workspace_manager)(void);
+
+ struct list_head *(*get_workspace)(unsigned int level);
+
+ void (*put_workspace)(struct list_head *ws);
+
+ struct list_head *(*alloc_workspace)(unsigned int level);
void (*free_workspace)(struct list_head *workspace);
@@ -119,9 +156,18 @@ struct btrfs_compress_op {
unsigned long start_byte,
size_t srclen, size_t destlen);
- void (*set_level)(struct list_head *ws, unsigned int type);
+ /*
+ * This bounds the level set by the user to be within range of a
+ * particular compression type. It returns the level that will be used
+ * if the level is out of bounds or the default if 0 is passed in.
+ */
+ unsigned int (*set_level)(unsigned int level);
};
+/* The heuristic workspaces are managed via the 0th workspace manager */
+#define BTRFS_NR_WORKSPACE_MANAGERS (BTRFS_COMPRESS_TYPES + 1)
+
+extern const struct btrfs_compress_op btrfs_heuristic_compress;
extern const struct btrfs_compress_op btrfs_zlib_compress;
extern const struct btrfs_compress_op btrfs_lzo_compress;
extern const struct btrfs_compress_op btrfs_zstd_compress;
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 5a6c39b44c84..324df36d28bf 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -13,6 +13,7 @@
#include "print-tree.h"
#include "locking.h"
#include "volumes.h"
+#include "qgroup.h"
static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
*root, struct btrfs_path *path, int level);
@@ -45,11 +46,18 @@ noinline void btrfs_set_path_blocking(struct btrfs_path *p)
for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
if (!p->nodes[i] || !p->locks[i])
continue;
- btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
- if (p->locks[i] == BTRFS_READ_LOCK)
+ /*
+ * If we currently have a spinning reader or writer lock this
+ * will bump the count of blocking holders and drop the
+ * spinlock.
+ */
+ if (p->locks[i] == BTRFS_READ_LOCK) {
+ btrfs_set_lock_blocking_read(p->nodes[i]);
p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
- else if (p->locks[i] == BTRFS_WRITE_LOCK)
+ } else if (p->locks[i] == BTRFS_WRITE_LOCK) {
+ btrfs_set_lock_blocking_write(p->nodes[i]);
p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
+ }
}
}
@@ -1288,7 +1296,7 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
return eb;
btrfs_set_path_blocking(path);
- btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ btrfs_set_lock_blocking_read(eb);
if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
BUG_ON(tm->slot != 0);
@@ -1378,7 +1386,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
free_extent_buffer(eb_root);
eb = alloc_dummy_extent_buffer(fs_info, logical);
} else {
- btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK);
+ btrfs_set_lock_blocking_read(eb_root);
eb = btrfs_clone_extent_buffer(eb_root);
btrfs_tree_read_unlock_blocking(eb_root);
free_extent_buffer(eb_root);
@@ -1486,9 +1494,16 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
search_start = buf->start & ~((u64)SZ_1G - 1);
if (parent)
- btrfs_set_lock_blocking(parent);
- btrfs_set_lock_blocking(buf);
+ btrfs_set_lock_blocking_write(parent);
+ btrfs_set_lock_blocking_write(buf);
+ /*
+ * Before CoWing this block for later modification, check if it's
+ * the subtree root and do the delayed subtree trace if needed.
+ *
+ * Also We don't care about the error, as it's handled internally.
+ */
+ btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
ret = __btrfs_cow_block(trans, root, buf, parent,
parent_slot, cow_ret, search_start, 0);
@@ -1582,7 +1597,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
if (parent_nritems <= 1)
return 0;
- btrfs_set_lock_blocking(parent);
+ btrfs_set_lock_blocking_write(parent);
for (i = start_slot; i <= end_slot; i++) {
struct btrfs_key first_key;
@@ -1641,7 +1656,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
search_start = last_block;
btrfs_tree_lock(cur);
- btrfs_set_lock_blocking(cur);
+ btrfs_set_lock_blocking_write(cur);
err = __btrfs_cow_block(trans, root, cur, parent, i,
&cur, search_start,
min(16 * blocksize,
@@ -1856,7 +1871,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
}
btrfs_tree_lock(child);
- btrfs_set_lock_blocking(child);
+ btrfs_set_lock_blocking_write(child);
ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
if (ret) {
btrfs_tree_unlock(child);
@@ -1894,7 +1909,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
if (left) {
btrfs_tree_lock(left);
- btrfs_set_lock_blocking(left);
+ btrfs_set_lock_blocking_write(left);
wret = btrfs_cow_block(trans, root, left,
parent, pslot - 1, &left);
if (wret) {
@@ -1909,7 +1924,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
if (right) {
btrfs_tree_lock(right);
- btrfs_set_lock_blocking(right);
+ btrfs_set_lock_blocking_write(right);
wret = btrfs_cow_block(trans, root, right,
parent, pslot + 1, &right);
if (wret) {
@@ -2072,7 +2087,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
u32 left_nr;
btrfs_tree_lock(left);
- btrfs_set_lock_blocking(left);
+ btrfs_set_lock_blocking_write(left);
left_nr = btrfs_header_nritems(left);
if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
@@ -2127,7 +2142,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
u32 right_nr;
btrfs_tree_lock(right);
- btrfs_set_lock_blocking(right);
+ btrfs_set_lock_blocking_write(right);
right_nr = btrfs_header_nritems(right);
if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
@@ -2529,26 +2544,6 @@ done:
return ret;
}
-static void key_search_validate(struct extent_buffer *b,
- const struct btrfs_key *key,
- int level)
-{
-#ifdef CONFIG_BTRFS_ASSERT
- struct btrfs_disk_key disk_key;
-
- btrfs_cpu_key_to_disk(&disk_key, key);
-
- if (level == 0)
- ASSERT(!memcmp_extent_buffer(b, &disk_key,
- offsetof(struct btrfs_leaf, items[0].key),
- sizeof(disk_key)));
- else
- ASSERT(!memcmp_extent_buffer(b, &disk_key,
- offsetof(struct btrfs_node, ptrs[0].key),
- sizeof(disk_key)));
-#endif
-}
-
static int key_search(struct extent_buffer *b, const struct btrfs_key *key,
int level, int *prev_cmp, int *slot)
{
@@ -2557,7 +2552,6 @@ static int key_search(struct extent_buffer *b, const struct btrfs_key *key,
return *prev_cmp;
}
- key_search_validate(b, key, level);
*slot = 0;
return 0;
@@ -3005,6 +2999,8 @@ again:
*/
prev_cmp = -1;
ret = key_search(b, key, level, &prev_cmp, &slot);
+ if (ret < 0)
+ goto done;
if (level != 0) {
int dec = 0;
@@ -3771,7 +3767,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
return 1;
btrfs_tree_lock(right);
- btrfs_set_lock_blocking(right);
+ btrfs_set_lock_blocking_write(right);
free_space = btrfs_leaf_free_space(fs_info, right);
if (free_space < data_size)
@@ -4005,7 +4001,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
return 1;
btrfs_tree_lock(left);
- btrfs_set_lock_blocking(left);
+ btrfs_set_lock_blocking_write(left);
free_space = btrfs_leaf_free_space(fs_info, left);
if (free_space < data_size) {
@@ -5156,6 +5152,10 @@ again:
nritems = btrfs_header_nritems(cur);
level = btrfs_header_level(cur);
sret = btrfs_bin_search(cur, min_key, level, &slot);
+ if (sret < 0) {
+ ret = sret;
+ goto out;
+ }
/* at the lowest level, we're done, setup the path and exit */
if (level == path->lowest_level) {
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 7a2a2621f0d9..129d26226e70 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -934,7 +934,8 @@ struct btrfs_fs_info {
spinlock_t delayed_iput_lock;
struct list_head delayed_iputs;
- struct mutex cleaner_delayed_iput_mutex;
+ atomic_t nr_delayed_iputs;
+ wait_queue_head_t delayed_iputs_wait;
/* this protects tree_mod_seq_list */
spinlock_t tree_mod_seq_lock;
@@ -1074,10 +1075,13 @@ struct btrfs_fs_info {
atomic_t scrubs_paused;
atomic_t scrub_cancel_req;
wait_queue_head_t scrub_pause_wait;
- int scrub_workers_refcnt;
+ /*
+ * The worker pointers are NULL iff the refcount is 0, ie. scrub is not
+ * running.
+ */
+ refcount_t scrub_workers_refcnt;
struct btrfs_workqueue *scrub_workers;
struct btrfs_workqueue *scrub_wr_completion_workers;
- struct btrfs_workqueue *scrub_nocow_workers;
struct btrfs_workqueue *scrub_parity_workers;
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
@@ -1199,6 +1203,24 @@ enum {
BTRFS_ROOT_MULTI_LOG_TASKS,
BTRFS_ROOT_DIRTY,
BTRFS_ROOT_DELETING,
+
+ /*
+ * Reloc tree is orphan, only kept here for qgroup delayed subtree scan
+ *
+ * Set for the subvolume tree owning the reloc tree.
+ */
+ BTRFS_ROOT_DEAD_RELOC_TREE,
+};
+
+/*
+ * Record swapped tree blocks of a subvolume tree for delayed subtree trace
+ * code. For detail check comment in fs/btrfs/qgroup.c.
+ */
+struct btrfs_qgroup_swapped_blocks {
+ spinlock_t lock;
+ /* RM_EMPTY_ROOT() of above blocks[] */
+ bool swapped;
+ struct rb_root blocks[BTRFS_MAX_LEVEL];
};
/*
@@ -1312,6 +1334,14 @@ struct btrfs_root {
u64 nr_ordered_extents;
/*
+ * Not empty if this subvolume root has gone through tree block swap
+ * (relocation)
+ *
+ * Will be used by reloc_control::dirty_subvol_roots.
+ */
+ struct list_head reloc_dirty_list;
+
+ /*
* Number of currently running SEND ioctls to prevent
* manipulation with the read-only status via SUBVOL_SETFLAGS
*/
@@ -1328,6 +1358,9 @@ struct btrfs_root {
/* Number of active swapfiles */
atomic_t nr_swapfiles;
+ /* Record pairs of swapped blocks for qgroup */
+ struct btrfs_qgroup_swapped_blocks swapped_blocks;
+
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
u64 alloc_bytenr;
#endif
@@ -2775,7 +2808,8 @@ enum btrfs_flush_state {
FLUSH_DELALLOC = 5,
FLUSH_DELALLOC_WAIT = 6,
ALLOC_CHUNK = 7,
- COMMIT_TRANS = 8,
+ ALLOC_CHUNK_FORCE = 8,
+ COMMIT_TRANS = 9,
};
int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes);
@@ -3181,8 +3215,7 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
/* inode.c */
struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
- struct page *page, size_t pg_offset, u64 start,
- u64 len, int create);
+ u64 start, u64 len);
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
u64 *orig_start, u64 *orig_block_len,
u64 *ram_bytes);
@@ -3254,6 +3287,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root);
int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size);
void btrfs_add_delayed_iput(struct inode *inode);
void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info);
+int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info);
int btrfs_prealloc_file_range(struct inode *inode, int mode,
u64 start, u64 num_bytes, u64 min_size,
loff_t actual_len, u64 *alloc_hint);
@@ -3261,7 +3295,7 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
struct btrfs_trans_handle *trans, int mode,
u64 start, u64 num_bytes, u64 min_size,
loff_t actual_len, u64 *alloc_hint);
-int btrfs_run_delalloc_range(void *private_data, struct page *locked_page,
+int btrfs_run_delalloc_range(struct inode *inode, struct page *locked_page,
u64 start, u64 end, int *page_started, unsigned long *nr_written,
struct writeback_control *wbc);
int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end);
@@ -3415,31 +3449,17 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...);
#if defined(CONFIG_DYNAMIC_DEBUG)
#define btrfs_debug(fs_info, fmt, args...) \
-do { \
- DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \
- if (unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT)) \
- btrfs_printk(fs_info, KERN_DEBUG fmt, ##args); \
-} while (0)
-#define btrfs_debug_in_rcu(fs_info, fmt, args...) \
-do { \
- DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \
- if (unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT)) \
- btrfs_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args); \
-} while (0)
+ _dynamic_func_call_no_desc(fmt, btrfs_printk, \
+ fs_info, KERN_DEBUG fmt, ##args)
+#define btrfs_debug_in_rcu(fs_info, fmt, args...) \
+ _dynamic_func_call_no_desc(fmt, btrfs_printk_in_rcu, \
+ fs_info, KERN_DEBUG fmt, ##args)
#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \
-do { \
- DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \
- if (unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT)) \
- btrfs_printk_rl_in_rcu(fs_info, KERN_DEBUG fmt, \
- ##args);\
-} while (0)
-#define btrfs_debug_rl(fs_info, fmt, args...) \
-do { \
- DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \
- if (unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT)) \
- btrfs_printk_ratelimited(fs_info, KERN_DEBUG fmt, \
- ##args); \
-} while (0)
+ _dynamic_func_call_no_desc(fmt, btrfs_printk_rl_in_rcu, \
+ fs_info, KERN_DEBUG fmt, ##args)
+#define btrfs_debug_rl(fs_info, fmt, args...) \
+ _dynamic_func_call_no_desc(fmt, btrfs_printk_ratelimited, \
+ fs_info, KERN_DEBUG fmt, ##args)
#elif defined(DEBUG)
#define btrfs_debug(fs_info, fmt, args...) \
btrfs_printk(fs_info, KERN_DEBUG fmt, ##args)
@@ -3490,21 +3510,18 @@ do { \
rcu_read_unlock(); \
} while (0)
-#ifdef CONFIG_BTRFS_ASSERT
-
__cold
static inline void assfail(const char *expr, const char *file, int line)
{
- pr_err("assertion failed: %s, file: %s, line: %d\n",
- expr, file, line);
- BUG();
+ if (IS_ENABLED(CONFIG_BTRFS_ASSERT)) {
+ pr_err("assertion failed: %s, file: %s, line: %d\n",
+ expr, file, line);
+ BUG();
+ }
}
#define ASSERT(expr) \
(likely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
-#else
-#define ASSERT(expr) ((void)0)
-#endif
/*
* Use that for functions that are conditionally exported for sanity tests but
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index cad36c99a483..7d2a413df90d 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -602,17 +602,14 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
RB_CLEAR_NODE(&head_ref->href_node);
head_ref->processing = 0;
head_ref->total_ref_mod = count_mod;
- head_ref->qgroup_reserved = 0;
- head_ref->qgroup_ref_root = 0;
spin_lock_init(&head_ref->lock);
mutex_init(&head_ref->mutex);
if (qrecord) {
if (ref_root && reserved) {
- head_ref->qgroup_ref_root = ref_root;
- head_ref->qgroup_reserved = reserved;
+ qrecord->data_rsv = reserved;
+ qrecord->data_rsv_refroot = ref_root;
}
-
qrecord->bytenr = bytenr;
qrecord->num_bytes = num_bytes;
qrecord->old_roots = NULL;
@@ -651,10 +648,6 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
existing = htree_insert(&delayed_refs->href_root,
&head_ref->href_node);
if (existing) {
- WARN_ON(qrecord && head_ref->qgroup_ref_root
- && head_ref->qgroup_reserved
- && existing->qgroup_ref_root
- && existing->qgroup_reserved);
update_existing_head_ref(trans, existing, head_ref,
old_ref_mod);
/*
@@ -770,7 +763,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
is_fstree(ref_root)) {
- record = kmalloc(sizeof(*record), GFP_NOFS);
+ record = kzalloc(sizeof(*record), GFP_NOFS);
if (!record) {
kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
@@ -867,7 +860,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
is_fstree(ref_root)) {
- record = kmalloc(sizeof(*record), GFP_NOFS);
+ record = kzalloc(sizeof(*record), GFP_NOFS);
if (!record) {
kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
kmem_cache_free(btrfs_delayed_ref_head_cachep,
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index d2af974f68a1..70606da440aa 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -103,17 +103,6 @@ struct btrfs_delayed_ref_head {
int ref_mod;
/*
- * For qgroup reserved space freeing.
- *
- * ref_root and reserved will be recorded after
- * BTRFS_ADD_DELAYED_EXTENT is called.
- * And will be used to free reserved qgroup space at
- * run_delayed_refs() time.
- */
- u64 qgroup_ref_root;
- u64 qgroup_reserved;
-
- /*
* when a new extent is allocated, it is just reserved in memory
* The actual extent isn't inserted into the extent allocation tree
* until the delayed ref is processed. must_insert_reserved is
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 8750c835f535..ee193c5222b2 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -111,11 +111,11 @@ no_valid_dev_replace_entry_found:
break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
- dev_replace->srcdev = btrfs_find_device(fs_info, src_devid,
- NULL, NULL);
- dev_replace->tgtdev = btrfs_find_device(fs_info,
+ dev_replace->srcdev = btrfs_find_device(fs_info->fs_devices,
+ src_devid, NULL, NULL, true);
+ dev_replace->tgtdev = btrfs_find_device(fs_info->fs_devices,
BTRFS_DEV_REPLACE_DEVID,
- NULL, NULL);
+ NULL, NULL, true);
/*
* allow 'btrfs dev replace_cancel' if src/tgt device is
* missing
@@ -862,6 +862,7 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
btrfs_destroy_dev_replace_tgtdev(tgt_device);
break;
default:
+ up_write(&dev_replace->rwsem);
result = -EINVAL;
}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index ca1b7da6dd1b..f0cdb53f3e2d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -17,6 +17,7 @@
#include <linux/semaphore.h>
#include <linux/error-injection.h>
#include <linux/crc32c.h>
+#include <linux/sched/mm.h>
#include <asm/unaligned.h>
#include "ctree.h"
#include "disk-io.h"
@@ -341,7 +342,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
if (need_lock) {
btrfs_tree_read_lock(eb);
- btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ btrfs_set_lock_blocking_read(eb);
}
lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
@@ -1121,7 +1122,7 @@ void clean_tree_block(struct btrfs_fs_info *fs_info,
-buf->len,
fs_info->dirty_metadata_batch);
/* ugh, clear_extent_buffer_dirty needs to lock the page */
- btrfs_set_lock_blocking(buf);
+ btrfs_set_lock_blocking_write(buf);
clear_extent_buffer_dirty(buf);
}
}
@@ -1176,6 +1177,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
INIT_LIST_HEAD(&root->delalloc_root);
INIT_LIST_HEAD(&root->ordered_extents);
INIT_LIST_HEAD(&root->ordered_root);
+ INIT_LIST_HEAD(&root->reloc_dirty_list);
INIT_LIST_HEAD(&root->logged_list[0]);
INIT_LIST_HEAD(&root->logged_list[1]);
spin_lock_init(&root->inode_lock);
@@ -1219,6 +1221,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
root->anon_dev = 0;
spin_lock_init(&root->root_item_lock);
+ btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks);
}
static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info,
@@ -1259,10 +1262,17 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *tree_root = fs_info->tree_root;
struct btrfs_root *root;
struct btrfs_key key;
+ unsigned int nofs_flag;
int ret = 0;
uuid_le uuid = NULL_UUID_LE;
+ /*
+ * We're holding a transaction handle, so use a NOFS memory allocation
+ * context to avoid deadlock if reclaim happens.
+ */
+ nofs_flag = memalloc_nofs_save();
root = btrfs_alloc_root(fs_info, GFP_KERNEL);
+ memalloc_nofs_restore(nofs_flag);
if (!root)
return ERR_PTR(-ENOMEM);
@@ -1708,9 +1718,7 @@ static int cleaner_kthread(void *arg)
goto sleep;
}
- mutex_lock(&fs_info->cleaner_delayed_iput_mutex);
btrfs_run_delayed_iputs(fs_info);
- mutex_unlock(&fs_info->cleaner_delayed_iput_mutex);
again = btrfs_clean_one_deleted_snapshot(root);
mutex_unlock(&fs_info->cleaner_mutex);
@@ -2102,7 +2110,7 @@ static void btrfs_init_scrub(struct btrfs_fs_info *fs_info)
atomic_set(&fs_info->scrubs_paused, 0);
atomic_set(&fs_info->scrub_cancel_req, 0);
init_waitqueue_head(&fs_info->scrub_pause_wait);
- fs_info->scrub_workers_refcnt = 0;
+ refcount_set(&fs_info->scrub_workers_refcnt, 0);
}
static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
@@ -2667,7 +2675,6 @@ int open_ctree(struct super_block *sb,
mutex_init(&fs_info->delete_unused_bgs_mutex);
mutex_init(&fs_info->reloc_mutex);
mutex_init(&fs_info->delalloc_root_mutex);
- mutex_init(&fs_info->cleaner_delayed_iput_mutex);
seqlock_init(&fs_info->profiles_lock);
INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
@@ -2689,6 +2696,7 @@ int open_ctree(struct super_block *sb,
atomic_set(&fs_info->defrag_running, 0);
atomic_set(&fs_info->qgroup_op_seq, 0);
atomic_set(&fs_info->reada_works_cnt, 0);
+ atomic_set(&fs_info->nr_delayed_iputs, 0);
atomic64_set(&fs_info->tree_mod_seq, 0);
fs_info->sb = sb;
fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
@@ -2766,6 +2774,7 @@ int open_ctree(struct super_block *sb,
init_waitqueue_head(&fs_info->transaction_wait);
init_waitqueue_head(&fs_info->transaction_blocked_wait);
init_waitqueue_head(&fs_info->async_submit_wait);
+ init_waitqueue_head(&fs_info->delayed_iputs_wait);
INIT_LIST_HEAD(&fs_info->pinned_chunks);
@@ -4239,16 +4248,9 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
head = rb_entry(node, struct btrfs_delayed_ref_head,
href_node);
- if (!mutex_trylock(&head->mutex)) {
- refcount_inc(&head->refs);
- spin_unlock(&delayed_refs->lock);
-
- mutex_lock(&head->mutex);
- mutex_unlock(&head->mutex);
- btrfs_put_delayed_ref_head(head);
- spin_lock(&delayed_refs->lock);
+ if (btrfs_delayed_ref_lock(delayed_refs, head))
continue;
- }
+
spin_lock(&head->lock);
while ((n = rb_first_cached(&head->ref_tree)) != NULL) {
ref = rb_entry(n, struct btrfs_delayed_ref_node,
@@ -4264,12 +4266,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
if (head->must_insert_reserved)
pin_bytes = true;
btrfs_free_delayed_extent_op(head->extent_op);
- delayed_refs->num_heads--;
- if (head->processing == 0)
- delayed_refs->num_heads_ready--;
- atomic_dec(&delayed_refs->num_entries);
- rb_erase_cached(&head->href_node, &delayed_refs->href_root);
- RB_CLEAR_NODE(&head->href_node);
+ btrfs_delete_ref_head(delayed_refs, head);
spin_unlock(&head->lock);
spin_unlock(&delayed_refs->lock);
mutex_unlock(&head->mutex);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index d81035b7ea7d..994f0cc41799 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2492,9 +2492,6 @@ void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
}
}
- /* Also free its reserved qgroup space */
- btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root,
- head->qgroup_reserved);
btrfs_delayed_refs_rsv_release(fs_info, nr_items);
}
@@ -3013,8 +3010,7 @@ again:
}
if (run_all) {
- if (!list_empty(&trans->new_bgs))
- btrfs_create_pending_block_groups(trans);
+ btrfs_create_pending_block_groups(trans);
spin_lock(&delayed_refs->lock);
node = rb_first_cached(&delayed_refs->href_root);
@@ -4280,10 +4276,14 @@ commit_trans:
/*
* The cleaner kthread might still be doing iput
* operations. Wait for it to finish so that
- * more space is released.
+ * more space is released. We don't need to
+ * explicitly run the delayed iputs here because
+ * the commit_transaction would have woken up
+ * the cleaner.
*/
- mutex_lock(&fs_info->cleaner_delayed_iput_mutex);
- mutex_unlock(&fs_info->cleaner_delayed_iput_mutex);
+ ret = btrfs_wait_on_delayed_iputs(fs_info);
+ if (ret)
+ return ret;
goto again;
} else {
btrfs_end_transaction(trans);
@@ -4396,7 +4396,6 @@ static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *sinfo, int force)
{
- struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
u64 bytes_used = btrfs_space_info_used(sinfo, false);
u64 thresh;
@@ -4404,14 +4403,6 @@ static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
return 1;
/*
- * We need to take into account the global rsv because for all intents
- * and purposes it's used space. Don't worry about locking the
- * global_rsv, it doesn't change except when the transaction commits.
- */
- if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
- bytes_used += calc_global_rsv_need_space(global_rsv);
-
- /*
* in limited mode, we want to have some free space up to
* about 1% of the FS size.
*/
@@ -4741,7 +4732,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
struct btrfs_space_info *space_info;
struct btrfs_trans_handle *trans;
u64 delalloc_bytes;
- u64 max_reclaim;
+ u64 async_pages;
u64 items;
long time_left;
unsigned long nr_pages;
@@ -4766,25 +4757,36 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
loops = 0;
while (delalloc_bytes && loops < 3) {
- max_reclaim = min(delalloc_bytes, to_reclaim);
- nr_pages = max_reclaim >> PAGE_SHIFT;
+ nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
+
+ /*
+ * Triggers inode writeback for up to nr_pages. This will invoke
+ * ->writepages callback and trigger delalloc filling
+ * (btrfs_run_delalloc_range()).
+ */
btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);
+
/*
- * We need to wait for the async pages to actually start before
- * we do anything.
+ * We need to wait for the compressed pages to start before
+ * we continue.
*/
- max_reclaim = atomic_read(&fs_info->async_delalloc_pages);
- if (!max_reclaim)
+ async_pages = atomic_read(&fs_info->async_delalloc_pages);
+ if (!async_pages)
goto skip_async;
- if (max_reclaim <= nr_pages)
- max_reclaim = 0;
+ /*
+ * Calculate how many compressed pages we want to be written
+ * before we continue. I.e if there are more async pages than we
+ * require wait_event will wait until nr_pages are written.
+ */
+ if (async_pages <= nr_pages)
+ async_pages = 0;
else
- max_reclaim -= nr_pages;
+ async_pages -= nr_pages;
wait_event(fs_info->async_submit_wait,
atomic_read(&fs_info->async_delalloc_pages) <=
- (int)max_reclaim);
+ (int)async_pages);
skip_async:
spin_lock(&space_info->lock);
if (list_empty(&space_info->tickets) &&
@@ -4808,6 +4810,7 @@ skip_async:
}
struct reserve_ticket {
+ u64 orig_bytes;
u64 bytes;
int error;
struct list_head list;
@@ -4851,10 +4854,19 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
if (!bytes_needed)
return 0;
- /* See if there is enough pinned space to make this reservation */
- if (__percpu_counter_compare(&space_info->total_bytes_pinned,
- bytes_needed,
- BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
+ trans = btrfs_join_transaction(fs_info->extent_root);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ /*
+ * See if there is enough pinned space to make this reservation, or if
+ * we have block groups that are going to be freed, allowing us to
+ * possibly do a chunk allocation the next loop through.
+ */
+ if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) ||
+ __percpu_counter_compare(&space_info->total_bytes_pinned,
+ bytes_needed,
+ BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
goto commit;
/*
@@ -4862,7 +4874,7 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
* this reservation.
*/
if (space_info != delayed_rsv->space_info)
- return -ENOSPC;
+ goto enospc;
spin_lock(&delayed_rsv->lock);
reclaim_bytes += delayed_rsv->reserved;
@@ -4877,16 +4889,14 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
if (__percpu_counter_compare(&space_info->total_bytes_pinned,
bytes_needed,
- BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0) {
- return -ENOSPC;
- }
+ BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0)
+ goto enospc;
commit:
- trans = btrfs_join_transaction(fs_info->extent_root);
- if (IS_ERR(trans))
- return -ENOSPC;
-
return btrfs_commit_transaction(trans);
+enospc:
+ btrfs_end_transaction(trans);
+ return -ENOSPC;
}
/*
@@ -4939,6 +4949,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
btrfs_end_transaction(trans);
break;
case ALLOC_CHUNK:
+ case ALLOC_CHUNK_FORCE:
trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
@@ -4946,7 +4957,8 @@ static void flush_space(struct btrfs_fs_info *fs_info,
}
ret = do_chunk_alloc(trans,
btrfs_metadata_alloc_profile(fs_info),
- CHUNK_ALLOC_NO_FORCE);
+ (state == ALLOC_CHUNK) ?
+ CHUNK_ALLOC_NO_FORCE : CHUNK_ALLOC_FORCE);
btrfs_end_transaction(trans);
if (ret > 0 || ret == -ENOSPC)
ret = 0;
@@ -4957,9 +4969,8 @@ static void flush_space(struct btrfs_fs_info *fs_info,
* bunch of pinned space, so make sure we run the iputs before
* we do our pinned bytes check below.
*/
- mutex_lock(&fs_info->cleaner_delayed_iput_mutex);
btrfs_run_delayed_iputs(fs_info);
- mutex_unlock(&fs_info->cleaner_delayed_iput_mutex);
+ btrfs_wait_on_delayed_iputs(fs_info);
ret = may_commit_transaction(fs_info, space_info);
break;
@@ -5030,7 +5041,7 @@ static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
}
-static void wake_all_tickets(struct list_head *head)
+static bool wake_all_tickets(struct list_head *head)
{
struct reserve_ticket *ticket;
@@ -5039,7 +5050,10 @@ static void wake_all_tickets(struct list_head *head)
list_del_init(&ticket->list);
ticket->error = -ENOSPC;
wake_up(&ticket->wait);
+ if (ticket->bytes != ticket->orig_bytes)
+ return true;
}
+ return false;
}
/*
@@ -5091,11 +5105,28 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
commit_cycles--;
}
+ /*
+ * We don't want to force a chunk allocation until we've tried
+ * pretty hard to reclaim space. Think of the case where we
+ * freed up a bunch of space and so have a lot of pinned space
+ * to reclaim. We would rather use that than possibly create a
+ * underutilized metadata chunk. So if this is our first run
+ * through the flushing state machine skip ALLOC_CHUNK_FORCE and
+ * commit the transaction. If nothing has changed the next go
+ * around then we can force a chunk allocation.
+ */
+ if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
+ flush_state++;
+
if (flush_state > COMMIT_TRANS) {
commit_cycles++;
if (commit_cycles > 2) {
- wake_all_tickets(&space_info->tickets);
- space_info->flush = 0;
+ if (wake_all_tickets(&space_info->tickets)) {
+ flush_state = FLUSH_DELAYED_ITEMS_NR;
+ commit_cycles--;
+ } else {
+ space_info->flush = 0;
+ }
} else {
flush_state = FLUSH_DELAYED_ITEMS_NR;
}
@@ -5109,12 +5140,18 @@ void btrfs_init_async_reclaim_work(struct work_struct *work)
INIT_WORK(work, btrfs_async_reclaim_metadata_space);
}
+static const enum btrfs_flush_state priority_flush_states[] = {
+ FLUSH_DELAYED_ITEMS_NR,
+ FLUSH_DELAYED_ITEMS,
+ ALLOC_CHUNK,
+};
+
static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
struct reserve_ticket *ticket)
{
u64 to_reclaim;
- int flush_state = FLUSH_DELAYED_ITEMS_NR;
+ int flush_state;
spin_lock(&space_info->lock);
to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
@@ -5125,8 +5162,10 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
}
spin_unlock(&space_info->lock);
+ flush_state = 0;
do {
- flush_space(fs_info, space_info, to_reclaim, flush_state);
+ flush_space(fs_info, space_info, to_reclaim,
+ priority_flush_states[flush_state]);
flush_state++;
spin_lock(&space_info->lock);
if (ticket->bytes == 0) {
@@ -5134,23 +5173,16 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
return;
}
spin_unlock(&space_info->lock);
-
- /*
- * Priority flushers can't wait on delalloc without
- * deadlocking.
- */
- if (flush_state == FLUSH_DELALLOC ||
- flush_state == FLUSH_DELALLOC_WAIT)
- flush_state = ALLOC_CHUNK;
- } while (flush_state < COMMIT_TRANS);
+ } while (flush_state < ARRAY_SIZE(priority_flush_states));
}
static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
- struct reserve_ticket *ticket, u64 orig_bytes)
+ struct reserve_ticket *ticket)
{
DEFINE_WAIT(wait);
+ u64 reclaim_bytes = 0;
int ret = 0;
spin_lock(&space_info->lock);
@@ -5171,14 +5203,12 @@ static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
ret = ticket->error;
if (!list_empty(&ticket->list))
list_del_init(&ticket->list);
- if (ticket->bytes && ticket->bytes < orig_bytes) {
- u64 num_bytes = orig_bytes - ticket->bytes;
- update_bytes_may_use(space_info, -num_bytes);
- trace_btrfs_space_reservation(fs_info, "space_info",
- space_info->flags, num_bytes, 0);
- }
+ if (ticket->bytes && ticket->bytes < ticket->orig_bytes)
+ reclaim_bytes = ticket->orig_bytes - ticket->bytes;
spin_unlock(&space_info->lock);
+ if (reclaim_bytes)
+ space_info_add_old_bytes(fs_info, space_info, reclaim_bytes);
return ret;
}
@@ -5204,6 +5234,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
{
struct reserve_ticket ticket;
u64 used;
+ u64 reclaim_bytes = 0;
int ret = 0;
ASSERT(orig_bytes);
@@ -5239,6 +5270,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
* the list and we will do our own flushing further down.
*/
if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
+ ticket.orig_bytes = orig_bytes;
ticket.bytes = orig_bytes;
ticket.error = 0;
init_waitqueue_head(&ticket.wait);
@@ -5279,25 +5311,21 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
return ret;
if (flush == BTRFS_RESERVE_FLUSH_ALL)
- return wait_reserve_ticket(fs_info, space_info, &ticket,
- orig_bytes);
+ return wait_reserve_ticket(fs_info, space_info, &ticket);
ret = 0;
priority_reclaim_metadata_space(fs_info, space_info, &ticket);
spin_lock(&space_info->lock);
if (ticket.bytes) {
- if (ticket.bytes < orig_bytes) {
- u64 num_bytes = orig_bytes - ticket.bytes;
- update_bytes_may_use(space_info, -num_bytes);
- trace_btrfs_space_reservation(fs_info, "space_info",
- space_info->flags,
- num_bytes, 0);
-
- }
+ if (ticket.bytes < orig_bytes)
+ reclaim_bytes = orig_bytes - ticket.bytes;
list_del_init(&ticket.list);
ret = -ENOSPC;
}
spin_unlock(&space_info->lock);
+
+ if (reclaim_bytes)
+ space_info_add_old_bytes(fs_info, space_info, reclaim_bytes);
ASSERT(list_empty(&ticket.list));
return ret;
}
@@ -5775,6 +5803,21 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
return ret;
}
+static void calc_refill_bytes(struct btrfs_block_rsv *block_rsv,
+ u64 *metadata_bytes, u64 *qgroup_bytes)
+{
+ *metadata_bytes = 0;
+ *qgroup_bytes = 0;
+
+ spin_lock(&block_rsv->lock);
+ if (block_rsv->reserved < block_rsv->size)
+ *metadata_bytes = block_rsv->size - block_rsv->reserved;
+ if (block_rsv->qgroup_rsv_reserved < block_rsv->qgroup_rsv_size)
+ *qgroup_bytes = block_rsv->qgroup_rsv_size -
+ block_rsv->qgroup_rsv_reserved;
+ spin_unlock(&block_rsv->lock);
+}
+
/**
* btrfs_inode_rsv_refill - refill the inode block rsv.
* @inode - the inode we are refilling.
@@ -5790,25 +5833,42 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
{
struct btrfs_root *root = inode->root;
struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
- u64 num_bytes = 0;
- u64 qgroup_num_bytes = 0;
+ u64 num_bytes, last = 0;
+ u64 qgroup_num_bytes;
int ret = -ENOSPC;
- spin_lock(&block_rsv->lock);
- if (block_rsv->reserved < block_rsv->size)
- num_bytes = block_rsv->size - block_rsv->reserved;
- if (block_rsv->qgroup_rsv_reserved < block_rsv->qgroup_rsv_size)
- qgroup_num_bytes = block_rsv->qgroup_rsv_size -
- block_rsv->qgroup_rsv_reserved;
- spin_unlock(&block_rsv->lock);
-
+ calc_refill_bytes(block_rsv, &num_bytes, &qgroup_num_bytes);
if (num_bytes == 0)
return 0;
- ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_num_bytes, true);
- if (ret)
- return ret;
- ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
+ do {
+ ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_num_bytes,
+ true);
+ if (ret)
+ return ret;
+ ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
+ if (ret) {
+ btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
+ last = num_bytes;
+ /*
+ * If we are fragmented we can end up with a lot of
+ * outstanding extents which will make our size be much
+ * larger than our reserved amount.
+ *
+ * If the reservation happens here, it might be very
+ * big though not needed in the end, if the delalloc
+ * flushing happens.
+ *
+ * If this is the case try and do the reserve again.
+ */
+ if (flush == BTRFS_RESERVE_FLUSH_ALL)
+ calc_refill_bytes(block_rsv, &num_bytes,
+ &qgroup_num_bytes);
+ if (num_bytes == 0)
+ return 0;
+ }
+ } while (ret && last != num_bytes);
+
if (!ret) {
block_rsv_add_bytes(block_rsv, num_bytes, false);
trace_btrfs_space_reservation(root->fs_info, "delalloc",
@@ -5818,8 +5878,7 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
spin_lock(&block_rsv->lock);
block_rsv->qgroup_rsv_reserved += qgroup_num_bytes;
spin_unlock(&block_rsv->lock);
- } else
- btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
+ }
return ret;
}
@@ -8066,6 +8125,15 @@ loop:
return ret;
}
+#define DUMP_BLOCK_RSV(fs_info, rsv_name) \
+do { \
+ struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name; \
+ spin_lock(&__rsv->lock); \
+ btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu", \
+ __rsv->size, __rsv->reserved); \
+ spin_unlock(&__rsv->lock); \
+} while (0)
+
static void dump_space_info(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *info, u64 bytes,
int dump_block_groups)
@@ -8085,6 +8153,12 @@ static void dump_space_info(struct btrfs_fs_info *fs_info,
info->bytes_readonly);
spin_unlock(&info->lock);
+ DUMP_BLOCK_RSV(fs_info, global_block_rsv);
+ DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
+ DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
+ DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
+ DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);
+
if (!dump_block_groups)
return;
@@ -8492,7 +8566,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
clean_tree_block(fs_info, buf);
clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
- btrfs_set_lock_blocking(buf);
+ btrfs_set_lock_blocking_write(buf);
set_extent_buffer_uptodate(buf);
memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header));
@@ -8917,7 +8991,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
reada = 1;
}
btrfs_tree_lock(next);
- btrfs_set_lock_blocking(next);
+ btrfs_set_lock_blocking_write(next);
ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
&wc->refs[level - 1],
@@ -8977,7 +9051,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
return -EIO;
}
btrfs_tree_lock(next);
- btrfs_set_lock_blocking(next);
+ btrfs_set_lock_blocking_write(next);
}
level--;
@@ -9089,7 +9163,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
if (!path->locks[level]) {
BUG_ON(level == 0);
btrfs_tree_lock(eb);
- btrfs_set_lock_blocking(eb);
+ btrfs_set_lock_blocking_write(eb);
path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
ret = btrfs_lookup_extent_info(trans, fs_info,
@@ -9131,7 +9205,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
if (!path->locks[level] &&
btrfs_header_generation(eb) == trans->transid) {
btrfs_tree_lock(eb);
- btrfs_set_lock_blocking(eb);
+ btrfs_set_lock_blocking_write(eb);
path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
}
clean_tree_block(fs_info, eb);
@@ -9298,7 +9372,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
level = btrfs_header_level(root->node);
path->nodes[level] = btrfs_lock_root_node(root);
- btrfs_set_lock_blocking(path->nodes[level]);
+ btrfs_set_lock_blocking_write(path->nodes[level]);
path->slots[level] = 0;
path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
memset(&wc->update_progress, 0,
@@ -9328,7 +9402,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
level = btrfs_header_level(root->node);
while (1) {
btrfs_tree_lock(path->nodes[level]);
- btrfs_set_lock_blocking(path->nodes[level]);
+ btrfs_set_lock_blocking_write(path->nodes[level]);
path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
ret = btrfs_lookup_extent_info(trans, fs_info,
@@ -9595,6 +9669,7 @@ static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
{
struct btrfs_space_info *sinfo = cache->space_info;
u64 num_bytes;
+ u64 sinfo_used;
u64 min_allocable_bytes;
int ret = -ENOSPC;
@@ -9621,9 +9696,10 @@ static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
num_bytes = cache->key.offset - cache->reserved - cache->pinned -
cache->bytes_super - btrfs_block_group_used(&cache->item);
+ sinfo_used = btrfs_space_info_used(sinfo, true);
- if (btrfs_space_info_used(sinfo, true) + num_bytes +
- min_allocable_bytes <= sinfo->total_bytes) {
+ if (sinfo_used + num_bytes + min_allocable_bytes <=
+ sinfo->total_bytes) {
sinfo->bytes_readonly += num_bytes;
cache->ro++;
list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
@@ -9632,6 +9708,15 @@ static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
out:
spin_unlock(&cache->lock);
spin_unlock(&sinfo->lock);
+ if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
+ btrfs_info(cache->fs_info,
+ "unable to make block group %llu ro",
+ cache->key.objectid);
+ btrfs_info(cache->fs_info,
+ "sinfo_used=%llu bg_num_bytes=%llu min_allocable=%llu",
+ sinfo_used, num_bytes, min_allocable_bytes);
+ dump_space_info(cache->fs_info, cache->space_info, 0, 0);
+ }
return ret;
}
@@ -10781,13 +10866,10 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
}
spin_lock(&trans->transaction->dirty_bgs_lock);
- if (!list_empty(&block_group->dirty_list)) {
- WARN_ON(1);
- }
- if (!list_empty(&block_group->io_list)) {
- WARN_ON(1);
- }
+ WARN_ON(!list_empty(&block_group->dirty_list));
+ WARN_ON(!list_empty(&block_group->io_list));
spin_unlock(&trans->transaction->dirty_bgs_lock);
+
btrfs_remove_free_space_cache(block_group);
spin_lock(&block_group->space_info->lock);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 4ed58c9a94a9..ab705183d749 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -147,7 +147,39 @@ static int add_extent_changeset(struct extent_state *state, unsigned bits,
return ret;
}
-static void flush_write_bio(struct extent_page_data *epd);
+static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
+ unsigned long bio_flags)
+{
+ blk_status_t ret = 0;
+ struct bio_vec *bvec = bio_last_bvec_all(bio);
+ struct bio_vec bv;
+ struct extent_io_tree *tree = bio->bi_private;
+ u64 start;
+
+ mp_bvec_last_segment(bvec, &bv);
+ start = page_offset(bv.bv_page) + bv.bv_offset;
+
+ bio->bi_private = NULL;
+
+ if (tree->ops)
+ ret = tree->ops->submit_bio_hook(tree->private_data, bio,
+ mirror_num, bio_flags, start);
+ else
+ btrfsic_submit_bio(bio);
+
+ return blk_status_to_errno(ret);
+}
+
+static void flush_write_bio(struct extent_page_data *epd)
+{
+ if (epd->bio) {
+ int ret;
+
+ ret = submit_one_bio(epd->bio, 0, 0);
+ BUG_ON(ret < 0); /* -ENOMEM */
+ epd->bio = NULL;
+ }
+}
int __init extent_io_init(void)
{
@@ -281,8 +313,8 @@ do_insert:
}
static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
- struct rb_node **prev_ret,
struct rb_node **next_ret,
+ struct rb_node **prev_ret,
struct rb_node ***p_ret,
struct rb_node **parent_ret)
{
@@ -311,23 +343,23 @@ static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
if (parent_ret)
*parent_ret = prev;
- if (prev_ret) {
+ if (next_ret) {
orig_prev = prev;
while (prev && offset > prev_entry->end) {
prev = rb_next(prev);
prev_entry = rb_entry(prev, struct tree_entry, rb_node);
}
- *prev_ret = prev;
+ *next_ret = prev;
prev = orig_prev;
}
- if (next_ret) {
+ if (prev_ret) {
prev_entry = rb_entry(prev, struct tree_entry, rb_node);
while (prev && offset < prev_entry->start) {
prev = rb_prev(prev);
prev_entry = rb_entry(prev, struct tree_entry, rb_node);
}
- *next_ret = prev;
+ *prev_ret = prev;
}
return NULL;
}
@@ -338,12 +370,12 @@ tree_search_for_insert(struct extent_io_tree *tree,
struct rb_node ***p_ret,
struct rb_node **parent_ret)
{
- struct rb_node *prev = NULL;
+ struct rb_node *next= NULL;
struct rb_node *ret;
- ret = __etree_search(tree, offset, &prev, NULL, p_ret, parent_ret);
+ ret = __etree_search(tree, offset, &next, NULL, p_ret, parent_ret);
if (!ret)
- return prev;
+ return next;
return ret;
}
@@ -585,7 +617,6 @@ int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
if (delete)
bits |= ~EXTENT_CTLBITS;
- bits |= EXTENT_FIRST_DELALLOC;
if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
clear = 1;
@@ -850,7 +881,6 @@ __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
btrfs_debug_check_extent_io_range(tree, start, end);
- bits |= EXTENT_FIRST_DELALLOC;
again:
if (!prealloc && gfpflags_allow_blocking(mask)) {
/*
@@ -2694,29 +2724,6 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size)
return bio;
}
-static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
- unsigned long bio_flags)
-{
- blk_status_t ret = 0;
- struct bio_vec *bvec = bio_last_bvec_all(bio);
- struct bio_vec bv;
- struct extent_io_tree *tree = bio->bi_private;
- u64 start;
-
- mp_bvec_last_segment(bvec, &bv);
- start = page_offset(bv.bv_page) + bv.bv_offset;
-
- bio->bi_private = NULL;
-
- if (tree->ops)
- ret = tree->ops->submit_bio_hook(tree->private_data, bio,
- mirror_num, bio_flags, start);
- else
- btrfsic_submit_bio(bio);
-
- return blk_status_to_errno(ret);
-}
-
/*
* @opf: bio REQ_OP_* and REQ_* flags as one value
* @tree: tree so we can call our merge_bio hook
@@ -4011,17 +4018,6 @@ retry:
return ret;
}
-static void flush_write_bio(struct extent_page_data *epd)
-{
- if (epd->bio) {
- int ret;
-
- ret = submit_one_bio(epd->bio, 0, 0);
- BUG_ON(ret < 0); /* -ENOMEM */
- epd->bio = NULL;
- }
-}
-
int extent_write_full_page(struct page *page, struct writeback_control *wbc)
{
int ret;
@@ -4263,8 +4259,7 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode,
if (len == 0)
break;
len = ALIGN(len, sectorsize);
- em = btrfs_get_extent_fiemap(BTRFS_I(inode), NULL, 0, offset,
- len, 0);
+ em = btrfs_get_extent_fiemap(BTRFS_I(inode), offset, len);
if (IS_ERR_OR_NULL(em))
return em;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 9673be3f3d1f..08749e0b9c32 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -18,17 +18,16 @@
#define EXTENT_BOUNDARY (1U << 9)
#define EXTENT_NODATASUM (1U << 10)
#define EXTENT_CLEAR_META_RESV (1U << 11)
-#define EXTENT_FIRST_DELALLOC (1U << 12)
-#define EXTENT_NEED_WAIT (1U << 13)
-#define EXTENT_DAMAGED (1U << 14)
-#define EXTENT_NORESERVE (1U << 15)
-#define EXTENT_QGROUP_RESERVED (1U << 16)
-#define EXTENT_CLEAR_DATA_RESV (1U << 17)
-#define EXTENT_DELALLOC_NEW (1U << 18)
+#define EXTENT_NEED_WAIT (1U << 12)
+#define EXTENT_DAMAGED (1U << 13)
+#define EXTENT_NORESERVE (1U << 14)
+#define EXTENT_QGROUP_RESERVED (1U << 15)
+#define EXTENT_CLEAR_DATA_RESV (1U << 16)
+#define EXTENT_DELALLOC_NEW (1U << 17)
#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
#define EXTENT_DO_ACCOUNTING (EXTENT_CLEAR_META_RESV | \
EXTENT_CLEAR_DATA_RESV)
-#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
+#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING)
/*
* flags for bio submission. The high bits indicate the compression
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index a042a193c120..928f729c55ba 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -210,6 +210,9 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next)
if (!list_empty(&prev->list) || !list_empty(&next->list))
return 0;
+ ASSERT(next->block_start != EXTENT_MAP_DELALLOC &&
+ prev->block_start != EXTENT_MAP_DELALLOC);
+
if (extent_map_end(prev) == next->start &&
prev->flags == next->flags &&
prev->bdev == next->bdev &&
@@ -217,8 +220,6 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next)
prev->block_start == EXTENT_MAP_HOLE) ||
(next->block_start == EXTENT_MAP_INLINE &&
prev->block_start == EXTENT_MAP_INLINE) ||
- (next->block_start == EXTENT_MAP_DELALLOC &&
- prev->block_start == EXTENT_MAP_DELALLOC) ||
(next->block_start < EXTENT_MAP_LAST_BYTE - 1 &&
next->block_start == extent_map_block_end(prev)))) {
return 1;
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index ef05a0121652..473f039fcd7c 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -9,6 +9,7 @@
#define EXTENT_MAP_LAST_BYTE ((u64)-4)
#define EXTENT_MAP_HOLE ((u64)-3)
#define EXTENT_MAP_INLINE ((u64)-2)
+/* used only during fiemap calls */
#define EXTENT_MAP_DELALLOC ((u64)-1)
/* bits for the extent_map::flags field */
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index d38dc8c31533..34fe8a58b0e9 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -3218,8 +3218,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
&cached_state);
while (start < inode->i_size) {
- em = btrfs_get_extent_fiemap(BTRFS_I(inode), NULL, 0,
- start, len, 0);
+ em = btrfs_get_extent_fiemap(BTRFS_I(inode), start, len);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
em = NULL;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7ade5769f691..82fdda8ff5ab 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -453,7 +453,6 @@ static noinline void compress_file_range(struct inode *inode,
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
u64 blocksize = fs_info->sectorsize;
u64 actual_end;
- u64 isize = i_size_read(inode);
int ret = 0;
struct page **pages = NULL;
unsigned long nr_pages;
@@ -467,7 +466,7 @@ static noinline void compress_file_range(struct inode *inode,
inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1,
SZ_16K);
- actual_end = min_t(u64, isize, end + 1);
+ actual_end = min_t(u64, i_size_read(inode), end + 1);
again:
will_compress = 0;
nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
@@ -714,9 +713,9 @@ static void free_async_extent_pages(struct async_extent *async_extent)
* queued. We walk all the async extents created by compress_file_range
* and send them down to the disk.
*/
-static noinline void submit_compressed_extents(struct inode *inode,
- struct async_cow *async_cow)
+static noinline void submit_compressed_extents(struct async_cow *async_cow)
{
+ struct inode *inode = async_cow->inode;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct async_extent *async_extent;
u64 alloc_hint = 0;
@@ -1166,8 +1165,14 @@ static noinline void async_cow_submit(struct btrfs_work *work)
5 * SZ_1M)
cond_wake_up_nomb(&fs_info->async_submit_wait);
+ /*
+ * ->inode could be NULL if async_cow_start has failed to compress,
+ * in which case we don't have anything to submit, yet we need to
+ * always adjust ->async_delalloc_pages as its paired with the init
+ * happening in cow_file_range_async
+ */
if (async_cow->inode)
- submit_compressed_extents(async_cow->inode, async_cow);
+ submit_compressed_extents(async_cow);
}
static noinline void async_cow_free(struct btrfs_work *work)
@@ -1194,7 +1199,12 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
while (start < end) {
async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
BUG_ON(!async_cow); /* -ENOMEM */
- async_cow->inode = igrab(inode);
+ /*
+ * igrab is called higher up in the call chain, take only the
+ * lightweight reference for the callback lifetime
+ */
+ ihold(inode);
+ async_cow->inode = inode;
async_cow->fs_info = fs_info;
async_cow->locked_page = locked_page;
async_cow->start = start;
@@ -1586,11 +1596,10 @@ static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
* Function to process delayed allocation (create CoW) for ranges which are
* being touched for the first time.
*/
-int btrfs_run_delalloc_range(void *private_data, struct page *locked_page,
+int btrfs_run_delalloc_range(struct inode *inode, struct page *locked_page,
u64 start, u64 end, int *page_started, unsigned long *nr_written,
struct writeback_control *wbc)
{
- struct inode *inode = private_data;
int ret;
int force_cow = need_force_cow(inode, start, end);
unsigned int write_flags = wbc_to_write_flags(wbc);
@@ -3247,6 +3256,7 @@ void btrfs_add_delayed_iput(struct inode *inode)
if (atomic_add_unless(&inode->i_count, -1, 1))
return;
+ atomic_inc(&fs_info->nr_delayed_iputs);
spin_lock(&fs_info->delayed_iput_lock);
ASSERT(list_empty(&binode->delayed_iput));
list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs);
@@ -3267,11 +3277,32 @@ void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
list_del_init(&inode->delayed_iput);
spin_unlock(&fs_info->delayed_iput_lock);
iput(&inode->vfs_inode);
+ if (atomic_dec_and_test(&fs_info->nr_delayed_iputs))
+ wake_up(&fs_info->delayed_iputs_wait);
spin_lock(&fs_info->delayed_iput_lock);
}
spin_unlock(&fs_info->delayed_iput_lock);
}
+/**
+ * btrfs_wait_on_delayed_iputs - wait on the delayed iputs to be done running
+ * @fs_info - the fs_info for this fs
+ * @return - EINTR if we were killed, 0 if nothing's pending
+ *
+ * This will wait on any delayed iputs that are currently running with KILLABLE
+ * set. Once they are all done running we will return, unless we are killed in
+ * which case we return EINTR. This helps in user operations like fallocate etc
+ * that might get blocked on the iputs.
+ */
+int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info)
+{
+ int ret = wait_event_killable(fs_info->delayed_iputs_wait,
+ atomic_read(&fs_info->nr_delayed_iputs) == 0);
+ if (ret)
+ return -EINTR;
+ return 0;
+}
+
/*
* This creates an orphan entry for the given inode in case something goes wrong
* in the middle of an unlink.
@@ -5262,13 +5293,15 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+ u64 delayed_refs_extra = btrfs_calc_trans_metadata_size(fs_info, 1);
int failures = 0;
for (;;) {
struct btrfs_trans_handle *trans;
int ret;
- ret = btrfs_block_rsv_refill(root, rsv, rsv->size,
+ ret = btrfs_block_rsv_refill(root, rsv,
+ rsv->size + delayed_refs_extra,
BTRFS_RESERVE_FLUSH_LIMIT);
if (ret && ++failures > 2) {
@@ -5277,9 +5310,28 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
return ERR_PTR(-ENOSPC);
}
+ /*
+ * Evict can generate a large amount of delayed refs without
+ * having a way to add space back since we exhaust our temporary
+ * block rsv. We aren't allowed to do FLUSH_ALL in this case
+ * because we could deadlock with so many things in the flushing
+ * code, so we have to try and hold some extra space to
+ * compensate for our delayed ref generation. If we can't get
+ * that space then we need see if we can steal our minimum from
+ * the global reserve. We will be ratelimited by the amount of
+ * space we have for the delayed refs rsv, so we'll end up
+ * committing and trying again.
+ */
trans = btrfs_join_transaction(root);
- if (IS_ERR(trans) || !ret)
+ if (IS_ERR(trans) || !ret) {
+ if (!IS_ERR(trans)) {
+ trans->block_rsv = &fs_info->trans_block_rsv;
+ trans->bytes_reserved = delayed_refs_extra;
+ btrfs_block_rsv_migrate(rsv, trans->block_rsv,
+ delayed_refs_extra, 1);
+ }
return trans;
+ }
/*
* Try to steal from the global reserve if there is space for
@@ -6731,7 +6783,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
u64 extent_start = 0;
u64 extent_end = 0;
u64 objectid = btrfs_ino(inode);
- u32 found_type;
+ u8 extent_type;
struct btrfs_path *path = NULL;
struct btrfs_root *root = inode->root;
struct btrfs_file_extent_item *item;
@@ -6786,9 +6838,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
if (ret < 0) {
err = ret;
goto out;
- }
-
- if (ret != 0) {
+ } else if (ret > 0) {
if (path->slots[0] == 0)
goto not_found;
path->slots[0]--;
@@ -6797,11 +6847,9 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
leaf = path->nodes[0];
item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
- /* are we inside the extent that was found? */
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
- found_type = found_key.type;
if (found_key.objectid != objectid ||
- found_type != BTRFS_EXTENT_DATA_KEY) {
+ found_key.type != BTRFS_EXTENT_DATA_KEY) {
/*
* If we backup past the first extent we want to move forward
* and see if there is an extent in front of us, otherwise we'll
@@ -6812,16 +6860,16 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
goto next;
}
- found_type = btrfs_file_extent_type(leaf, item);
+ extent_type = btrfs_file_extent_type(leaf, item);
extent_start = found_key.offset;
- if (found_type == BTRFS_FILE_EXTENT_REG ||
- found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ if (extent_type == BTRFS_FILE_EXTENT_REG ||
+ extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
extent_end = extent_start +
btrfs_file_extent_num_bytes(leaf, item);
trace_btrfs_get_extent_show_fi_regular(inode, leaf, item,
extent_start);
- } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+ } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
size_t size;
size = btrfs_file_extent_ram_bytes(leaf, item);
@@ -6840,9 +6888,9 @@ next:
if (ret < 0) {
err = ret;
goto out;
- }
- if (ret > 0)
+ } else if (ret > 0) {
goto not_found;
+ }
leaf = path->nodes[0];
}
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
@@ -6853,19 +6901,22 @@ next:
goto not_found;
if (start > found_key.offset)
goto next;
+
+ /* New extent overlaps with existing one */
em->start = start;
em->orig_start = start;
em->len = found_key.offset - start;
- goto not_found_em;
+ em->block_start = EXTENT_MAP_HOLE;
+ goto insert;
}
btrfs_extent_item_to_extent_map(inode, path, item,
new_inline, em);
- if (found_type == BTRFS_FILE_EXTENT_REG ||
- found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ if (extent_type == BTRFS_FILE_EXTENT_REG ||
+ extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
goto insert;
- } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+ } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
unsigned long ptr;
char *map;
size_t size;
@@ -6916,7 +6967,6 @@ not_found:
em->start = start;
em->orig_start = start;
em->len = len;
-not_found_em:
em->block_start = EXTENT_MAP_HOLE;
insert:
btrfs_release_path(path);
@@ -6946,19 +6996,17 @@ out:
}
struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
- struct page *page,
- size_t pg_offset, u64 start, u64 len,
- int create)
+ u64 start, u64 len)
{
struct extent_map *em;
struct extent_map *hole_em = NULL;
- u64 range_start = start;
+ u64 delalloc_start = start;
u64 end;
- u64 found;
- u64 found_end;
+ u64 delalloc_len;
+ u64 delalloc_end;
int err = 0;
- em = btrfs_get_extent(inode, page, pg_offset, start, len, create);
+ em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
if (IS_ERR(em))
return em;
/*
@@ -6983,80 +7031,84 @@ struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
em = NULL;
/* ok, we didn't find anything, lets look for delalloc */
- found = count_range_bits(&inode->io_tree, &range_start,
+ delalloc_len = count_range_bits(&inode->io_tree, &delalloc_start,
end, len, EXTENT_DELALLOC, 1);
- found_end = range_start + found;
- if (found_end < range_start)
- found_end = (u64)-1;
+ delalloc_end = delalloc_start + delalloc_len;
+ if (delalloc_end < delalloc_start)
+ delalloc_end = (u64)-1;
/*
- * we didn't find anything useful, return
- * the original results from get_extent()
+ * We didn't find anything useful, return the original results from
+ * get_extent()
*/
- if (range_start > end || found_end <= start) {
+ if (delalloc_start > end || delalloc_end <= start) {
em = hole_em;
hole_em = NULL;
goto out;
}
- /* adjust the range_start to make sure it doesn't
- * go backwards from the start they passed in
+ /*
+ * Adjust the delalloc_start to make sure it doesn't go backwards from
+ * the start they passed in
*/
- range_start = max(start, range_start);
- found = found_end - range_start;
+ delalloc_start = max(start, delalloc_start);
+ delalloc_len = delalloc_end - delalloc_start;
- if (found > 0) {
- u64 hole_start = start;
- u64 hole_len = len;
+ if (delalloc_len > 0) {
+ u64 hole_start;
+ u64 hole_len;
+ const u64 hole_end = extent_map_end(hole_em);
em = alloc_extent_map();
if (!em) {
err = -ENOMEM;
goto out;
}
+ em->bdev = NULL;
+
+ ASSERT(hole_em);
/*
- * when btrfs_get_extent can't find anything it
- * returns one huge hole
+ * When btrfs_get_extent can't find anything it returns one
+ * huge hole
*
- * make sure what it found really fits our range, and
- * adjust to make sure it is based on the start from
- * the caller
+ * Make sure what it found really fits our range, and adjust to
+ * make sure it is based on the start from the caller
*/
- if (hole_em) {
- u64 calc_end = extent_map_end(hole_em);
-
- if (calc_end <= start || (hole_em->start > end)) {
- free_extent_map(hole_em);
- hole_em = NULL;
- } else {
- hole_start = max(hole_em->start, start);
- hole_len = calc_end - hole_start;
- }
+ if (hole_end <= start || hole_em->start > end) {
+ free_extent_map(hole_em);
+ hole_em = NULL;
+ } else {
+ hole_start = max(hole_em->start, start);
+ hole_len = hole_end - hole_start;
}
- em->bdev = NULL;
- if (hole_em && range_start > hole_start) {
- /* our hole starts before our delalloc, so we
- * have to return just the parts of the hole
- * that go until the delalloc starts
+
+ if (hole_em && delalloc_start > hole_start) {
+ /*
+ * Our hole starts before our delalloc, so we have to
+ * return just the parts of the hole that go until the
+ * delalloc starts
*/
- em->len = min(hole_len,
- range_start - hole_start);
+ em->len = min(hole_len, delalloc_start - hole_start);
em->start = hole_start;
em->orig_start = hole_start;
/*
- * don't adjust block start at all,
- * it is fixed at EXTENT_MAP_HOLE
+ * Don't adjust block start at all, it is fixed at
+ * EXTENT_MAP_HOLE
*/
em->block_start = hole_em->block_start;
em->block_len = hole_len;
if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags))
set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
} else {
- em->start = range_start;
- em->len = found;
- em->orig_start = range_start;
+ /*
+ * Hole is out of passed range or it starts after
+ * delalloc range
+ */
+ em->start = delalloc_start;
+ em->len = delalloc_len;
+ em->orig_start = delalloc_start;
em->block_start = EXTENT_MAP_DELALLOC;
- em->block_len = found;
+ em->block_len = delalloc_len;
}
} else {
return hole_em;
@@ -9912,7 +9964,6 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode
init_completion(&work->completion);
INIT_LIST_HEAD(&work->list);
work->inode = inode;
- WARN_ON_ONCE(!inode);
btrfs_init_work(&work->work, btrfs_flush_delalloc_helper,
btrfs_run_delalloc_work, NULL, NULL);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 9c8e1734429c..494f0f10d70e 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1642,7 +1642,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
btrfs_info(fs_info, "resizing devid %llu", devid);
}
- device = btrfs_find_device(fs_info, devid, NULL, NULL);
+ device = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
if (!device) {
btrfs_info(fs_info, "resizer unable to find device %llu",
devid);
@@ -3178,7 +3178,8 @@ static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info,
s_uuid = di_args->uuid;
rcu_read_lock();
- dev = btrfs_find_device(fs_info, di_args->devid, s_uuid, NULL);
+ dev = btrfs_find_device(fs_info->fs_devices, di_args->devid, s_uuid,
+ NULL, true);
if (!dev) {
ret = -ENODEV;
@@ -3241,32 +3242,17 @@ static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
lock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
}
-static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 olen,
+static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
struct inode *dst, u64 dst_loff)
{
- u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
int ret;
- u64 len = olen;
-
- if (loff + len == src->i_size)
- len = ALIGN(src->i_size, bs) - loff;
- /*
- * For same inode case we don't want our length pushed out past i_size
- * as comparing that data range makes no sense.
- *
- * This effectively means we require aligned extents for the single
- * inode case, whereas the other cases allow an unaligned length so long
- * as it ends at i_size.
- */
- if (dst == src && len != olen)
- return -EINVAL;
/*
* Lock destination range to serialize with concurrent readpages() and
* source range to serialize with relocation.
*/
btrfs_double_extent_lock(src, loff, dst, dst_loff, len);
- ret = btrfs_clone(src, dst, loff, olen, len, dst_loff, 1);
+ ret = btrfs_clone(src, dst, loff, len, len, dst_loff, 1);
btrfs_double_extent_unlock(src, loff, dst, dst_loff, len);
return ret;
@@ -3278,21 +3264,10 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
struct inode *dst, u64 dst_loff)
{
int ret;
- int num_pages = PAGE_ALIGN(BTRFS_MAX_DEDUPE_LEN) >> PAGE_SHIFT;
u64 i, tail_len, chunk_count;
- /* don't make the dst file partly checksummed */
- if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
- (BTRFS_I(dst)->flags & BTRFS_INODE_NODATASUM))
- return -EINVAL;
-
- if (IS_SWAPFILE(src) || IS_SWAPFILE(dst))
- return -ETXTBSY;
-
tail_len = olen % BTRFS_MAX_DEDUPE_LEN;
chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN);
- if (chunk_count == 0)
- num_pages = PAGE_ALIGN(tail_len) >> PAGE_SHIFT;
for (i = 0; i < chunk_count; i++) {
ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN,
@@ -3908,14 +3883,6 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
* be either compressed or non-compressed.
*/
- /* don't make the dst file partly checksummed */
- if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
- (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM))
- return -EINVAL;
-
- if (IS_SWAPFILE(src) || IS_SWAPFILE(inode))
- return -ETXTBSY;
-
/*
* VFS's generic_remap_file_range_prep() protects us from cloning the
* eof block into the middle of a file, which would result in corruption
@@ -3991,6 +3958,13 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
else
btrfs_double_inode_lock(inode_in, inode_out);
+ /* don't make the dst file partly checksummed */
+ if ((BTRFS_I(inode_in)->flags & BTRFS_INODE_NODATASUM) !=
+ (BTRFS_I(inode_out)->flags & BTRFS_INODE_NODATASUM)) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
/*
* Now that the inodes are locked, we need to start writeback ourselves
* and can not rely on the writeback from the VFS's generic helper
@@ -4381,7 +4355,7 @@ static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
&sa->progress, sa->flags & BTRFS_SCRUB_READONLY,
0);
- if (copy_to_user(arg, sa, sizeof(*sa)))
+ if (ret == 0 && copy_to_user(arg, sa, sizeof(*sa)))
ret = -EFAULT;
if (!(sa->flags & BTRFS_SCRUB_READONLY))
@@ -4414,7 +4388,7 @@ static long btrfs_ioctl_scrub_progress(struct btrfs_fs_info *fs_info,
ret = btrfs_scrub_progress(fs_info, sa->devid, &sa->progress);
- if (copy_to_user(arg, sa, sizeof(*sa)))
+ if (ret == 0 && copy_to_user(arg, sa, sizeof(*sa)))
ret = -EFAULT;
kfree(sa);
@@ -4438,7 +4412,7 @@ static long btrfs_ioctl_get_dev_stats(struct btrfs_fs_info *fs_info,
ret = btrfs_get_dev_stats(fs_info, sa);
- if (copy_to_user(arg, sa, sizeof(*sa)))
+ if (ret == 0 && copy_to_user(arg, sa, sizeof(*sa)))
ret = -EFAULT;
kfree(sa);
@@ -4484,7 +4458,7 @@ static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info,
break;
}
- if (copy_to_user(arg, p, sizeof(*p)))
+ if ((ret == 0 || ret == -ECANCELED) && copy_to_user(arg, p, sizeof(*p)))
ret = -EFAULT;
out:
kfree(p);
@@ -4790,7 +4764,7 @@ do_balance:
ret = btrfs_balance(fs_info, bctl, bargs);
bctl = NULL;
- if (arg) {
+ if ((ret == 0 || ret == -ECANCELED) && arg) {
if (copy_to_user(arg, bargs, sizeof(*bargs)))
ret = -EFAULT;
}
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 1da768e5ef75..82b84e4daad1 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -14,43 +14,58 @@
static void btrfs_assert_tree_read_locked(struct extent_buffer *eb);
-/*
- * if we currently have a spinning reader or writer lock
- * (indicated by the rw flag) this will bump the count
- * of blocking holders and drop the spinlock.
- */
-void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw)
+void btrfs_set_lock_blocking_read(struct extent_buffer *eb)
{
/*
- * no lock is required. The lock owner may change if
- * we have a read lock, but it won't change to or away
- * from us. If we have the write lock, we are the owner
- * and it'll never change.
+ * No lock is required. The lock owner may change if we have a read
+ * lock, but it won't change to or away from us. If we have the write
+ * lock, we are the owner and it'll never change.
*/
if (eb->lock_nested && current->pid == eb->lock_owner)
return;
- if (rw == BTRFS_WRITE_LOCK) {
- if (atomic_read(&eb->blocking_writers) == 0) {
- WARN_ON(atomic_read(&eb->spinning_writers) != 1);
- atomic_dec(&eb->spinning_writers);
- btrfs_assert_tree_locked(eb);
- atomic_inc(&eb->blocking_writers);
- write_unlock(&eb->lock);
- }
- } else if (rw == BTRFS_READ_LOCK) {
- btrfs_assert_tree_read_locked(eb);
- atomic_inc(&eb->blocking_readers);
- WARN_ON(atomic_read(&eb->spinning_readers) == 0);
- atomic_dec(&eb->spinning_readers);
- read_unlock(&eb->lock);
+ btrfs_assert_tree_read_locked(eb);
+ atomic_inc(&eb->blocking_readers);
+ WARN_ON(atomic_read(&eb->spinning_readers) == 0);
+ atomic_dec(&eb->spinning_readers);
+ read_unlock(&eb->lock);
+}
+
+void btrfs_set_lock_blocking_write(struct extent_buffer *eb)
+{
+ /*
+ * No lock is required. The lock owner may change if we have a read
+ * lock, but it won't change to or away from us. If we have the write
+ * lock, we are the owner and it'll never change.
+ */
+ if (eb->lock_nested && current->pid == eb->lock_owner)
+ return;
+ if (atomic_read(&eb->blocking_writers) == 0) {
+ WARN_ON(atomic_read(&eb->spinning_writers) != 1);
+ atomic_dec(&eb->spinning_writers);
+ btrfs_assert_tree_locked(eb);
+ atomic_inc(&eb->blocking_writers);
+ write_unlock(&eb->lock);
}
}
-/*
- * if we currently have a blocking lock, take the spinlock
- * and drop our blocking count
- */
-void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
+void btrfs_clear_lock_blocking_read(struct extent_buffer *eb)
+{
+ /*
+ * No lock is required. The lock owner may change if we have a read
+ * lock, but it won't change to or away from us. If we have the write
+ * lock, we are the owner and it'll never change.
+ */
+ if (eb->lock_nested && current->pid == eb->lock_owner)
+ return;
+ BUG_ON(atomic_read(&eb->blocking_readers) == 0);
+ read_lock(&eb->lock);
+ atomic_inc(&eb->spinning_readers);
+ /* atomic_dec_and_test implies a barrier */
+ if (atomic_dec_and_test(&eb->blocking_readers))
+ cond_wake_up_nomb(&eb->read_lock_wq);
+}
+
+void btrfs_clear_lock_blocking_write(struct extent_buffer *eb)
{
/*
* no lock is required. The lock owner may change if
@@ -60,23 +75,13 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
*/
if (eb->lock_nested && current->pid == eb->lock_owner)
return;
-
- if (rw == BTRFS_WRITE_LOCK_BLOCKING) {
- BUG_ON(atomic_read(&eb->blocking_writers) != 1);
- write_lock(&eb->lock);
- WARN_ON(atomic_read(&eb->spinning_writers));
- atomic_inc(&eb->spinning_writers);
- /* atomic_dec_and_test implies a barrier */
- if (atomic_dec_and_test(&eb->blocking_writers))
- cond_wake_up_nomb(&eb->write_lock_wq);
- } else if (rw == BTRFS_READ_LOCK_BLOCKING) {
- BUG_ON(atomic_read(&eb->blocking_readers) == 0);
- read_lock(&eb->lock);
- atomic_inc(&eb->spinning_readers);
- /* atomic_dec_and_test implies a barrier */
- if (atomic_dec_and_test(&eb->blocking_readers))
- cond_wake_up_nomb(&eb->read_lock_wq);
- }
+ BUG_ON(atomic_read(&eb->blocking_writers) != 1);
+ write_lock(&eb->lock);
+ WARN_ON(atomic_read(&eb->spinning_writers));
+ atomic_inc(&eb->spinning_writers);
+ /* atomic_dec_and_test implies a barrier */
+ if (atomic_dec_and_test(&eb->blocking_writers))
+ cond_wake_up_nomb(&eb->write_lock_wq);
}
/*
@@ -232,16 +237,9 @@ again:
wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0);
wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0);
write_lock(&eb->lock);
- if (atomic_read(&eb->blocking_readers)) {
+ if (atomic_read(&eb->blocking_readers) ||
+ atomic_read(&eb->blocking_writers)) {
write_unlock(&eb->lock);
- wait_event(eb->read_lock_wq,
- atomic_read(&eb->blocking_readers) == 0);
- goto again;
- }
- if (atomic_read(&eb->blocking_writers)) {
- write_unlock(&eb->lock);
- wait_event(eb->write_lock_wq,
- atomic_read(&eb->blocking_writers) == 0);
goto again;
}
WARN_ON(atomic_read(&eb->spinning_writers));
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index 29135def468e..595014f64830 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -17,8 +17,10 @@ void btrfs_tree_unlock(struct extent_buffer *eb);
void btrfs_tree_read_lock(struct extent_buffer *eb);
void btrfs_tree_read_unlock(struct extent_buffer *eb);
void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb);
-void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw);
-void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw);
+void btrfs_set_lock_blocking_read(struct extent_buffer *eb);
+void btrfs_set_lock_blocking_write(struct extent_buffer *eb);
+void btrfs_clear_lock_blocking_read(struct extent_buffer *eb);
+void btrfs_clear_lock_blocking_write(struct extent_buffer *eb);
void btrfs_assert_tree_locked(struct extent_buffer *eb);
int btrfs_try_tree_read_lock(struct extent_buffer *eb);
int btrfs_try_tree_write_lock(struct extent_buffer *eb);
@@ -37,13 +39,4 @@ static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw)
BUG();
}
-static inline void btrfs_set_lock_blocking(struct extent_buffer *eb)
-{
- btrfs_set_lock_blocking_rw(eb, BTRFS_WRITE_LOCK);
-}
-
-static inline void btrfs_clear_lock_blocking(struct extent_buffer *eb)
-{
- btrfs_clear_lock_blocking_rw(eb, BTRFS_WRITE_LOCK_BLOCKING);
-}
#endif
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index 90639140439f..579d53ae256f 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -61,6 +61,28 @@ struct workspace {
struct list_head list;
};
+static struct workspace_manager wsm;
+
+static void lzo_init_workspace_manager(void)
+{
+ btrfs_init_workspace_manager(&wsm, &btrfs_lzo_compress);
+}
+
+static void lzo_cleanup_workspace_manager(void)
+{
+ btrfs_cleanup_workspace_manager(&wsm);
+}
+
+static struct list_head *lzo_get_workspace(unsigned int level)
+{
+ return btrfs_get_workspace(&wsm, level);
+}
+
+static void lzo_put_workspace(struct list_head *ws)
+{
+ btrfs_put_workspace(&wsm, ws);
+}
+
static void lzo_free_workspace(struct list_head *ws)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
@@ -71,7 +93,7 @@ static void lzo_free_workspace(struct list_head *ws)
kfree(workspace);
}
-static struct list_head *lzo_alloc_workspace(void)
+static struct list_head *lzo_alloc_workspace(unsigned int level)
{
struct workspace *workspace;
@@ -485,11 +507,16 @@ out:
return ret;
}
-static void lzo_set_level(struct list_head *ws, unsigned int type)
+static unsigned int lzo_set_level(unsigned int level)
{
+ return 0;
}
const struct btrfs_compress_op btrfs_lzo_compress = {
+ .init_workspace_manager = lzo_init_workspace_manager,
+ .cleanup_workspace_manager = lzo_cleanup_workspace_manager,
+ .get_workspace = lzo_get_workspace,
+ .put_workspace = lzo_put_workspace,
.alloc_workspace = lzo_alloc_workspace,
.free_workspace = lzo_free_workspace,
.compress_pages = lzo_compress_pages,
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 4e473a998219..c1cd5558a646 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1546,12 +1546,18 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
parent_node = *p;
entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record,
node);
- if (bytenr < entry->bytenr)
+ if (bytenr < entry->bytenr) {
p = &(*p)->rb_left;
- else if (bytenr > entry->bytenr)
+ } else if (bytenr > entry->bytenr) {
p = &(*p)->rb_right;
- else
+ } else {
+ if (record->data_rsv && !entry->data_rsv) {
+ entry->data_rsv = record->data_rsv;
+ entry->data_rsv_refroot =
+ record->data_rsv_refroot;
+ }
return 1;
+ }
}
rb_link_node(&record->node, parent_node, p);
@@ -1597,7 +1603,7 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)
|| bytenr == 0 || num_bytes == 0)
return 0;
- record = kmalloc(sizeof(*record), gfp_flag);
+ record = kzalloc(sizeof(*record), gfp_flag);
if (!record)
return -ENOMEM;
@@ -1832,7 +1838,7 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans,
src_path->nodes[cur_level] = eb;
btrfs_tree_read_lock(eb);
- btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ btrfs_set_lock_blocking_read(eb);
src_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING;
}
@@ -1973,7 +1979,7 @@ static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans,
dst_path->slots[cur_level] = 0;
btrfs_tree_read_lock(eb);
- btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ btrfs_set_lock_blocking_read(eb);
dst_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING;
need_cleanup = true;
}
@@ -2017,86 +2023,30 @@ out:
return ret;
}
-/*
- * Inform qgroup to trace subtree swap used in balance.
- *
- * Unlike btrfs_qgroup_trace_subtree(), this function will only trace
- * new tree blocks whose generation is equal to (or larger than) @last_snapshot.
- *
- * Will go down the tree block pointed by @dst_eb (pointed by @dst_parent and
- * @dst_slot), and find any tree blocks whose generation is at @last_snapshot,
- * and then go down @src_eb (pointed by @src_parent and @src_slot) to find
- * the counterpart of the tree block, then mark both tree blocks as qgroup dirty,
- * and skip all tree blocks whose generation is smaller than last_snapshot.
- *
- * This would skip tons of tree blocks of original btrfs_qgroup_trace_subtree(),
- * which could be the cause of very slow balance if the file tree is large.
- *
- * @src_parent, @src_slot: pointer to src (file tree) eb.
- * @dst_parent, @dst_slot: pointer to dst (reloc tree) eb.
- */
-int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *bg_cache,
- struct extent_buffer *src_parent, int src_slot,
- struct extent_buffer *dst_parent, int dst_slot,
- u64 last_snapshot)
+static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
+ struct extent_buffer *src_eb,
+ struct extent_buffer *dst_eb,
+ u64 last_snapshot, bool trace_leaf)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_path *dst_path = NULL;
- struct btrfs_key first_key;
- struct extent_buffer *src_eb = NULL;
- struct extent_buffer *dst_eb = NULL;
- bool trace_leaf = false;
- u64 child_gen;
- u64 child_bytenr;
int level;
int ret;
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
return 0;
- /* Check parameter order */
- if (btrfs_node_ptr_generation(src_parent, src_slot) >
- btrfs_node_ptr_generation(dst_parent, dst_slot)) {
+ /* Wrong parameter order */
+ if (btrfs_header_generation(src_eb) > btrfs_header_generation(dst_eb)) {
btrfs_err_rl(fs_info,
"%s: bad parameter order, src_gen=%llu dst_gen=%llu", __func__,
- btrfs_node_ptr_generation(src_parent, src_slot),
- btrfs_node_ptr_generation(dst_parent, dst_slot));
+ btrfs_header_generation(src_eb),
+ btrfs_header_generation(dst_eb));
return -EUCLEAN;
}
- /*
- * Only trace leaf if we're relocating data block groups, this could
- * reduce tons of data extents tracing for meta/sys bg relocation.
- */
- if (bg_cache->flags & BTRFS_BLOCK_GROUP_DATA)
- trace_leaf = true;
- /* Read out real @src_eb, pointed by @src_parent and @src_slot */
- child_bytenr = btrfs_node_blockptr(src_parent, src_slot);
- child_gen = btrfs_node_ptr_generation(src_parent, src_slot);
- btrfs_node_key_to_cpu(src_parent, &first_key, src_slot);
-
- src_eb = read_tree_block(fs_info, child_bytenr, child_gen,
- btrfs_header_level(src_parent) - 1, &first_key);
- if (IS_ERR(src_eb)) {
- ret = PTR_ERR(src_eb);
- goto out;
- }
-
- /* Read out real @dst_eb, pointed by @src_parent and @src_slot */
- child_bytenr = btrfs_node_blockptr(dst_parent, dst_slot);
- child_gen = btrfs_node_ptr_generation(dst_parent, dst_slot);
- btrfs_node_key_to_cpu(dst_parent, &first_key, dst_slot);
-
- dst_eb = read_tree_block(fs_info, child_bytenr, child_gen,
- btrfs_header_level(dst_parent) - 1, &first_key);
- if (IS_ERR(dst_eb)) {
- ret = PTR_ERR(dst_eb);
- goto out;
- }
-
if (!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb)) {
- ret = -EINVAL;
+ ret = -EIO;
goto out;
}
@@ -2106,14 +2056,13 @@ int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
ret = -ENOMEM;
goto out;
}
-
/* For dst_path */
extent_buffer_get(dst_eb);
dst_path->nodes[level] = dst_eb;
dst_path->slots[level] = 0;
dst_path->locks[level] = 0;
- /* Do the generation-aware breadth-first search */
+ /* Do the generation aware breadth-first search */
ret = qgroup_trace_new_subtree_blocks(trans, src_eb, dst_path, level,
level, last_snapshot, trace_leaf);
if (ret < 0)
@@ -2121,8 +2070,6 @@ int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
ret = 0;
out:
- free_extent_buffer(src_eb);
- free_extent_buffer(dst_eb);
btrfs_free_path(dst_path);
if (ret < 0)
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
@@ -2207,7 +2154,7 @@ walk_down:
path->slots[level] = 0;
btrfs_tree_read_lock(eb);
- btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ btrfs_set_lock_blocking_read(eb);
path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
ret = btrfs_qgroup_trace_extent(trans, child_bytenr,
@@ -2576,6 +2523,11 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
goto cleanup;
}
+ /* Free the reserved data space */
+ btrfs_qgroup_free_refroot(fs_info,
+ record->data_rsv_refroot,
+ record->data_rsv,
+ BTRFS_QGROUP_RSV_DATA);
/*
* Use SEQ_LAST as time_seq to do special search, which
* doesn't lock tree or delayed_refs and search current
@@ -2842,16 +2794,15 @@ out:
/*
* Two limits to commit transaction in advance.
*
- * For RATIO, it will be 1/RATIO of the remaining limit
- * (excluding data and prealloc meta) as threshold.
+ * For RATIO, it will be 1/RATIO of the remaining limit as threshold.
* For SIZE, it will be in byte unit as threshold.
*/
-#define QGROUP_PERTRANS_RATIO 32
-#define QGROUP_PERTRANS_SIZE SZ_32M
+#define QGROUP_FREE_RATIO 32
+#define QGROUP_FREE_SIZE SZ_32M
static bool qgroup_check_limits(struct btrfs_fs_info *fs_info,
const struct btrfs_qgroup *qg, u64 num_bytes)
{
- u64 limit;
+ u64 free;
u64 threshold;
if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
@@ -2870,20 +2821,21 @@ static bool qgroup_check_limits(struct btrfs_fs_info *fs_info,
*/
if ((qg->lim_flags & (BTRFS_QGROUP_LIMIT_MAX_RFER |
BTRFS_QGROUP_LIMIT_MAX_EXCL))) {
- if (qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL)
- limit = qg->max_excl;
- else
- limit = qg->max_rfer;
- threshold = (limit - qg->rsv.values[BTRFS_QGROUP_RSV_DATA] -
- qg->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC]) /
- QGROUP_PERTRANS_RATIO;
- threshold = min_t(u64, threshold, QGROUP_PERTRANS_SIZE);
+ if (qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) {
+ free = qg->max_excl - qgroup_rsv_total(qg) - qg->excl;
+ threshold = min_t(u64, qg->max_excl / QGROUP_FREE_RATIO,
+ QGROUP_FREE_SIZE);
+ } else {
+ free = qg->max_rfer - qgroup_rsv_total(qg) - qg->rfer;
+ threshold = min_t(u64, qg->max_rfer / QGROUP_FREE_RATIO,
+ QGROUP_FREE_SIZE);
+ }
/*
* Use transaction_kthread to commit transaction, so we no
* longer need to bother nested transaction nor lock context.
*/
- if (qg->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS] > threshold)
+ if (free < threshold)
btrfs_commit_transaction_locksafe(fs_info);
}
@@ -2959,7 +2911,6 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce,
qg = unode_aux_to_qgroup(unode);
- trace_qgroup_update_reserve(fs_info, qg, num_bytes, type);
qgroup_rsv_add(fs_info, qg, num_bytes, type);
}
@@ -3026,7 +2977,6 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
qg = unode_aux_to_qgroup(unode);
- trace_qgroup_update_reserve(fs_info, qg, -(s64)num_bytes, type);
qgroup_rsv_release(fs_info, qg, num_bytes, type);
list_for_each_entry(glist, &qg->groups, next_group) {
@@ -3783,3 +3733,241 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode)
}
extent_changeset_release(&changeset);
}
+
+void btrfs_qgroup_init_swapped_blocks(
+ struct btrfs_qgroup_swapped_blocks *swapped_blocks)
+{
+ int i;
+
+ spin_lock_init(&swapped_blocks->lock);
+ for (i = 0; i < BTRFS_MAX_LEVEL; i++)
+ swapped_blocks->blocks[i] = RB_ROOT;
+ swapped_blocks->swapped = false;
+}
+
+/*
+ * Delete all swapped blocks record of @root.
+ * Every record here means we skipped a full subtree scan for qgroup.
+ *
+ * Gets called when committing one transaction.
+ */
+void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root)
+{
+ struct btrfs_qgroup_swapped_blocks *swapped_blocks;
+ int i;
+
+ swapped_blocks = &root->swapped_blocks;
+
+ spin_lock(&swapped_blocks->lock);
+ if (!swapped_blocks->swapped)
+ goto out;
+ for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
+ struct rb_root *cur_root = &swapped_blocks->blocks[i];
+ struct btrfs_qgroup_swapped_block *entry;
+ struct btrfs_qgroup_swapped_block *next;
+
+ rbtree_postorder_for_each_entry_safe(entry, next, cur_root,
+ node)
+ kfree(entry);
+ swapped_blocks->blocks[i] = RB_ROOT;
+ }
+ swapped_blocks->swapped = false;
+out:
+ spin_unlock(&swapped_blocks->lock);
+}
+
+/*
+ * Add subtree roots record into @subvol_root.
+ *
+ * @subvol_root: tree root of the subvolume tree get swapped
+ * @bg: block group under balance
+ * @subvol_parent/slot: pointer to the subtree root in subvolume tree
+ * @reloc_parent/slot: pointer to the subtree root in reloc tree
+ * BOTH POINTERS ARE BEFORE TREE SWAP
+ * @last_snapshot: last snapshot generation of the subvolume tree
+ */
+int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
+ struct btrfs_root *subvol_root,
+ struct btrfs_block_group_cache *bg,
+ struct extent_buffer *subvol_parent, int subvol_slot,
+ struct extent_buffer *reloc_parent, int reloc_slot,
+ u64 last_snapshot)
+{
+ struct btrfs_fs_info *fs_info = subvol_root->fs_info;
+ struct btrfs_qgroup_swapped_blocks *blocks = &subvol_root->swapped_blocks;
+ struct btrfs_qgroup_swapped_block *block;
+ struct rb_node **cur;
+ struct rb_node *parent = NULL;
+ int level = btrfs_header_level(subvol_parent) - 1;
+ int ret = 0;
+
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ return 0;
+
+ if (btrfs_node_ptr_generation(subvol_parent, subvol_slot) >
+ btrfs_node_ptr_generation(reloc_parent, reloc_slot)) {
+ btrfs_err_rl(fs_info,
+ "%s: bad parameter order, subvol_gen=%llu reloc_gen=%llu",
+ __func__,
+ btrfs_node_ptr_generation(subvol_parent, subvol_slot),
+ btrfs_node_ptr_generation(reloc_parent, reloc_slot));
+ return -EUCLEAN;
+ }
+
+ block = kmalloc(sizeof(*block), GFP_NOFS);
+ if (!block) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /*
+ * @reloc_parent/slot is still before swap, while @block is going to
+ * record the bytenr after swap, so we do the swap here.
+ */
+ block->subvol_bytenr = btrfs_node_blockptr(reloc_parent, reloc_slot);
+ block->subvol_generation = btrfs_node_ptr_generation(reloc_parent,
+ reloc_slot);
+ block->reloc_bytenr = btrfs_node_blockptr(subvol_parent, subvol_slot);
+ block->reloc_generation = btrfs_node_ptr_generation(subvol_parent,
+ subvol_slot);
+ block->last_snapshot = last_snapshot;
+ block->level = level;
+ if (bg->flags & BTRFS_BLOCK_GROUP_DATA)
+ block->trace_leaf = true;
+ else
+ block->trace_leaf = false;
+ btrfs_node_key_to_cpu(reloc_parent, &block->first_key, reloc_slot);
+
+ /* Insert @block into @blocks */
+ spin_lock(&blocks->lock);
+ cur = &blocks->blocks[level].rb_node;
+ while (*cur) {
+ struct btrfs_qgroup_swapped_block *entry;
+
+ parent = *cur;
+ entry = rb_entry(parent, struct btrfs_qgroup_swapped_block,
+ node);
+
+ if (entry->subvol_bytenr < block->subvol_bytenr) {
+ cur = &(*cur)->rb_left;
+ } else if (entry->subvol_bytenr > block->subvol_bytenr) {
+ cur = &(*cur)->rb_right;
+ } else {
+ if (entry->subvol_generation !=
+ block->subvol_generation ||
+ entry->reloc_bytenr != block->reloc_bytenr ||
+ entry->reloc_generation !=
+ block->reloc_generation) {
+ /*
+ * Duplicated but mismatch entry found.
+ * Shouldn't happen.
+ *
+ * Marking qgroup inconsistent should be enough
+ * for end users.
+ */
+ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+ ret = -EEXIST;
+ }
+ kfree(block);
+ goto out_unlock;
+ }
+ }
+ rb_link_node(&block->node, parent, cur);
+ rb_insert_color(&block->node, &blocks->blocks[level]);
+ blocks->swapped = true;
+out_unlock:
+ spin_unlock(&blocks->lock);
+out:
+ if (ret < 0)
+ fs_info->qgroup_flags |=
+ BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ return ret;
+}
+
+/*
+ * Check if the tree block is a subtree root, and if so do the needed
+ * delayed subtree trace for qgroup.
+ *
+ * This is called during btrfs_cow_block().
+ */
+int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *subvol_eb)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_qgroup_swapped_blocks *blocks = &root->swapped_blocks;
+ struct btrfs_qgroup_swapped_block *block;
+ struct extent_buffer *reloc_eb = NULL;
+ struct rb_node *node;
+ bool found = false;
+ bool swapped = false;
+ int level = btrfs_header_level(subvol_eb);
+ int ret = 0;
+ int i;
+
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ return 0;
+ if (!is_fstree(root->root_key.objectid) || !root->reloc_root)
+ return 0;
+
+ spin_lock(&blocks->lock);
+ if (!blocks->swapped) {
+ spin_unlock(&blocks->lock);
+ return 0;
+ }
+ node = blocks->blocks[level].rb_node;
+
+ while (node) {
+ block = rb_entry(node, struct btrfs_qgroup_swapped_block, node);
+ if (block->subvol_bytenr < subvol_eb->start) {
+ node = node->rb_left;
+ } else if (block->subvol_bytenr > subvol_eb->start) {
+ node = node->rb_right;
+ } else {
+ found = true;
+ break;
+ }
+ }
+ if (!found) {
+ spin_unlock(&blocks->lock);
+ goto out;
+ }
+ /* Found one, remove it from @blocks first and update blocks->swapped */
+ rb_erase(&block->node, &blocks->blocks[level]);
+ for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
+ if (RB_EMPTY_ROOT(&blocks->blocks[i])) {
+ swapped = true;
+ break;
+ }
+ }
+ blocks->swapped = swapped;
+ spin_unlock(&blocks->lock);
+
+ /* Read out reloc subtree root */
+ reloc_eb = read_tree_block(fs_info, block->reloc_bytenr,
+ block->reloc_generation, block->level,
+ &block->first_key);
+ if (IS_ERR(reloc_eb)) {
+ ret = PTR_ERR(reloc_eb);
+ reloc_eb = NULL;
+ goto free_out;
+ }
+ if (!extent_buffer_uptodate(reloc_eb)) {
+ ret = -EIO;
+ goto free_out;
+ }
+
+ ret = qgroup_trace_subtree_swap(trans, reloc_eb, subvol_eb,
+ block->last_snapshot, block->trace_leaf);
+free_out:
+ kfree(block);
+ free_extent_buffer(reloc_eb);
+out:
+ if (ret < 0) {
+ btrfs_err_rl(fs_info,
+ "failed to account subtree at bytenr %llu: %d",
+ subvol_eb->start, ret);
+ fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ }
+ return ret;
+}
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 20c6bd5fa701..46ba7bd2961c 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -6,6 +6,8 @@
#ifndef BTRFS_QGROUP_H
#define BTRFS_QGROUP_H
+#include <linux/spinlock.h>
+#include <linux/rbtree.h>
#include "ulist.h"
#include "delayed-ref.h"
@@ -38,6 +40,66 @@
*/
/*
+ * Special performance optimization for balance.
+ *
+ * For balance, we need to swap subtree of subvolume and reloc trees.
+ * In theory, we need to trace all subtree blocks of both subvolume and reloc
+ * trees, since their owner has changed during such swap.
+ *
+ * However since balance has ensured that both subtrees are containing the
+ * same contents and have the same tree structures, such swap won't cause
+ * qgroup number change.
+ *
+ * But there is a race window between subtree swap and transaction commit,
+ * during that window, if we increase/decrease tree level or merge/split tree
+ * blocks, we still need to trace the original subtrees.
+ *
+ * So for balance, we use a delayed subtree tracing, whose workflow is:
+ *
+ * 1) Record the subtree root block get swapped.
+ *
+ * During subtree swap:
+ * O = Old tree blocks
+ * N = New tree blocks
+ * reloc tree subvolume tree X
+ * Root Root
+ * / \ / \
+ * NA OB OA OB
+ * / | | \ / | | \
+ * NC ND OE OF OC OD OE OF
+ *
+ * In this case, NA and OA are going to be swapped, record (NA, OA) into
+ * subvolume tree X.
+ *
+ * 2) After subtree swap.
+ * reloc tree subvolume tree X
+ * Root Root
+ * / \ / \
+ * OA OB NA OB
+ * / | | \ / | | \
+ * OC OD OE OF NC ND OE OF
+ *
+ * 3a) COW happens for OB
+ * If we are going to COW tree block OB, we check OB's bytenr against
+ * tree X's swapped_blocks structure.
+ * If it doesn't fit any, nothing will happen.
+ *
+ * 3b) COW happens for NA
+ * Check NA's bytenr against tree X's swapped_blocks, and get a hit.
+ * Then we do subtree scan on both subtrees OA and NA.
+ * Resulting 6 tree blocks to be scanned (OA, OC, OD, NA, NC, ND).
+ *
+ * Then no matter what we do to subvolume tree X, qgroup numbers will
+ * still be correct.
+ * Then NA's record gets removed from X's swapped_blocks.
+ *
+ * 4) Transaction commit
+ * Any record in X's swapped_blocks gets removed, since there is no
+ * modification to the swapped subtrees, no need to trigger heavy qgroup
+ * subtree rescan for them.
+ */
+
+/*
* Record a dirty extent, and info qgroup to update quota on it
* TODO: Use kmem cache to alloc it.
*/
@@ -45,9 +107,38 @@ struct btrfs_qgroup_extent_record {
struct rb_node node;
u64 bytenr;
u64 num_bytes;
+
+ /*
+ * For qgroup reserved data space freeing.
+ *
+ * @data_rsv_refroot and @data_rsv will be recorded after
+ * BTRFS_ADD_DELAYED_EXTENT is called.
+ * And will be used to free reserved qgroup space at
+ * transaction commit time.
+ */
+ u32 data_rsv; /* reserved data space needs to be freed */
+ u64 data_rsv_refroot; /* which root the reserved data belongs to */
struct ulist *old_roots;
};
+struct btrfs_qgroup_swapped_block {
+ struct rb_node node;
+
+ int level;
+ bool trace_leaf;
+
+ /* bytenr/generation of the tree block in subvolume tree after swap */
+ u64 subvol_bytenr;
+ u64 subvol_generation;
+
+ /* bytenr/generation of the tree block in reloc tree after swap */
+ u64 reloc_bytenr;
+ u64 reloc_generation;
+
+ u64 last_snapshot;
+ struct btrfs_key first_key;
+};
+
/*
* Qgroup reservation types:
*
@@ -236,12 +327,6 @@ int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
struct extent_buffer *root_eb,
u64 root_gen, int root_level);
-
-int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *bg_cache,
- struct extent_buffer *src_parent, int src_slot,
- struct extent_buffer *dst_parent, int dst_slot,
- u64 last_snapshot);
int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
u64 num_bytes, struct ulist *old_roots,
struct ulist *new_roots);
@@ -252,15 +337,6 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
u64 ref_root, u64 num_bytes,
enum btrfs_qgroup_rsv_type type);
-static inline void btrfs_qgroup_free_delayed_ref(struct btrfs_fs_info *fs_info,
- u64 ref_root, u64 num_bytes)
-{
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
- return;
- trace_btrfs_qgroup_free_delayed_ref(fs_info, ref_root, num_bytes);
- btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes,
- BTRFS_QGROUP_RSV_DATA);
-}
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
@@ -325,4 +401,18 @@ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes);
void btrfs_qgroup_check_reserved_leak(struct inode *inode);
+/* btrfs_qgroup_swapped_blocks related functions */
+void btrfs_qgroup_init_swapped_blocks(
+ struct btrfs_qgroup_swapped_blocks *swapped_blocks);
+
+void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root);
+int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
+ struct btrfs_root *subvol_root,
+ struct btrfs_block_group_cache *bg,
+ struct extent_buffer *subvol_parent, int subvol_slot,
+ struct extent_buffer *reloc_parent, int reloc_slot,
+ u64 last_snapshot);
+int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct extent_buffer *eb);
+
#endif
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index c3557c12656b..d09b6cdb785a 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -583,7 +583,7 @@ static int walk_down_tree(struct btrfs_root *root, struct btrfs_path *path,
return -EIO;
}
btrfs_tree_read_lock(eb);
- btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ btrfs_set_lock_blocking_read(eb);
path->nodes[level-1] = eb;
path->slots[level-1] = 0;
path->locks[level-1] = BTRFS_READ_LOCK_BLOCKING;
@@ -987,7 +987,7 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
return -ENOMEM;
eb = btrfs_read_lock_root_node(fs_info->extent_root);
- btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ btrfs_set_lock_blocking_read(eb);
level = btrfs_header_level(eb);
path->nodes[level] = eb;
path->slots[level] = 0;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 272b287f8cf0..ddf028509931 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -162,6 +162,8 @@ struct reloc_control {
struct mapping_tree reloc_root_tree;
/* list of reloc trees */
struct list_head reloc_roots;
+ /* list of subvolume trees that get relocated */
+ struct list_head dirty_subvol_roots;
/* size of metadata reservation for merging reloc trees */
u64 merging_rsv_size;
/* size of relocated tree nodes */
@@ -1467,15 +1469,17 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
struct btrfs_root_item *root_item;
int ret;
- if (!root->reloc_root)
+ if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state) ||
+ !root->reloc_root)
goto out;
reloc_root = root->reloc_root;
root_item = &reloc_root->root_item;
+ /* root->reloc_root will stay until current relocation finished */
if (fs_info->reloc_ctl->merge_reloc_tree &&
btrfs_root_refs(root_item) == 0) {
- root->reloc_root = NULL;
+ set_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
__del_reloc_root(reloc_root);
}
@@ -1773,7 +1777,7 @@ again:
btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
eb = btrfs_lock_root_node(dest);
- btrfs_set_lock_blocking(eb);
+ btrfs_set_lock_blocking_write(eb);
level = btrfs_header_level(eb);
if (level < lowest_level) {
@@ -1786,7 +1790,7 @@ again:
ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
BUG_ON(ret);
}
- btrfs_set_lock_blocking(eb);
+ btrfs_set_lock_blocking_write(eb);
if (next_key) {
next_key->objectid = (u64)-1;
@@ -1802,6 +1806,8 @@ again:
BUG_ON(level < lowest_level);
ret = btrfs_bin_search(parent, &key, level, &slot);
+ if (ret < 0)
+ break;
if (ret && slot > 0)
slot--;
@@ -1852,7 +1858,7 @@ again:
slot, &eb);
BUG_ON(ret);
}
- btrfs_set_lock_blocking(eb);
+ btrfs_set_lock_blocking_write(eb);
btrfs_tree_unlock(parent);
free_extent_buffer(parent);
@@ -1885,15 +1891,18 @@ again:
* If not traced, we will leak data numbers
* 2) Fs subtree
* If not traced, we will double count old data
- * and tree block numbers, if current trans doesn't free
- * data reloc tree inode.
+ *
+ * We don't scan the subtree right now, but only record
+ * the swapped tree blocks.
+ * The real subtree rescan is delayed until we have new
+ * CoW on the subtree root node before transaction commit.
*/
- ret = btrfs_qgroup_trace_subtree_swap(trans, rc->block_group,
- parent, slot, path->nodes[level],
- path->slots[level], last_snapshot);
+ ret = btrfs_qgroup_add_swapped_blocks(trans, dest,
+ rc->block_group, parent, slot,
+ path->nodes[level], path->slots[level],
+ last_snapshot);
if (ret < 0)
break;
-
/*
* swap blocks in fs tree and reloc tree.
*/
@@ -2121,6 +2130,58 @@ static int find_next_key(struct btrfs_path *path, int level,
}
/*
+ * Insert current subvolume into reloc_control::dirty_subvol_roots
+ */
+static void insert_dirty_subvol(struct btrfs_trans_handle *trans,
+ struct reloc_control *rc,
+ struct btrfs_root *root)
+{
+ struct btrfs_root *reloc_root = root->reloc_root;
+ struct btrfs_root_item *reloc_root_item;
+
+ /* @root must be a subvolume tree root with a valid reloc tree */
+ ASSERT(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
+ ASSERT(reloc_root);
+
+ reloc_root_item = &reloc_root->root_item;
+ memset(&reloc_root_item->drop_progress, 0,
+ sizeof(reloc_root_item->drop_progress));
+ reloc_root_item->drop_level = 0;
+ btrfs_set_root_refs(reloc_root_item, 0);
+ btrfs_update_reloc_root(trans, root);
+
+ if (list_empty(&root->reloc_dirty_list)) {
+ btrfs_grab_fs_root(root);
+ list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots);
+ }
+}
+
+static int clean_dirty_subvols(struct reloc_control *rc)
+{
+ struct btrfs_root *root;
+ struct btrfs_root *next;
+ int ret = 0;
+
+ list_for_each_entry_safe(root, next, &rc->dirty_subvol_roots,
+ reloc_dirty_list) {
+ struct btrfs_root *reloc_root = root->reloc_root;
+
+ clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
+ list_del_init(&root->reloc_dirty_list);
+ root->reloc_root = NULL;
+ if (reloc_root) {
+ int ret2;
+
+ ret2 = btrfs_drop_snapshot(reloc_root, NULL, 0, 1);
+ if (ret2 < 0 && !ret)
+ ret = ret2;
+ }
+ btrfs_put_fs_root(root);
+ }
+ return ret;
+}
+
+/*
* merge the relocated tree blocks in reloc tree with corresponding
* fs tree.
*/
@@ -2128,7 +2189,6 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
- LIST_HEAD(inode_list);
struct btrfs_key key;
struct btrfs_key next_key;
struct btrfs_trans_handle *trans = NULL;
@@ -2259,13 +2319,8 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
out:
btrfs_free_path(path);
- if (err == 0) {
- memset(&root_item->drop_progress, 0,
- sizeof(root_item->drop_progress));
- root_item->drop_level = 0;
- btrfs_set_root_refs(root_item, 0);
- btrfs_update_reloc_root(trans, root);
- }
+ if (err == 0)
+ insert_dirty_subvol(trans, rc, root);
if (trans)
btrfs_end_transaction_throttle(trans);
@@ -2410,14 +2465,6 @@ again:
} else {
list_del_init(&reloc_root->root_list);
}
-
- ret = btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1);
- if (ret < 0) {
- if (list_empty(&reloc_root->root_list))
- list_add_tail(&reloc_root->root_list,
- &reloc_roots);
- goto out;
- }
}
if (found) {
@@ -2685,6 +2732,10 @@ static int do_relocation(struct btrfs_trans_handle *trans,
if (!lowest) {
ret = btrfs_bin_search(upper->eb, key,
upper->level, &slot);
+ if (ret < 0) {
+ err = ret;
+ goto next;
+ }
BUG_ON(ret);
bytenr = btrfs_node_blockptr(upper->eb, slot);
if (node->eb->start == bytenr)
@@ -2720,6 +2771,10 @@ static int do_relocation(struct btrfs_trans_handle *trans,
} else {
ret = btrfs_bin_search(upper->eb, key, upper->level,
&slot);
+ if (ret < 0) {
+ err = ret;
+ goto next;
+ }
BUG_ON(ret);
}
@@ -2752,7 +2807,7 @@ static int do_relocation(struct btrfs_trans_handle *trans,
goto next;
}
btrfs_tree_lock(eb);
- btrfs_set_lock_blocking(eb);
+ btrfs_set_lock_blocking_write(eb);
if (!node->eb) {
ret = btrfs_cow_block(trans, root, eb, upper->eb,
@@ -4079,6 +4134,9 @@ restart:
goto out_free;
}
btrfs_commit_transaction(trans);
+ ret = clean_dirty_subvols(rc);
+ if (ret < 0 && !err)
+ err = ret;
out_free:
btrfs_free_block_rsv(fs_info, rc->block_rsv);
btrfs_free_path(path);
@@ -4173,6 +4231,7 @@ static struct reloc_control *alloc_reloc_control(void)
return NULL;
INIT_LIST_HEAD(&rc->reloc_roots);
+ INIT_LIST_HEAD(&rc->dirty_subvol_roots);
backref_cache_init(&rc->backref_cache);
mapping_tree_init(&rc->reloc_root_tree);
extent_io_tree_init(&rc->processed_blocks, NULL);
@@ -4468,6 +4527,10 @@ int btrfs_recover_relocation(struct btrfs_root *root)
goto out_free;
}
err = btrfs_commit_transaction(trans);
+
+ ret = clean_dirty_subvols(rc);
+ if (ret < 0 && !err)
+ err = ret;
out_free:
kfree(rc);
out:
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 65bda0682928..0d2b957ca3a3 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -21,12 +21,12 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot,
struct btrfs_root_item *item)
{
uuid_le uuid;
- int len;
+ u32 len;
int need_reset = 0;
len = btrfs_item_size_nr(eb, slot);
read_extent_buffer(eb, item, btrfs_item_ptr_offset(eb, slot),
- min_t(int, len, (int)sizeof(*item)));
+ min_t(u32, len, sizeof(*item)));
if (len < sizeof(*item))
need_reset = 1;
if (!need_reset && btrfs_root_generation(item)
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 6dcd36d7b849..a99588536c79 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -584,6 +584,7 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
sctx->curr = -1;
sctx->fs_info = fs_info;
+ INIT_LIST_HEAD(&sctx->csum_list);
for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
struct scrub_bio *sbio;
@@ -608,7 +609,6 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
atomic_set(&sctx->workers_pending, 0);
atomic_set(&sctx->cancel_req, 0);
sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
- INIT_LIST_HEAD(&sctx->csum_list);
spin_lock_init(&sctx->list_lock);
spin_lock_init(&sctx->stat_lock);
@@ -3741,25 +3741,33 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
int max_active = fs_info->thread_pool_size;
- if (fs_info->scrub_workers_refcnt == 0) {
+ lockdep_assert_held(&fs_info->scrub_lock);
+
+ if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
+ ASSERT(fs_info->scrub_workers == NULL);
fs_info->scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub",
flags, is_dev_replace ? 1 : max_active, 4);
if (!fs_info->scrub_workers)
goto fail_scrub_workers;
+ ASSERT(fs_info->scrub_wr_completion_workers == NULL);
fs_info->scrub_wr_completion_workers =
btrfs_alloc_workqueue(fs_info, "scrubwrc", flags,
max_active, 2);
if (!fs_info->scrub_wr_completion_workers)
goto fail_scrub_wr_completion_workers;
+ ASSERT(fs_info->scrub_parity_workers == NULL);
fs_info->scrub_parity_workers =
btrfs_alloc_workqueue(fs_info, "scrubparity", flags,
max_active, 2);
if (!fs_info->scrub_parity_workers)
goto fail_scrub_parity_workers;
+
+ refcount_set(&fs_info->scrub_workers_refcnt, 1);
+ } else {
+ refcount_inc(&fs_info->scrub_workers_refcnt);
}
- ++fs_info->scrub_workers_refcnt;
return 0;
fail_scrub_parity_workers:
@@ -3770,16 +3778,6 @@ fail_scrub_workers:
return -ENOMEM;
}
-static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
-{
- if (--fs_info->scrub_workers_refcnt == 0) {
- btrfs_destroy_workqueue(fs_info->scrub_workers);
- btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
- btrfs_destroy_workqueue(fs_info->scrub_parity_workers);
- }
- WARN_ON(fs_info->scrub_workers_refcnt < 0);
-}
-
int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
u64 end, struct btrfs_scrub_progress *progress,
int readonly, int is_dev_replace)
@@ -3788,6 +3786,9 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
int ret;
struct btrfs_device *dev;
unsigned int nofs_flag;
+ struct btrfs_workqueue *scrub_workers = NULL;
+ struct btrfs_workqueue *scrub_wr_comp = NULL;
+ struct btrfs_workqueue *scrub_parity = NULL;
if (btrfs_fs_closing(fs_info))
return -EINVAL;
@@ -3835,7 +3836,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
return PTR_ERR(sctx);
mutex_lock(&fs_info->fs_devices->device_list_mutex);
- dev = btrfs_find_device(fs_info, devid, NULL, NULL);
+ dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
!is_dev_replace)) {
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
@@ -3903,6 +3904,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
*/
nofs_flag = memalloc_nofs_save();
if (!is_dev_replace) {
+ btrfs_info(fs_info, "scrub: started on devid %llu", devid);
/*
* by holding device list mutex, we can
* kick off writing super in log tree sync.
@@ -3925,11 +3927,26 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
if (progress)
memcpy(progress, &sctx->stat, sizeof(*progress));
+ if (!is_dev_replace)
+ btrfs_info(fs_info, "scrub: %s on devid %llu with status: %d",
+ ret ? "not finished" : "finished", devid, ret);
+
mutex_lock(&fs_info->scrub_lock);
dev->scrub_ctx = NULL;
- scrub_workers_put(fs_info);
+ if (refcount_dec_and_test(&fs_info->scrub_workers_refcnt)) {
+ scrub_workers = fs_info->scrub_workers;
+ scrub_wr_comp = fs_info->scrub_wr_completion_workers;
+ scrub_parity = fs_info->scrub_parity_workers;
+
+ fs_info->scrub_workers = NULL;
+ fs_info->scrub_wr_completion_workers = NULL;
+ fs_info->scrub_parity_workers = NULL;
+ }
mutex_unlock(&fs_info->scrub_lock);
+ btrfs_destroy_workqueue(scrub_workers);
+ btrfs_destroy_workqueue(scrub_wr_comp);
+ btrfs_destroy_workqueue(scrub_parity);
scrub_put_ctx(sctx);
return ret;
@@ -4012,7 +4029,7 @@ int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
struct scrub_ctx *sctx = NULL;
mutex_lock(&fs_info->fs_devices->device_list_mutex);
- dev = btrfs_find_device(fs_info, devid, NULL, NULL);
+ dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
if (dev)
sctx = dev->scrub_ctx;
if (sctx)
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 0a3f122dd61f..120e4340792a 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -529,7 +529,9 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
if (token != Opt_compress &&
token != Opt_compress_force)
info->compress_level =
- btrfs_compress_str2level(args[0].from);
+ btrfs_compress_str2level(
+ BTRFS_COMPRESS_ZLIB,
+ args[0].from + 4);
btrfs_set_opt(info->mount_opt, COMPRESS);
btrfs_clear_opt(info->mount_opt, NODATACOW);
btrfs_clear_opt(info->mount_opt, NODATASUM);
@@ -542,9 +544,13 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
btrfs_clear_opt(info->mount_opt, NODATASUM);
btrfs_set_fs_incompat(info, COMPRESS_LZO);
no_compress = 0;
- } else if (strcmp(args[0].from, "zstd") == 0) {
+ } else if (strncmp(args[0].from, "zstd", 4) == 0) {
compress_type = "zstd";
info->compress_type = BTRFS_COMPRESS_ZSTD;
+ info->compress_level =
+ btrfs_compress_str2level(
+ BTRFS_COMPRESS_ZSTD,
+ args[0].from + 4);
btrfs_set_opt(info->mount_opt, COMPRESS);
btrfs_clear_opt(info->mount_opt, NODATACOW);
btrfs_clear_opt(info->mount_opt, NODATASUM);
@@ -2190,6 +2196,9 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
ret = PTR_ERR_OR_ZERO(device);
mutex_unlock(&uuid_mutex);
break;
+ case BTRFS_IOC_FORGET_DEV:
+ ret = btrfs_forget_devices(vol->name);
+ break;
case BTRFS_IOC_DEVICES_READY:
mutex_lock(&uuid_mutex);
device = btrfs_scan_one_device(vol->name, FMODE_READ,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 4ec2b660d014..acdad6d658f5 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -122,6 +122,7 @@ static noinline void switch_commit_roots(struct btrfs_transaction *trans)
if (is_fstree(root->root_key.objectid))
btrfs_unpin_free_ino(root);
clear_btree_io_tree(&root->dirty_log_pages);
+ btrfs_qgroup_clean_swapped_blocks(root);
}
/* We can free old roots now. */
@@ -845,8 +846,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
btrfs_trans_release_metadata(trans);
trans->block_rsv = NULL;
- if (!list_empty(&trans->new_bgs))
- btrfs_create_pending_block_groups(trans);
+ btrfs_create_pending_block_groups(trans);
btrfs_trans_release_chunk_metadata(trans);
@@ -1532,7 +1532,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
goto fail;
}
- btrfs_set_lock_blocking(old);
+ btrfs_set_lock_blocking_write(old);
ret = btrfs_copy_root(trans, root, old, &tmp, objectid);
/* clean up in any case */
@@ -1943,8 +1943,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
cur_trans->delayed_refs.flushing = 1;
smp_wmb();
- if (!list_empty(&trans->new_bgs))
- btrfs_create_pending_block_groups(trans);
+ btrfs_create_pending_block_groups(trans);
ret = btrfs_run_delayed_refs(trans, 0);
if (ret) {
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 3c0987ab587d..5f9e2dd413af 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -52,7 +52,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
u32 nritems;
root_node = btrfs_lock_root_node(root);
- btrfs_set_lock_blocking(root_node);
+ btrfs_set_lock_blocking_write(root_node);
nritems = btrfs_header_nritems(root_node);
root->defrag_max.objectid = 0;
/* from above we know this is not a leaf */
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index ac232b3d6d7e..f06454a55e00 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -27,6 +27,7 @@
#define LOG_INODE_ALL 0
#define LOG_INODE_EXISTS 1
#define LOG_OTHER_INODE 2
+#define LOG_OTHER_INODE_ALL 3
/*
* directory trouble cases
@@ -1330,6 +1331,67 @@ out:
return ret;
}
+static int add_link(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct inode *dir, struct inode *inode, const char *name,
+ int namelen, u64 ref_index)
+{
+ struct btrfs_dir_item *dir_item;
+ struct btrfs_key key;
+ struct btrfs_path *path;
+ struct inode *other_inode = NULL;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ dir_item = btrfs_lookup_dir_item(NULL, root, path,
+ btrfs_ino(BTRFS_I(dir)),
+ name, namelen, 0);
+ if (!dir_item) {
+ btrfs_release_path(path);
+ goto add_link;
+ } else if (IS_ERR(dir_item)) {
+ ret = PTR_ERR(dir_item);
+ goto out;
+ }
+
+ /*
+ * Our inode's dentry collides with the dentry of another inode which is
+ * in the log but not yet processed since it has a higher inode number.
+ * So delete that other dentry.
+ */
+ btrfs_dir_item_key_to_cpu(path->nodes[0], dir_item, &key);
+ btrfs_release_path(path);
+ other_inode = read_one_inode(root, key.objectid);
+ if (!other_inode) {
+ ret = -ENOENT;
+ goto out;
+ }
+ ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), BTRFS_I(other_inode),
+ name, namelen);
+ if (ret)
+ goto out;
+ /*
+ * If we dropped the link count to 0, bump it so that later the iput()
+ * on the inode will not free it. We will fixup the link count later.
+ */
+ if (other_inode->i_nlink == 0)
+ inc_nlink(other_inode);
+
+ ret = btrfs_run_delayed_items(trans);
+ if (ret)
+ goto out;
+add_link:
+ ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
+ name, namelen, 0, ref_index);
+out:
+ iput(other_inode);
+ btrfs_free_path(path);
+
+ return ret;
+}
+
/*
* replay one inode back reference item found in the log tree.
* eb, slot and key refer to the buffer and key found in the log tree.
@@ -1466,9 +1528,8 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
goto out;
/* insert our name */
- ret = btrfs_add_link(trans, BTRFS_I(dir),
- BTRFS_I(inode),
- name, namelen, 0, ref_index);
+ ret = add_link(trans, root, dir, inode, name, namelen,
+ ref_index);
if (ret)
goto out;
@@ -2663,7 +2724,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
if (trans) {
btrfs_tree_lock(next);
- btrfs_set_lock_blocking(next);
+ btrfs_set_lock_blocking_write(next);
clean_tree_block(fs_info, next);
btrfs_wait_tree_block_writeback(next);
btrfs_tree_unlock(next);
@@ -2747,7 +2808,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
if (trans) {
btrfs_tree_lock(next);
- btrfs_set_lock_blocking(next);
+ btrfs_set_lock_blocking_write(next);
clean_tree_block(fs_info, next);
btrfs_wait_tree_block_writeback(next);
btrfs_tree_unlock(next);
@@ -2829,7 +2890,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
if (trans) {
btrfs_tree_lock(next);
- btrfs_set_lock_blocking(next);
+ btrfs_set_lock_blocking_write(next);
clean_tree_block(fs_info, next);
btrfs_wait_tree_block_writeback(next);
btrfs_tree_unlock(next);
@@ -3706,6 +3767,8 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
found_key.type = 0;
ret = btrfs_bin_search(path->nodes[0], &found_key, 0,
&start_slot);
+ if (ret < 0)
+ break;
ret = btrfs_del_items(trans, log, path, start_slot,
path->slots[0] - start_slot + 1);
@@ -4717,7 +4780,7 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb,
const int slot,
const struct btrfs_key *key,
struct btrfs_inode *inode,
- u64 *other_ino)
+ u64 *other_ino, u64 *other_parent)
{
int ret;
struct btrfs_path *search_path;
@@ -4780,8 +4843,13 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb,
btrfs_dir_item_key_to_cpu(search_path->nodes[0],
di, &di_key);
if (di_key.type == BTRFS_INODE_ITEM_KEY) {
- ret = 1;
- *other_ino = di_key.objectid;
+ if (di_key.objectid != key->objectid) {
+ ret = 1;
+ *other_ino = di_key.objectid;
+ *other_parent = parent;
+ } else {
+ ret = 0;
+ }
} else {
ret = -EAGAIN;
}
@@ -4801,6 +4869,144 @@ out:
return ret;
}
+struct btrfs_ino_list {
+ u64 ino;
+ u64 parent;
+ struct list_head list;
+};
+
+static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_log_ctx *ctx,
+ u64 ino, u64 parent)
+{
+ struct btrfs_ino_list *ino_elem;
+ LIST_HEAD(inode_list);
+ int ret = 0;
+
+ ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
+ if (!ino_elem)
+ return -ENOMEM;
+ ino_elem->ino = ino;
+ ino_elem->parent = parent;
+ list_add_tail(&ino_elem->list, &inode_list);
+
+ while (!list_empty(&inode_list)) {
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_key key;
+ struct inode *inode;
+
+ ino_elem = list_first_entry(&inode_list, struct btrfs_ino_list,
+ list);
+ ino = ino_elem->ino;
+ parent = ino_elem->parent;
+ list_del(&ino_elem->list);
+ kfree(ino_elem);
+ if (ret)
+ continue;
+
+ btrfs_release_path(path);
+
+ key.objectid = ino;
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+ inode = btrfs_iget(fs_info->sb, &key, root, NULL);
+ /*
+ * If the other inode that had a conflicting dir entry was
+ * deleted in the current transaction, we need to log its parent
+ * directory.
+ */
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ if (ret == -ENOENT) {
+ key.objectid = parent;
+ inode = btrfs_iget(fs_info->sb, &key, root,
+ NULL);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ } else {
+ ret = btrfs_log_inode(trans, root,
+ BTRFS_I(inode),
+ LOG_OTHER_INODE_ALL,
+ 0, LLONG_MAX, ctx);
+ iput(inode);
+ }
+ }
+ continue;
+ }
+ /*
+ * We are safe logging the other inode without acquiring its
+ * lock as long as we log with the LOG_INODE_EXISTS mode. We
+ * are safe against concurrent renames of the other inode as
+ * well because during a rename we pin the log and update the
+ * log with the new name before we unpin it.
+ */
+ ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
+ LOG_OTHER_INODE, 0, LLONG_MAX, ctx);
+ if (ret) {
+ iput(inode);
+ continue;
+ }
+
+ key.objectid = ino;
+ key.type = BTRFS_INODE_REF_KEY;
+ key.offset = 0;
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0) {
+ iput(inode);
+ continue;
+ }
+
+ while (true) {
+ struct extent_buffer *leaf = path->nodes[0];
+ int slot = path->slots[0];
+ u64 other_ino = 0;
+ u64 other_parent = 0;
+
+ if (slot >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0) {
+ break;
+ } else if (ret > 0) {
+ ret = 0;
+ break;
+ }
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.objectid != ino ||
+ (key.type != BTRFS_INODE_REF_KEY &&
+ key.type != BTRFS_INODE_EXTREF_KEY)) {
+ ret = 0;
+ break;
+ }
+
+ ret = btrfs_check_ref_name_override(leaf, slot, &key,
+ BTRFS_I(inode), &other_ino,
+ &other_parent);
+ if (ret < 0)
+ break;
+ if (ret > 0) {
+ ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
+ if (!ino_elem) {
+ ret = -ENOMEM;
+ break;
+ }
+ ino_elem->ino = other_ino;
+ ino_elem->parent = other_parent;
+ list_add_tail(&ino_elem->list, &inode_list);
+ ret = 0;
+ }
+ path->slots[0]++;
+ }
+ iput(inode);
+ }
+
+ return ret;
+}
+
/* log a single inode in the tree log.
* At least one parent directory for this inode must exist in the tree
* or be logged already.
@@ -4840,6 +5046,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
u64 logged_isize = 0;
bool need_log_inode_item = true;
bool xattrs_logged = false;
+ bool recursive_logging = false;
path = btrfs_alloc_path();
if (!path)
@@ -4885,8 +5092,12 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
return ret;
}
- if (inode_only == LOG_OTHER_INODE) {
- inode_only = LOG_INODE_EXISTS;
+ if (inode_only == LOG_OTHER_INODE || inode_only == LOG_OTHER_INODE_ALL) {
+ recursive_logging = true;
+ if (inode_only == LOG_OTHER_INODE)
+ inode_only = LOG_INODE_EXISTS;
+ else
+ inode_only = LOG_INODE_ALL;
mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING);
} else {
mutex_lock(&inode->log_mutex);
@@ -4981,20 +5192,19 @@ again:
if ((min_key.type == BTRFS_INODE_REF_KEY ||
min_key.type == BTRFS_INODE_EXTREF_KEY) &&
- inode->generation == trans->transid) {
+ inode->generation == trans->transid &&
+ !recursive_logging) {
u64 other_ino = 0;
+ u64 other_parent = 0;
ret = btrfs_check_ref_name_override(path->nodes[0],
path->slots[0], &min_key, inode,
- &other_ino);
+ &other_ino, &other_parent);
if (ret < 0) {
err = ret;
goto out_unlock;
} else if (ret > 0 && ctx &&
other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
- struct btrfs_key inode_key;
- struct inode *other_inode;
-
if (ins_nr > 0) {
ins_nr++;
} else {
@@ -5010,43 +5220,13 @@ again:
goto out_unlock;
}
ins_nr = 0;
- btrfs_release_path(path);
- inode_key.objectid = other_ino;
- inode_key.type = BTRFS_INODE_ITEM_KEY;
- inode_key.offset = 0;
- other_inode = btrfs_iget(fs_info->sb,
- &inode_key, root,
- NULL);
- /*
- * If the other inode that had a conflicting dir
- * entry was deleted in the current transaction,
- * we don't need to do more work nor fallback to
- * a transaction commit.
- */
- if (other_inode == ERR_PTR(-ENOENT)) {
- goto next_key;
- } else if (IS_ERR(other_inode)) {
- err = PTR_ERR(other_inode);
- goto out_unlock;
- }
- /*
- * We are safe logging the other inode without
- * acquiring its i_mutex as long as we log with
- * the LOG_INODE_EXISTS mode. We're safe against
- * concurrent renames of the other inode as well
- * because during a rename we pin the log and
- * update the log with the new name before we
- * unpin it.
- */
- err = btrfs_log_inode(trans, root,
- BTRFS_I(other_inode),
- LOG_OTHER_INODE, 0, LLONG_MAX,
- ctx);
- iput(other_inode);
+
+ err = log_conflicting_inodes(trans, root, path,
+ ctx, other_ino, other_parent);
if (err)
goto out_unlock;
- else
- goto next_key;
+ btrfs_release_path(path);
+ goto next_key;
}
}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 15561926ab32..9024eee889b9 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -415,27 +415,6 @@ static struct btrfs_device *__alloc_device(void)
return dev;
}
-/*
- * Find a device specified by @devid or @uuid in the list of @fs_devices, or
- * return NULL.
- *
- * If devid and uuid are both specified, the match must be exact, otherwise
- * only devid is used.
- */
-static struct btrfs_device *find_device(struct btrfs_fs_devices *fs_devices,
- u64 devid, const u8 *uuid)
-{
- struct btrfs_device *dev;
-
- list_for_each_entry(dev, &fs_devices->devices, dev_list) {
- if (dev->devid == devid &&
- (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
- return dev;
- }
- }
- return NULL;
-}
-
static noinline struct btrfs_fs_devices *find_fsid(
const u8 *fsid, const u8 *metadata_fsid)
{
@@ -734,6 +713,17 @@ static void pending_bios_fn(struct btrfs_work *work)
run_scheduled_bios(device);
}
+static bool device_path_matched(const char *path, struct btrfs_device *device)
+{
+ int found;
+
+ rcu_read_lock();
+ found = strcmp(rcu_str_deref(device->name), path);
+ rcu_read_unlock();
+
+ return found == 0;
+}
+
/*
* Search and remove all stale (devices which are not mounted) devices.
* When both inputs are NULL, it will search and release all stale devices.
@@ -741,52 +731,57 @@ static void pending_bios_fn(struct btrfs_work *work)
* matching this path only.
* skip_dev: Optional. Will skip this device when searching for the stale
* devices.
+ * Return: 0 for success or if @path is NULL.
+ * -EBUSY if @path is a mounted device.
+ * -ENOENT if @path does not match any device in the list.
*/
-static void btrfs_free_stale_devices(const char *path,
+static int btrfs_free_stale_devices(const char *path,
struct btrfs_device *skip_device)
{
struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
struct btrfs_device *device, *tmp_device;
+ int ret = 0;
+
+ if (path)
+ ret = -ENOENT;
list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {
- mutex_lock(&fs_devices->device_list_mutex);
- if (fs_devices->opened) {
- mutex_unlock(&fs_devices->device_list_mutex);
- continue;
- }
+ mutex_lock(&fs_devices->device_list_mutex);
list_for_each_entry_safe(device, tmp_device,
&fs_devices->devices, dev_list) {
- int not_found = 0;
-
if (skip_device && skip_device == device)
continue;
if (path && !device->name)
continue;
-
- rcu_read_lock();
- if (path)
- not_found = strcmp(rcu_str_deref(device->name),
- path);
- rcu_read_unlock();
- if (not_found)
+ if (path && !device_path_matched(path, device))
continue;
+ if (fs_devices->opened) {
+ /* for an already deleted device return 0 */
+ if (path && ret != 0)
+ ret = -EBUSY;
+ break;
+ }
/* delete the stale device */
fs_devices->num_devices--;
list_del(&device->dev_list);
btrfs_free_device(device);
+ ret = 0;
if (fs_devices->num_devices == 0)
break;
}
mutex_unlock(&fs_devices->device_list_mutex);
+
if (fs_devices->num_devices == 0) {
btrfs_sysfs_remove_fsid(fs_devices);
list_del(&fs_devices->fs_list);
free_fs_devices(fs_devices);
}
}
+
+ return ret;
}
static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
@@ -968,8 +963,8 @@ static noinline struct btrfs_device *device_list_add(const char *path,
device = NULL;
} else {
mutex_lock(&fs_devices->device_list_mutex);
- device = find_device(fs_devices, devid,
- disk_super->dev_item.uuid);
+ device = btrfs_find_device(fs_devices, devid,
+ disk_super->dev_item.uuid, NULL, false);
/*
* If this disk has been pulled into an fs devices created by
@@ -1134,7 +1129,6 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
mutex_lock(&orig->device_list_mutex);
fs_devices->total_devices = orig->total_devices;
- /* We have held the volume lock, it is safe to get the devices. */
list_for_each_entry(orig_dev, &orig->devices, dev_list) {
struct rcu_string *name;
@@ -1451,6 +1445,17 @@ static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr,
return 0;
}
+int btrfs_forget_devices(const char *path)
+{
+ int ret;
+
+ mutex_lock(&uuid_mutex);
+ ret = btrfs_free_stale_devices(strlen(path) ? path : NULL, NULL);
+ mutex_unlock(&uuid_mutex);
+
+ return ret;
+}
+
/*
* Look for a btrfs signature on a device. This may be called out of the mount path
* and we are not allowed to call set_blocksize during the scan. The superblock
@@ -2385,11 +2390,11 @@ static struct btrfs_device *btrfs_find_device_by_path(
devid = btrfs_stack_device_id(&disk_super->dev_item);
dev_uuid = disk_super->dev_item.uuid;
if (btrfs_fs_incompat(fs_info, METADATA_UUID))
- device = btrfs_find_device(fs_info, devid, dev_uuid,
- disk_super->metadata_uuid);
+ device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
+ disk_super->metadata_uuid, true);
else
- device = btrfs_find_device(fs_info, devid,
- dev_uuid, disk_super->fsid);
+ device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
+ disk_super->fsid, true);
brelse(bh);
if (!device)
@@ -2398,50 +2403,38 @@ static struct btrfs_device *btrfs_find_device_by_path(
return device;
}
-static struct btrfs_device *btrfs_find_device_missing_or_by_path(
- struct btrfs_fs_info *fs_info, const char *device_path)
-{
- struct btrfs_device *device = NULL;
- if (strcmp(device_path, "missing") == 0) {
- struct list_head *devices;
- struct btrfs_device *tmp;
-
- devices = &fs_info->fs_devices->devices;
- list_for_each_entry(tmp, devices, dev_list) {
- if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
- &tmp->dev_state) && !tmp->bdev) {
- device = tmp;
- break;
- }
- }
-
- if (!device)
- return ERR_PTR(-ENOENT);
- } else {
- device = btrfs_find_device_by_path(fs_info, device_path);
- }
-
- return device;
-}
-
/*
* Lookup a device given by device id, or the path if the id is 0.
*/
struct btrfs_device *btrfs_find_device_by_devspec(
- struct btrfs_fs_info *fs_info, u64 devid, const char *devpath)
+ struct btrfs_fs_info *fs_info, u64 devid,
+ const char *device_path)
{
struct btrfs_device *device;
if (devid) {
- device = btrfs_find_device(fs_info, devid, NULL, NULL);
+ device = btrfs_find_device(fs_info->fs_devices, devid, NULL,
+ NULL, true);
if (!device)
return ERR_PTR(-ENOENT);
- } else {
- if (!devpath || !devpath[0])
- return ERR_PTR(-EINVAL);
- device = btrfs_find_device_missing_or_by_path(fs_info, devpath);
+ return device;
}
- return device;
+
+ if (!device_path || !device_path[0])
+ return ERR_PTR(-EINVAL);
+
+ if (strcmp(device_path, "missing") == 0) {
+ /* Find first missing device */
+ list_for_each_entry(device, &fs_info->fs_devices->devices,
+ dev_list) {
+ if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
+ &device->dev_state) && !device->bdev)
+ return device;
+ }
+ return ERR_PTR(-ENOENT);
+ }
+
+ return btrfs_find_device_by_path(fs_info, device_path);
}
/*
@@ -2563,7 +2556,8 @@ next_slot:
BTRFS_UUID_SIZE);
read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
BTRFS_FSID_SIZE);
- device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid);
+ device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
+ fs_uuid, true);
BUG_ON(!device); /* Logic error */
if (device->fs_devices->seeding) {
@@ -6616,21 +6610,36 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
return BLK_STS_OK;
}
-struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
- u8 *uuid, u8 *fsid)
+/*
+ * Find a device specified by @devid or @uuid in the list of @fs_devices, or
+ * return NULL.
+ *
+ * If devid and uuid are both specified, the match must be exact, otherwise
+ * only devid is used.
+ *
+ * If @seed is true, traverse through the seed devices.
+ */
+struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
+ u64 devid, u8 *uuid, u8 *fsid,
+ bool seed)
{
struct btrfs_device *device;
- struct btrfs_fs_devices *cur_devices;
- cur_devices = fs_info->fs_devices;
- while (cur_devices) {
+ while (fs_devices) {
if (!fsid ||
- !memcmp(cur_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
- device = find_device(cur_devices, devid, uuid);
- if (device)
- return device;
+ !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
+ list_for_each_entry(device, &fs_devices->devices,
+ dev_list) {
+ if (device->devid == devid &&
+ (!uuid || memcmp(device->uuid, uuid,
+ BTRFS_UUID_SIZE) == 0))
+ return device;
+ }
}
- cur_devices = cur_devices->seed;
+ if (seed)
+ fs_devices = fs_devices->seed;
+ else
+ return NULL;
}
return NULL;
}
@@ -6782,10 +6791,10 @@ static int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info,
}
if ((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) ||
- (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes < 1) ||
+ (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes != 2) ||
(type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) ||
(type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) ||
- (type & BTRFS_BLOCK_GROUP_DUP && num_stripes > 2) ||
+ (type & BTRFS_BLOCK_GROUP_DUP && num_stripes != 2) ||
((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 &&
num_stripes != 1)) {
btrfs_err(fs_info,
@@ -6875,8 +6884,8 @@ static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
read_extent_buffer(leaf, uuid, (unsigned long)
btrfs_stripe_dev_uuid_nr(chunk, i),
BTRFS_UUID_SIZE);
- map->stripes[i].dev = btrfs_find_device(fs_info, devid,
- uuid, NULL);
+ map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices,
+ devid, uuid, NULL, true);
if (!map->stripes[i].dev &&
!btrfs_test_opt(fs_info, DEGRADED)) {
free_extent_map(em);
@@ -7015,7 +7024,8 @@ static int read_one_dev(struct btrfs_fs_info *fs_info,
return PTR_ERR(fs_devices);
}
- device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid);
+ device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
+ fs_uuid, true);
if (!device) {
if (!btrfs_test_opt(fs_info, DEGRADED)) {
btrfs_report_missing_device(fs_info, devid,
@@ -7605,7 +7615,8 @@ int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
int i;
mutex_lock(&fs_devices->device_list_mutex);
- dev = btrfs_find_device(fs_info, stats->devid, NULL, NULL);
+ dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL,
+ true);
mutex_unlock(&fs_devices->device_list_mutex);
if (!dev) {
@@ -7819,7 +7830,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
}
/* Make sure no dev extent is beyond device bondary */
- dev = btrfs_find_device(fs_info, devid, NULL, NULL);
+ dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
if (!dev) {
btrfs_err(fs_info, "failed to find devid %llu", devid);
ret = -EUCLEAN;
@@ -7828,7 +7839,8 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
/* It's possible this device is a dummy for seed device */
if (dev->disk_total_bytes == 0) {
- dev = find_device(fs_info->fs_devices->seed, devid, NULL);
+ dev = btrfs_find_device(fs_info->fs_devices->seed, devid, NULL,
+ NULL, false);
if (!dev) {
btrfs_err(fs_info, "failed to find seed devid %llu",
devid);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index ed806649a473..3ad9d58d1b66 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -416,6 +416,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
fmode_t flags, void *holder);
struct btrfs_device *btrfs_scan_one_device(const char *path,
fmode_t flags, void *holder);
+int btrfs_forget_devices(const char *path);
int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step);
void btrfs_assign_next_active_device(struct btrfs_device *device,
@@ -433,8 +434,8 @@ void __exit btrfs_cleanup_fs_uuids(void);
int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len);
int btrfs_grow_device(struct btrfs_trans_handle *trans,
struct btrfs_device *device, u64 new_size);
-struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
- u8 *uuid, u8 *fsid);
+struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
+ u64 devid, u8 *uuid, u8 *fsid, bool seed);
int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *path);
int btrfs_balance(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 970ff3e35bb3..b86b7ad6b900 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -27,6 +27,33 @@ struct workspace {
int level;
};
+static struct workspace_manager wsm;
+
+static void zlib_init_workspace_manager(void)
+{
+ btrfs_init_workspace_manager(&wsm, &btrfs_zlib_compress);
+}
+
+static void zlib_cleanup_workspace_manager(void)
+{
+ btrfs_cleanup_workspace_manager(&wsm);
+}
+
+static struct list_head *zlib_get_workspace(unsigned int level)
+{
+ struct list_head *ws = btrfs_get_workspace(&wsm, level);
+ struct workspace *workspace = list_entry(ws, struct workspace, list);
+
+ workspace->level = level;
+
+ return ws;
+}
+
+static void zlib_put_workspace(struct list_head *ws)
+{
+ btrfs_put_workspace(&wsm, ws);
+}
+
static void zlib_free_workspace(struct list_head *ws)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
@@ -36,7 +63,7 @@ static void zlib_free_workspace(struct list_head *ws)
kfree(workspace);
}
-static struct list_head *zlib_alloc_workspace(void)
+static struct list_head *zlib_alloc_workspace(unsigned int level)
{
struct workspace *workspace;
int workspacesize;
@@ -48,6 +75,7 @@ static struct list_head *zlib_alloc_workspace(void)
workspacesize = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
zlib_inflate_workspacesize());
workspace->strm.workspace = kvmalloc(workspacesize, GFP_KERNEL);
+ workspace->level = level;
workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
if (!workspace->strm.workspace || !workspace->buf)
goto fail;
@@ -390,18 +418,19 @@ next:
return ret;
}
-static void zlib_set_level(struct list_head *ws, unsigned int type)
+static unsigned int zlib_set_level(unsigned int level)
{
- struct workspace *workspace = list_entry(ws, struct workspace, list);
- unsigned level = (type & 0xF0) >> 4;
-
- if (level > 9)
- level = 9;
+ if (!level)
+ return BTRFS_ZLIB_DEFAULT_LEVEL;
- workspace->level = level > 0 ? level : 3;
+ return min_t(unsigned int, level, 9);
}
const struct btrfs_compress_op btrfs_zlib_compress = {
+ .init_workspace_manager = zlib_init_workspace_manager,
+ .cleanup_workspace_manager = zlib_cleanup_workspace_manager,
+ .get_workspace = zlib_get_workspace,
+ .put_workspace = zlib_put_workspace,
.alloc_workspace = zlib_alloc_workspace,
.free_workspace = zlib_free_workspace,
.compress_pages = zlib_compress_pages,
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index af6ec59972f5..3e418a3aeb11 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -6,25 +6,31 @@
*/
#include <linux/bio.h>
+#include <linux/bitmap.h>
#include <linux/err.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/mm.h>
+#include <linux/sched/mm.h>
#include <linux/pagemap.h>
#include <linux/refcount.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/zstd.h>
#include "compression.h"
+#include "ctree.h"
#define ZSTD_BTRFS_MAX_WINDOWLOG 17
#define ZSTD_BTRFS_MAX_INPUT (1 << ZSTD_BTRFS_MAX_WINDOWLOG)
#define ZSTD_BTRFS_DEFAULT_LEVEL 3
+#define ZSTD_BTRFS_MAX_LEVEL 15
+/* 307s to avoid pathologically clashing with transaction commit */
+#define ZSTD_BTRFS_RECLAIM_JIFFIES (307 * HZ)
-static ZSTD_parameters zstd_get_btrfs_parameters(size_t src_len)
+static ZSTD_parameters zstd_get_btrfs_parameters(unsigned int level,
+ size_t src_len)
{
- ZSTD_parameters params = ZSTD_getParams(ZSTD_BTRFS_DEFAULT_LEVEL,
- src_len, 0);
+ ZSTD_parameters params = ZSTD_getParams(level, src_len, 0);
if (params.cParams.windowLog > ZSTD_BTRFS_MAX_WINDOWLOG)
params.cParams.windowLog = ZSTD_BTRFS_MAX_WINDOWLOG;
@@ -36,11 +42,290 @@ struct workspace {
void *mem;
size_t size;
char *buf;
+ unsigned int level;
+ unsigned int req_level;
+ unsigned long last_used; /* jiffies */
struct list_head list;
+ struct list_head lru_list;
ZSTD_inBuffer in_buf;
ZSTD_outBuffer out_buf;
};
+/*
+ * Zstd Workspace Management
+ *
+ * Zstd workspaces have different memory requirements depending on the level.
+ * The zstd workspaces are managed by having individual lists for each level
+ * and a global lru. Forward progress is maintained by protecting a max level
+ * workspace.
+ *
+ * Getting a workspace is done by using the bitmap to identify the levels that
+ * have available workspaces and scans up. This lets us recycle higher level
+ * workspaces because of the monotonic memory guarantee. A workspace's
+ * last_used is only updated if it is being used by the corresponding memory
+ * level. Putting a workspace involves adding it back to the appropriate places
+ * and adding it back to the lru if necessary.
+ *
+ * A timer is used to reclaim workspaces if they have not been used for
+ * ZSTD_BTRFS_RECLAIM_JIFFIES. This helps keep only active workspaces around.
+ * The upper bound is provided by the workqueue limit which is 2 (percpu limit).
+ */
+
+struct zstd_workspace_manager {
+ const struct btrfs_compress_op *ops;
+ spinlock_t lock;
+ struct list_head lru_list;
+ struct list_head idle_ws[ZSTD_BTRFS_MAX_LEVEL];
+ unsigned long active_map;
+ wait_queue_head_t wait;
+ struct timer_list timer;
+};
+
+static struct zstd_workspace_manager wsm;
+
+static size_t zstd_ws_mem_sizes[ZSTD_BTRFS_MAX_LEVEL];
+
+static inline struct workspace *list_to_workspace(struct list_head *list)
+{
+ return container_of(list, struct workspace, list);
+}
+
+/*
+ * zstd_reclaim_timer_fn - reclaim timer
+ * @t: timer
+ *
+ * This scans the lru_list and attempts to reclaim any workspace that hasn't
+ * been used for ZSTD_BTRFS_RECLAIM_JIFFIES.
+ */
+static void zstd_reclaim_timer_fn(struct timer_list *timer)
+{
+ unsigned long reclaim_threshold = jiffies - ZSTD_BTRFS_RECLAIM_JIFFIES;
+ struct list_head *pos, *next;
+
+ spin_lock(&wsm.lock);
+
+ if (list_empty(&wsm.lru_list)) {
+ spin_unlock(&wsm.lock);
+ return;
+ }
+
+ list_for_each_prev_safe(pos, next, &wsm.lru_list) {
+ struct workspace *victim = container_of(pos, struct workspace,
+ lru_list);
+ unsigned int level;
+
+ if (time_after(victim->last_used, reclaim_threshold))
+ break;
+
+ /* workspace is in use */
+ if (victim->req_level)
+ continue;
+
+ level = victim->level;
+ list_del(&victim->lru_list);
+ list_del(&victim->list);
+ wsm.ops->free_workspace(&victim->list);
+
+ if (list_empty(&wsm.idle_ws[level - 1]))
+ clear_bit(level - 1, &wsm.active_map);
+
+ }
+
+ if (!list_empty(&wsm.lru_list))
+ mod_timer(&wsm.timer, jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES);
+
+ spin_unlock(&wsm.lock);
+}
+
+/*
+ * zstd_calc_ws_mem_sizes - calculate monotonic memory bounds
+ *
+ * It is possible based on the level configurations that a higher level
+ * workspace uses less memory than a lower level workspace. In order to reuse
+ * workspaces, this must be made a monotonic relationship. This precomputes
+ * the required memory for each level and enforces the monotonicity between
+ * level and memory required.
+ */
+static void zstd_calc_ws_mem_sizes(void)
+{
+ size_t max_size = 0;
+ unsigned int level;
+
+ for (level = 1; level <= ZSTD_BTRFS_MAX_LEVEL; level++) {
+ ZSTD_parameters params =
+ zstd_get_btrfs_parameters(level, ZSTD_BTRFS_MAX_INPUT);
+ size_t level_size =
+ max_t(size_t,
+ ZSTD_CStreamWorkspaceBound(params.cParams),
+ ZSTD_DStreamWorkspaceBound(ZSTD_BTRFS_MAX_INPUT));
+
+ max_size = max_t(size_t, max_size, level_size);
+ zstd_ws_mem_sizes[level - 1] = max_size;
+ }
+}
+
+static void zstd_init_workspace_manager(void)
+{
+ struct list_head *ws;
+ int i;
+
+ zstd_calc_ws_mem_sizes();
+
+ wsm.ops = &btrfs_zstd_compress;
+ spin_lock_init(&wsm.lock);
+ init_waitqueue_head(&wsm.wait);
+ timer_setup(&wsm.timer, zstd_reclaim_timer_fn, 0);
+
+ INIT_LIST_HEAD(&wsm.lru_list);
+ for (i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++)
+ INIT_LIST_HEAD(&wsm.idle_ws[i]);
+
+ ws = wsm.ops->alloc_workspace(ZSTD_BTRFS_MAX_LEVEL);
+ if (IS_ERR(ws)) {
+ pr_warn(
+ "BTRFS: cannot preallocate zstd compression workspace\n");
+ } else {
+ set_bit(ZSTD_BTRFS_MAX_LEVEL - 1, &wsm.active_map);
+ list_add(ws, &wsm.idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1]);
+ }
+}
+
+static void zstd_cleanup_workspace_manager(void)
+{
+ struct workspace *workspace;
+ int i;
+
+ del_timer(&wsm.timer);
+
+ for (i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++) {
+ while (!list_empty(&wsm.idle_ws[i])) {
+ workspace = container_of(wsm.idle_ws[i].next,
+ struct workspace, list);
+ list_del(&workspace->list);
+ list_del(&workspace->lru_list);
+ wsm.ops->free_workspace(&workspace->list);
+ }
+ }
+}
+
+/*
+ * zstd_find_workspace - find workspace
+ * @level: compression level
+ *
+ * This iterates over the set bits in the active_map beginning at the requested
+ * compression level. This lets us utilize already allocated workspaces before
+ * allocating a new one. If the workspace is of a larger size, it is used, but
+ * the place in the lru_list and last_used times are not updated. This is to
+ * offer the opportunity to reclaim the workspace in favor of allocating an
+ * appropriately sized one in the future.
+ */
+static struct list_head *zstd_find_workspace(unsigned int level)
+{
+ struct list_head *ws;
+ struct workspace *workspace;
+ int i = level - 1;
+
+ spin_lock(&wsm.lock);
+ for_each_set_bit_from(i, &wsm.active_map, ZSTD_BTRFS_MAX_LEVEL) {
+ if (!list_empty(&wsm.idle_ws[i])) {
+ ws = wsm.idle_ws[i].next;
+ workspace = list_to_workspace(ws);
+ list_del_init(ws);
+ /* keep its place if it's a lower level using this */
+ workspace->req_level = level;
+ if (level == workspace->level)
+ list_del(&workspace->lru_list);
+ if (list_empty(&wsm.idle_ws[i]))
+ clear_bit(i, &wsm.active_map);
+ spin_unlock(&wsm.lock);
+ return ws;
+ }
+ }
+ spin_unlock(&wsm.lock);
+
+ return NULL;
+}
+
+/*
+ * zstd_get_workspace - zstd's get_workspace
+ * @level: compression level
+ *
+ * If @level is 0, then any compression level can be used. Therefore, we begin
+ * scanning from 1. We first scan through possible workspaces and then after
+ * attempt to allocate a new workspace. If we fail to allocate one due to
+ * memory pressure, go to sleep waiting for the max level workspace to free up.
+ */
+static struct list_head *zstd_get_workspace(unsigned int level)
+{
+ struct list_head *ws;
+ unsigned int nofs_flag;
+
+ /* level == 0 means we can use any workspace */
+ if (!level)
+ level = 1;
+
+again:
+ ws = zstd_find_workspace(level);
+ if (ws)
+ return ws;
+
+ nofs_flag = memalloc_nofs_save();
+ ws = wsm.ops->alloc_workspace(level);
+ memalloc_nofs_restore(nofs_flag);
+
+ if (IS_ERR(ws)) {
+ DEFINE_WAIT(wait);
+
+ prepare_to_wait(&wsm.wait, &wait, TASK_UNINTERRUPTIBLE);
+ schedule();
+ finish_wait(&wsm.wait, &wait);
+
+ goto again;
+ }
+
+ return ws;
+}
+
+/*
+ * zstd_put_workspace - zstd put_workspace
+ * @ws: list_head for the workspace
+ *
+ * When putting back a workspace, we only need to update the LRU if we are of
+ * the requested compression level. Here is where we continue to protect the
+ * max level workspace or update last_used accordingly. If the reclaim timer
+ * isn't set, it is also set here. Only the max level workspace tries and wakes
+ * up waiting workspaces.
+ */
+static void zstd_put_workspace(struct list_head *ws)
+{
+ struct workspace *workspace = list_to_workspace(ws);
+
+ spin_lock(&wsm.lock);
+
+ /* A node is only taken off the lru if we are the corresponding level */
+ if (workspace->req_level == workspace->level) {
+ /* Hide a max level workspace from reclaim */
+ if (list_empty(&wsm.idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1])) {
+ INIT_LIST_HEAD(&workspace->lru_list);
+ } else {
+ workspace->last_used = jiffies;
+ list_add(&workspace->lru_list, &wsm.lru_list);
+ if (!timer_pending(&wsm.timer))
+ mod_timer(&wsm.timer,
+ jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES);
+ }
+ }
+
+ set_bit(workspace->level - 1, &wsm.active_map);
+ list_add(&workspace->list, &wsm.idle_ws[workspace->level - 1]);
+ workspace->req_level = 0;
+
+ spin_unlock(&wsm.lock);
+
+ if (workspace->level == ZSTD_BTRFS_MAX_LEVEL)
+ cond_wake_up(&wsm.wait);
+}
+
static void zstd_free_workspace(struct list_head *ws)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
@@ -50,25 +335,25 @@ static void zstd_free_workspace(struct list_head *ws)
kfree(workspace);
}
-static struct list_head *zstd_alloc_workspace(void)
+static struct list_head *zstd_alloc_workspace(unsigned int level)
{
- ZSTD_parameters params =
- zstd_get_btrfs_parameters(ZSTD_BTRFS_MAX_INPUT);
struct workspace *workspace;
workspace = kzalloc(sizeof(*workspace), GFP_KERNEL);
if (!workspace)
return ERR_PTR(-ENOMEM);
- workspace->size = max_t(size_t,
- ZSTD_CStreamWorkspaceBound(params.cParams),
- ZSTD_DStreamWorkspaceBound(ZSTD_BTRFS_MAX_INPUT));
+ workspace->size = zstd_ws_mem_sizes[level - 1];
+ workspace->level = level;
+ workspace->req_level = level;
+ workspace->last_used = jiffies;
workspace->mem = kvmalloc(workspace->size, GFP_KERNEL);
workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
if (!workspace->mem || !workspace->buf)
goto fail;
INIT_LIST_HEAD(&workspace->list);
+ INIT_LIST_HEAD(&workspace->lru_list);
return &workspace->list;
fail:
@@ -95,7 +380,8 @@ static int zstd_compress_pages(struct list_head *ws,
unsigned long len = *total_out;
const unsigned long nr_dest_pages = *out_pages;
unsigned long max_out = nr_dest_pages * PAGE_SIZE;
- ZSTD_parameters params = zstd_get_btrfs_parameters(len);
+ ZSTD_parameters params = zstd_get_btrfs_parameters(workspace->req_level,
+ len);
*out_pages = 0;
*total_out = 0;
@@ -419,11 +705,19 @@ finish:
return ret;
}
-static void zstd_set_level(struct list_head *ws, unsigned int type)
+static unsigned int zstd_set_level(unsigned int level)
{
+ if (!level)
+ return ZSTD_BTRFS_DEFAULT_LEVEL;
+
+ return min_t(unsigned int, level, ZSTD_BTRFS_MAX_LEVEL);
}
const struct btrfs_compress_op btrfs_zstd_compress = {
+ .init_workspace_manager = zstd_init_workspace_manager,
+ .cleanup_workspace_manager = zstd_cleanup_workspace_manager,
+ .get_workspace = zstd_get_workspace,
+ .put_workspace = zstd_put_workspace,
.alloc_workspace = zstd_alloc_workspace,
.free_workspace = zstd_free_workspace,
.compress_pages = zstd_compress_pages,
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 041c27ea8de1..f74193da0e09 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -616,7 +616,8 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
capsnap->size);
spin_lock(&mdsc->snap_flush_lock);
- list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list);
+ if (list_empty(&ci->i_snap_flush_item))
+ list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list);
spin_unlock(&mdsc->snap_flush_lock);
return 1; /* caller may want to ceph_flush_snaps */
}
diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c
index 1e11a683f63d..322ce9686bdb 100644
--- a/fs/crypto/keyinfo.c
+++ b/fs/crypto/keyinfo.c
@@ -47,7 +47,7 @@ static int derive_key_aes(const u8 *master_key,
tfm = NULL;
goto out;
}
- crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY);
+ crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS);
req = skcipher_request_alloc(tfm, GFP_NOFS);
if (!req) {
res = -ENOMEM;
@@ -257,7 +257,7 @@ allocate_skcipher_for_mode(struct fscrypt_mode *mode, const u8 *raw_key,
mode->friendly_name,
crypto_skcipher_alg(tfm)->base.cra_driver_name);
}
- crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY);
+ crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS);
err = crypto_skcipher_setkey(tfm, raw_key, mode->keysize);
if (err)
goto err_free_tfm;
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 29c68c5d44d5..95b5e78c22b1 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -423,8 +423,8 @@ EXPORT_SYMBOL_GPL(debugfs_create_file);
* debugfs core.
*
* It is your responsibility to protect your struct file_operation
- * methods against file removals by means of debugfs_use_file_start()
- * and debugfs_use_file_finish(). ->open() is still protected by
+ * methods against file removals by means of debugfs_file_get()
+ * and debugfs_file_put(). ->open() is still protected by
* debugfs though.
*
* Any struct file_operations defined by means of
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 76976d6e50f9..c98ad9777ad9 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1089,12 +1089,12 @@ static void sctp_connect_to_sock(struct connection *con)
* since O_NONBLOCK argument in connect() function does not work here,
* then, we should restore the default value of this attribute.
*/
- kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, (char *)&tv,
+ kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO_OLD, (char *)&tv,
sizeof(tv));
result = sock->ops->connect(sock, (struct sockaddr *)&daddr, addr_len,
0);
memset(&tv, 0, sizeof(tv));
- kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, (char *)&tv,
+ kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO_OLD, (char *)&tv,
sizeof(tv));
if (result == -EINPROGRESS)
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 4dd842f72846..f664da55234e 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -610,7 +610,8 @@ int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat)
full_alg_name);
goto out_free;
}
- crypto_skcipher_set_flags(crypt_stat->tfm, CRYPTO_TFM_REQ_WEAK_KEY);
+ crypto_skcipher_set_flags(crypt_stat->tfm,
+ CRYPTO_TFM_REQ_FORBID_WEAK_KEYS);
rc = 0;
out_free:
kfree(full_alg_name);
@@ -1590,7 +1591,7 @@ ecryptfs_process_key_cipher(struct crypto_skcipher **key_tfm,
"[%s]; rc = [%d]\n", full_alg_name, rc);
goto out;
}
- crypto_skcipher_set_flags(*key_tfm, CRYPTO_TFM_REQ_WEAK_KEY);
+ crypto_skcipher_set_flags(*key_tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS);
if (*key_size == 0)
*key_size = crypto_skcipher_default_keysize(*key_tfm);
get_random_bytes(dummy_key, *key_size);
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index a5d219d920e7..4a0e98d87fcc 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -50,10 +50,10 @@
*
* 1) epmutex (mutex)
* 2) ep->mtx (mutex)
- * 3) ep->wq.lock (spinlock)
+ * 3) ep->lock (rwlock)
*
* The acquire order is the one listed above, from 1 to 3.
- * We need a spinlock (ep->wq.lock) because we manipulate objects
+ * We need a rwlock (ep->lock) because we manipulate objects
* from inside the poll callback, that might be triggered from
* a wake_up() that in turn might be called from IRQ context.
* So we can't sleep inside the poll callback and hence we need
@@ -85,7 +85,7 @@
* of epoll file descriptors, we use the current recursion depth as
* the lockdep subkey.
* It is possible to drop the "ep->mtx" and to use the global
- * mutex "epmutex" (together with "ep->wq.lock") to have it working,
+ * mutex "epmutex" (together with "ep->lock") to have it working,
* but having "ep->mtx" will make the interface more scalable.
* Events that require holding "epmutex" are very rare, while for
* normal operations the epoll private "ep->mtx" will guarantee
@@ -182,8 +182,6 @@ struct epitem {
* This structure is stored inside the "private_data" member of the file
* structure and represents the main data structure for the eventpoll
* interface.
- *
- * Access to it is protected by the lock inside wq.
*/
struct eventpoll {
/*
@@ -203,13 +201,16 @@ struct eventpoll {
/* List of ready file descriptors */
struct list_head rdllist;
+ /* Lock which protects rdllist and ovflist */
+ rwlock_t lock;
+
/* RB tree root used to store monitored fd structs */
struct rb_root_cached rbr;
/*
* This is a single linked list that chains all the "struct epitem" that
* happened while transferring ready events to userspace w/out
- * holding ->wq.lock.
+ * holding ->lock.
*/
struct epitem *ovflist;
@@ -697,17 +698,17 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
* because we want the "sproc" callback to be able to do it
* in a lockless way.
*/
- spin_lock_irq(&ep->wq.lock);
+ write_lock_irq(&ep->lock);
list_splice_init(&ep->rdllist, &txlist);
WRITE_ONCE(ep->ovflist, NULL);
- spin_unlock_irq(&ep->wq.lock);
+ write_unlock_irq(&ep->lock);
/*
* Now call the callback function.
*/
res = (*sproc)(ep, &txlist, priv);
- spin_lock_irq(&ep->wq.lock);
+ write_lock_irq(&ep->lock);
/*
* During the time we spent inside the "sproc" callback, some
* other events might have been queued by the poll callback.
@@ -722,7 +723,11 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
* contain them, and the list_splice() below takes care of them.
*/
if (!ep_is_linked(epi)) {
- list_add_tail(&epi->rdllink, &ep->rdllist);
+ /*
+ * ->ovflist is LIFO, so we have to reverse it in order
+ * to keep in FIFO.
+ */
+ list_add(&epi->rdllink, &ep->rdllist);
ep_pm_stay_awake(epi);
}
}
@@ -745,11 +750,11 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
* the ->poll() wait list (delayed after we release the lock).
*/
if (waitqueue_active(&ep->wq))
- wake_up_locked(&ep->wq);
+ wake_up(&ep->wq);
if (waitqueue_active(&ep->poll_wait))
pwake++;
}
- spin_unlock_irq(&ep->wq.lock);
+ write_unlock_irq(&ep->lock);
if (!ep_locked)
mutex_unlock(&ep->mtx);
@@ -789,10 +794,10 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
rb_erase_cached(&epi->rbn, &ep->rbr);
- spin_lock_irq(&ep->wq.lock);
+ write_lock_irq(&ep->lock);
if (ep_is_linked(epi))
list_del_init(&epi->rdllink);
- spin_unlock_irq(&ep->wq.lock);
+ write_unlock_irq(&ep->lock);
wakeup_source_unregister(ep_wakeup_source(epi));
/*
@@ -842,7 +847,7 @@ static void ep_free(struct eventpoll *ep)
* Walks through the whole tree by freeing each "struct epitem". At this
* point we are sure no poll callbacks will be lingering around, and also by
* holding "epmutex" we can be sure that no file cleanup code will hit
- * us during this operation. So we can avoid the lock on "ep->wq.lock".
+ * us during this operation. So we can avoid the lock on "ep->lock".
* We do not need to lock ep->mtx, either, we only do it to prevent
* a lockdep warning.
*/
@@ -1023,6 +1028,7 @@ static int ep_alloc(struct eventpoll **pep)
goto free_uid;
mutex_init(&ep->mtx);
+ rwlock_init(&ep->lock);
init_waitqueue_head(&ep->wq);
init_waitqueue_head(&ep->poll_wait);
INIT_LIST_HEAD(&ep->rdllist);
@@ -1112,21 +1118,107 @@ struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd,
}
#endif /* CONFIG_CHECKPOINT_RESTORE */
+/**
+ * Adds a new entry to the tail of the list in a lockless way, i.e.
+ * multiple CPUs are allowed to call this function concurrently.
+ *
+ * Beware: it is necessary to prevent any other modifications of the
+ * existing list until all changes are completed, in other words
+ * concurrent list_add_tail_lockless() calls should be protected
+ * with a read lock, where write lock acts as a barrier which
+ * makes sure all list_add_tail_lockless() calls are fully
+ * completed.
+ *
+ * Also an element can be locklessly added to the list only in one
+ * direction i.e. either to the tail either to the head, otherwise
+ * concurrent access will corrupt the list.
+ *
+ * Returns %false if element has been already added to the list, %true
+ * otherwise.
+ */
+static inline bool list_add_tail_lockless(struct list_head *new,
+ struct list_head *head)
+{
+ struct list_head *prev;
+
+ /*
+ * This is simple 'new->next = head' operation, but cmpxchg()
+ * is used in order to detect that same element has been just
+ * added to the list from another CPU: the winner observes
+ * new->next == new.
+ */
+ if (cmpxchg(&new->next, new, head) != new)
+ return false;
+
+ /*
+ * Initially ->next of a new element must be updated with the head
+ * (we are inserting to the tail) and only then pointers are atomically
+ * exchanged. XCHG guarantees memory ordering, thus ->next should be
+ * updated before pointers are actually swapped and pointers are
+ * swapped before prev->next is updated.
+ */
+
+ prev = xchg(&head->prev, new);
+
+ /*
+ * It is safe to modify prev->next and new->prev, because a new element
+ * is added only to the tail and new->next is updated before XCHG.
+ */
+
+ prev->next = new;
+ new->prev = prev;
+
+ return true;
+}
+
+/**
+ * Chains a new epi entry to the tail of the ep->ovflist in a lockless way,
+ * i.e. multiple CPUs are allowed to call this function concurrently.
+ *
+ * Returns %false if epi element has been already chained, %true otherwise.
+ */
+static inline bool chain_epi_lockless(struct epitem *epi)
+{
+ struct eventpoll *ep = epi->ep;
+
+ /* Check that the same epi has not been just chained from another CPU */
+ if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR)
+ return false;
+
+ /* Atomically exchange tail */
+ epi->next = xchg(&ep->ovflist, epi);
+
+ return true;
+}
+
/*
* This is the callback that is passed to the wait queue wakeup
* mechanism. It is called by the stored file descriptors when they
* have events to report.
+ *
+ * This callback takes a read lock in order not to content with concurrent
+ * events from another file descriptors, thus all modifications to ->rdllist
+ * or ->ovflist are lockless. Read lock is paired with the write lock from
+ * ep_scan_ready_list(), which stops all list modifications and guarantees
+ * that lists state is seen correctly.
+ *
+ * Another thing worth to mention is that ep_poll_callback() can be called
+ * concurrently for the same @epi from different CPUs if poll table was inited
+ * with several wait queues entries. Plural wakeup from different CPUs of a
+ * single wait queue is serialized by wq.lock, but the case when multiple wait
+ * queues are used should be detected accordingly. This is detected using
+ * cmpxchg() operation.
*/
static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
int pwake = 0;
- unsigned long flags;
struct epitem *epi = ep_item_from_wait(wait);
struct eventpoll *ep = epi->ep;
__poll_t pollflags = key_to_poll(key);
+ unsigned long flags;
int ewake = 0;
- spin_lock_irqsave(&ep->wq.lock, flags);
+ read_lock_irqsave(&ep->lock, flags);
ep_set_busy_poll_napi_id(epi);
@@ -1155,24 +1247,15 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
* chained in ep->ovflist and requeued later on.
*/
if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) {
- if (epi->next == EP_UNACTIVE_PTR) {
- epi->next = READ_ONCE(ep->ovflist);
- WRITE_ONCE(ep->ovflist, epi);
- if (epi->ws) {
- /*
- * Activate ep->ws since epi->ws may get
- * deactivated at any time.
- */
- __pm_stay_awake(ep->ws);
- }
-
- }
+ if (epi->next == EP_UNACTIVE_PTR &&
+ chain_epi_lockless(epi))
+ ep_pm_stay_awake_rcu(epi);
goto out_unlock;
}
/* If this file is already in the ready list we exit soon */
- if (!ep_is_linked(epi)) {
- list_add_tail(&epi->rdllink, &ep->rdllist);
+ if (!ep_is_linked(epi) &&
+ list_add_tail_lockless(&epi->rdllink, &ep->rdllist)) {
ep_pm_stay_awake_rcu(epi);
}
@@ -1197,13 +1280,13 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
break;
}
}
- wake_up_locked(&ep->wq);
+ wake_up(&ep->wq);
}
if (waitqueue_active(&ep->poll_wait))
pwake++;
out_unlock:
- spin_unlock_irqrestore(&ep->wq.lock, flags);
+ read_unlock_irqrestore(&ep->lock, flags);
/* We have to call this outside the lock */
if (pwake)
@@ -1488,7 +1571,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
goto error_remove_epi;
/* We have to drop the new item inside our item list to keep track of it */
- spin_lock_irq(&ep->wq.lock);
+ write_lock_irq(&ep->lock);
/* record NAPI ID of new item if present */
ep_set_busy_poll_napi_id(epi);
@@ -1500,12 +1583,12 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
/* Notify waiting tasks that events are available */
if (waitqueue_active(&ep->wq))
- wake_up_locked(&ep->wq);
+ wake_up(&ep->wq);
if (waitqueue_active(&ep->poll_wait))
pwake++;
}
- spin_unlock_irq(&ep->wq.lock);
+ write_unlock_irq(&ep->lock);
atomic_long_inc(&ep->user->epoll_watches);
@@ -1531,10 +1614,10 @@ error_unregister:
* list, since that is used/cleaned only inside a section bound by "mtx".
* And ep_insert() is called with "mtx" held.
*/
- spin_lock_irq(&ep->wq.lock);
+ write_lock_irq(&ep->lock);
if (ep_is_linked(epi))
list_del_init(&epi->rdllink);
- spin_unlock_irq(&ep->wq.lock);
+ write_unlock_irq(&ep->lock);
wakeup_source_unregister(ep_wakeup_source(epi));
@@ -1578,9 +1661,9 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
* 1) Flush epi changes above to other CPUs. This ensures
* we do not miss events from ep_poll_callback if an
* event occurs immediately after we call f_op->poll().
- * We need this because we did not take ep->wq.lock while
+ * We need this because we did not take ep->lock while
* changing epi above (but ep_poll_callback does take
- * ep->wq.lock).
+ * ep->lock).
*
* 2) We also need to ensure we do not miss _past_ events
* when calling f_op->poll(). This barrier also
@@ -1599,18 +1682,18 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
* list, push it inside.
*/
if (ep_item_poll(epi, &pt, 1)) {
- spin_lock_irq(&ep->wq.lock);
+ write_lock_irq(&ep->lock);
if (!ep_is_linked(epi)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
ep_pm_stay_awake(epi);
/* Notify waiting tasks that events are available */
if (waitqueue_active(&ep->wq))
- wake_up_locked(&ep->wq);
+ wake_up(&ep->wq);
if (waitqueue_active(&ep->poll_wait))
pwake++;
}
- spin_unlock_irq(&ep->wq.lock);
+ write_unlock_irq(&ep->lock);
}
/* We have to call this outside the lock */
@@ -1771,9 +1854,9 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
*/
timed_out = 1;
- spin_lock_irq(&ep->wq.lock);
+ write_lock_irq(&ep->lock);
eavail = ep_events_available(ep);
- spin_unlock_irq(&ep->wq.lock);
+ write_unlock_irq(&ep->lock);
goto send_events;
}
diff --git a/fs/exec.c b/fs/exec.c
index fb72d36f7823..2e0033348d8e 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -932,7 +932,7 @@ int kernel_read_file(struct file *file, void **buf, loff_t *size,
bytes = kernel_read(file, *buf + pos, i_size - pos, &pos);
if (bytes < 0) {
ret = bytes;
- goto out;
+ goto out_free;
}
if (bytes == 0)
@@ -1189,7 +1189,7 @@ no_thread_group:
flush_itimer_signals();
#endif
- if (atomic_read(&oldsighand->count) != 1) {
+ if (refcount_read(&oldsighand->count) != 1) {
struct sighand_struct *newsighand;
/*
* This ->sighand is shared with the CLONE_SIGHAND
@@ -1199,7 +1199,7 @@ no_thread_group:
if (!newsighand)
return -ENOMEM;
- atomic_set(&newsighand->count, 1);
+ refcount_set(&newsighand->count, 1);
memcpy(newsighand->action, oldsighand->action,
sizeof(newsighand->action));
@@ -1563,7 +1563,7 @@ static void bprm_fill_uid(struct linux_binprm *bprm)
/*
* Fill the binprm structure from the inode.
- * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes
+ * Check permissions, then read the first BINPRM_BUF_SIZE bytes
*
* This may be called multiple times for binary chains (scripts for example).
*/
@@ -1944,15 +1944,10 @@ EXPORT_SYMBOL(set_binfmt);
*/
void set_dumpable(struct mm_struct *mm, int value)
{
- unsigned long old, new;
-
if (WARN_ON((unsigned)value > SUID_DUMP_ROOT))
return;
- do {
- old = READ_ONCE(mm->flags);
- new = (old & ~MMF_DUMPABLE_MASK) | value;
- } while (cmpxchg(&mm->flags, old, new) != old);
+ set_mask_bits(&mm->flags, MMF_DUMPABLE_MASK, value);
}
SYSCALL_DEFINE3(execve,
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 3b8114def693..13318e255ebf 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -252,33 +252,10 @@ ext2_validate_entry(char *base, unsigned offset, unsigned mask)
return (char *)p - base;
}
-static unsigned char ext2_filetype_table[EXT2_FT_MAX] = {
- [EXT2_FT_UNKNOWN] = DT_UNKNOWN,
- [EXT2_FT_REG_FILE] = DT_REG,
- [EXT2_FT_DIR] = DT_DIR,
- [EXT2_FT_CHRDEV] = DT_CHR,
- [EXT2_FT_BLKDEV] = DT_BLK,
- [EXT2_FT_FIFO] = DT_FIFO,
- [EXT2_FT_SOCK] = DT_SOCK,
- [EXT2_FT_SYMLINK] = DT_LNK,
-};
-
-#define S_SHIFT 12
-static unsigned char ext2_type_by_mode[S_IFMT >> S_SHIFT] = {
- [S_IFREG >> S_SHIFT] = EXT2_FT_REG_FILE,
- [S_IFDIR >> S_SHIFT] = EXT2_FT_DIR,
- [S_IFCHR >> S_SHIFT] = EXT2_FT_CHRDEV,
- [S_IFBLK >> S_SHIFT] = EXT2_FT_BLKDEV,
- [S_IFIFO >> S_SHIFT] = EXT2_FT_FIFO,
- [S_IFSOCK >> S_SHIFT] = EXT2_FT_SOCK,
- [S_IFLNK >> S_SHIFT] = EXT2_FT_SYMLINK,
-};
-
static inline void ext2_set_de_type(ext2_dirent *de, struct inode *inode)
{
- umode_t mode = inode->i_mode;
if (EXT2_HAS_INCOMPAT_FEATURE(inode->i_sb, EXT2_FEATURE_INCOMPAT_FILETYPE))
- de->file_type = ext2_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
+ de->file_type = fs_umode_to_ftype(inode->i_mode);
else
de->file_type = 0;
}
@@ -293,14 +270,14 @@ ext2_readdir(struct file *file, struct dir_context *ctx)
unsigned long n = pos >> PAGE_SHIFT;
unsigned long npages = dir_pages(inode);
unsigned chunk_mask = ~(ext2_chunk_size(inode)-1);
- unsigned char *types = NULL;
bool need_revalidate = !inode_eq_iversion(inode, file->f_version);
+ bool has_filetype;
if (pos > inode->i_size - EXT2_DIR_REC_LEN(1))
return 0;
- if (EXT2_HAS_INCOMPAT_FEATURE(sb, EXT2_FEATURE_INCOMPAT_FILETYPE))
- types = ext2_filetype_table;
+ has_filetype =
+ EXT2_HAS_INCOMPAT_FEATURE(sb, EXT2_FEATURE_INCOMPAT_FILETYPE);
for ( ; n < npages; n++, offset = 0) {
char *kaddr, *limit;
@@ -335,8 +312,8 @@ ext2_readdir(struct file *file, struct dir_context *ctx)
if (de->inode) {
unsigned char d_type = DT_UNKNOWN;
- if (types && de->file_type < EXT2_FT_MAX)
- d_type = types[de->file_type];
+ if (has_filetype)
+ d_type = fs_ftype_to_dtype(de->file_type);
if (!dir_emit(ctx, de->name, de->name_len,
le32_to_cpu(de->inode),
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index e770cd100a6a..10ab238de9a6 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -604,22 +604,6 @@ struct ext2_dir_entry_2 {
};
/*
- * Ext2 directory file types. Only the low 3 bits are used. The
- * other bits are reserved for now.
- */
-enum {
- EXT2_FT_UNKNOWN = 0,
- EXT2_FT_REG_FILE = 1,
- EXT2_FT_DIR = 2,
- EXT2_FT_CHRDEV = 3,
- EXT2_FT_BLKDEV = 4,
- EXT2_FT_FIFO = 5,
- EXT2_FT_SOCK = 6,
- EXT2_FT_SYMLINK = 7,
- EXT2_FT_MAX
-};
-
-/*
* EXT2_DIR_PAD defines the directory entries boundaries
*
* NOTE: It must be a multiple of 4
@@ -774,6 +758,7 @@ extern int ext2_write_inode (struct inode *, struct writeback_control *);
extern void ext2_evict_inode(struct inode *);
extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int);
extern int ext2_setattr (struct dentry *, struct iattr *);
+extern int ext2_getattr (const struct path *, struct kstat *, u32, unsigned int);
extern void ext2_set_inode_flags(struct inode *inode);
extern int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len);
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 28b2609f25c1..39c4772e96c9 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -199,6 +199,7 @@ const struct inode_operations ext2_file_inode_operations = {
#ifdef CONFIG_EXT2_FS_XATTR
.listxattr = ext2_listxattr,
#endif
+ .getattr = ext2_getattr,
.setattr = ext2_setattr,
.get_acl = ext2_get_acl,
.set_acl = ext2_set_acl,
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index 5c3d7b7e4975..a0c5ea91fcd4 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -222,8 +222,6 @@ static int find_group_dir(struct super_block *sb, struct inode *parent)
best_desc = desc;
}
}
- if (!best_desc)
- return -1;
return best_group;
}
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index e4bb9386c045..c27c27300d95 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -717,7 +717,7 @@ static int ext2_get_blocks(struct inode *inode,
/* the number of blocks need to allocate for [d,t]indirect blocks */
indirect_blks = (chain + depth) - partial - 1;
/*
- * Next look up the indirect map to count the totoal number of
+ * Next look up the indirect map to count the total number of
* direct blocks to allocate for this branch.
*/
count = ext2_blks_to_allocate(partial, indirect_blks,
@@ -1239,6 +1239,7 @@ do_indirects:
mark_inode_dirty(inode);
ext2_free_branches(inode, &nr, &nr+1, 1);
}
+ /* fall through */
case EXT2_IND_BLOCK:
nr = i_data[EXT2_DIND_BLOCK];
if (nr) {
@@ -1246,6 +1247,7 @@ do_indirects:
mark_inode_dirty(inode);
ext2_free_branches(inode, &nr, &nr+1, 2);
}
+ /* fall through */
case EXT2_DIND_BLOCK:
nr = i_data[EXT2_TIND_BLOCK];
if (nr) {
@@ -1635,6 +1637,32 @@ int ext2_write_inode(struct inode *inode, struct writeback_control *wbc)
return __ext2_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
}
+int ext2_getattr(const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned int query_falgs)
+{
+ struct inode *inode = d_inode(path->dentry);
+ struct ext2_inode_info *ei = EXT2_I(inode);
+ unsigned int flags;
+
+ flags = ei->i_flags & EXT2_FL_USER_VISIBLE;
+ if (flags & EXT2_APPEND_FL)
+ stat->attributes |= STATX_ATTR_APPEND;
+ if (flags & EXT2_COMPR_FL)
+ stat->attributes |= STATX_ATTR_COMPRESSED;
+ if (flags & EXT2_IMMUTABLE_FL)
+ stat->attributes |= STATX_ATTR_IMMUTABLE;
+ if (flags & EXT2_NODUMP_FL)
+ stat->attributes |= STATX_ATTR_NODUMP;
+ stat->attributes_mask |= (STATX_ATTR_APPEND |
+ STATX_ATTR_COMPRESSED |
+ STATX_ATTR_ENCRYPTED |
+ STATX_ATTR_IMMUTABLE |
+ STATX_ATTR_NODUMP);
+
+ generic_fillattr(inode, stat);
+ return 0;
+}
+
int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
{
struct inode *inode = d_inode(dentry);
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 0c26dcc5d850..ccfbbf59e2fc 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -416,6 +416,7 @@ const struct inode_operations ext2_dir_inode_operations = {
#ifdef CONFIG_EXT2_FS_XATTR
.listxattr = ext2_listxattr,
#endif
+ .getattr = ext2_getattr,
.setattr = ext2_setattr,
.get_acl = ext2_get_acl,
.set_acl = ext2_set_acl,
@@ -426,6 +427,7 @@ const struct inode_operations ext2_special_inode_operations = {
#ifdef CONFIG_EXT2_FS_XATTR
.listxattr = ext2_listxattr,
#endif
+ .getattr = ext2_getattr,
.setattr = ext2_setattr,
.get_acl = ext2_get_acl,
.set_acl = ext2_set_acl,
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 73b2d528237f..0128010a0874 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -757,7 +757,8 @@ static loff_t ext2_max_size(int bits)
{
loff_t res = EXT2_NDIR_BLOCKS;
int meta_blocks;
- loff_t upper_limit;
+ unsigned int upper_limit;
+ unsigned int ppb = 1 << (bits-2);
/* This is calculated to be the largest file size for a
* dense, file such that the total number of
@@ -771,24 +772,34 @@ static loff_t ext2_max_size(int bits)
/* total blocks in file system block size */
upper_limit >>= (bits - 9);
+ /* Compute how many blocks we can address by block tree */
+ res += 1LL << (bits-2);
+ res += 1LL << (2*(bits-2));
+ res += 1LL << (3*(bits-2));
+ /* Does block tree limit file size? */
+ if (res < upper_limit)
+ goto check_lfs;
+ res = upper_limit;
+ /* How many metadata blocks are needed for addressing upper_limit? */
+ upper_limit -= EXT2_NDIR_BLOCKS;
/* indirect blocks */
meta_blocks = 1;
+ upper_limit -= ppb;
/* double indirect blocks */
- meta_blocks += 1 + (1LL << (bits-2));
- /* tripple indirect blocks */
- meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2)));
-
- upper_limit -= meta_blocks;
- upper_limit <<= bits;
-
- res += 1LL << (bits-2);
- res += 1LL << (2*(bits-2));
- res += 1LL << (3*(bits-2));
+ if (upper_limit < ppb * ppb) {
+ meta_blocks += 1 + DIV_ROUND_UP(upper_limit, ppb);
+ res -= meta_blocks;
+ goto check_lfs;
+ }
+ meta_blocks += 1 + ppb;
+ upper_limit -= ppb * ppb;
+ /* tripple indirect blocks for the rest */
+ meta_blocks += 1 + DIV_ROUND_UP(upper_limit, ppb) +
+ DIV_ROUND_UP(upper_limit, ppb*ppb);
+ res -= meta_blocks;
+check_lfs:
res <<= bits;
- if (res > upper_limit)
- res = upper_limit;
-
if (res > MAX_LFS_FILESIZE)
res = MAX_LFS_FILESIZE;
@@ -1024,8 +1035,6 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_frags_per_group = le32_to_cpu(es->s_frags_per_group);
sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
- if (EXT2_INODE_SIZE(sb) == 0)
- goto cantfind_ext2;
sbi->s_inodes_per_block = sb->s_blocksize / EXT2_INODE_SIZE(sb);
if (sbi->s_inodes_per_block == 0 || sbi->s_inodes_per_group == 0)
goto cantfind_ext2;
@@ -1087,12 +1096,14 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
sizeof(struct buffer_head *),
GFP_KERNEL);
if (sbi->s_group_desc == NULL) {
+ ret = -ENOMEM;
ext2_msg(sb, KERN_ERR, "error: not enough memory");
goto failed_mount;
}
bgl_lock_init(sbi->s_blockgroup_lock);
sbi->s_debts = kcalloc(sbi->s_groups_count, sizeof(*sbi->s_debts), GFP_KERNEL);
if (!sbi->s_debts) {
+ ret = -ENOMEM;
ext2_msg(sb, KERN_ERR, "error: not enough memory");
goto failed_mount_group_desc;
}
@@ -1148,6 +1159,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
#ifdef CONFIG_EXT2_FS_XATTR
sbi->s_ea_block_cache = ext2_xattr_create_cache();
if (!sbi->s_ea_block_cache) {
+ ret = -ENOMEM;
ext2_msg(sb, KERN_ERR, "Failed to create ea_block_cache");
goto failed_mount3;
}
diff --git a/fs/ext2/symlink.c b/fs/ext2/symlink.c
index d5589ddcc281..00cdb8679486 100644
--- a/fs/ext2/symlink.c
+++ b/fs/ext2/symlink.c
@@ -23,6 +23,7 @@
const struct inode_operations ext2_symlink_inode_operations = {
.get_link = page_get_link,
+ .getattr = ext2_getattr,
.setattr = ext2_setattr,
#ifdef CONFIG_EXT2_FS_XATTR
.listxattr = ext2_listxattr,
@@ -31,6 +32,7 @@ const struct inode_operations ext2_symlink_inode_operations = {
const struct inode_operations ext2_fast_symlink_inode_operations = {
.get_link = simple_get_link,
+ .getattr = ext2_getattr,
.setattr = ext2_setattr,
#ifdef CONFIG_EXT2_FS_XATTR
.listxattr = ext2_listxattr,
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 4f30876ee325..1e33e0ac8cf1 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -342,6 +342,7 @@ static void ext2_xattr_update_super_block(struct super_block *sb)
return;
spin_lock(&EXT2_SB(sb)->s_lock);
+ ext2_update_dynamic_rev(sb);
EXT2_SET_COMPAT_FEATURE(sb, EXT2_FEATURE_COMPAT_EXT_ATTR);
spin_unlock(&EXT2_SB(sb)->s_lock);
mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 712f00995390..5508baa11bb6 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -116,16 +116,8 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
goto out;
}
- ret = file_write_and_wait_range(file, start, end);
- if (ret)
- return ret;
-
if (!journal) {
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_ALL
- };
-
- ret = ext4_write_inode(inode, &wbc);
+ ret = __generic_file_fsync(file, start, end, datasync);
if (!ret)
ret = ext4_sync_parent(inode);
if (test_opt(inode->i_sb, BARRIER))
@@ -133,6 +125,9 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
goto out;
}
+ ret = file_write_and_wait_range(file, start, end);
+ if (ret)
+ return ret;
/*
* data=writeback,ordered:
* The caller's filemap_fdatawrite()/wait will sync the data.
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index ebcc121920ba..fd7f170e2f2d 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -506,30 +506,16 @@ void f2fs_destroy_stats(struct f2fs_sb_info *sbi)
kvfree(si);
}
-int __init f2fs_create_root_stats(void)
+void __init f2fs_create_root_stats(void)
{
- struct dentry *file;
-
f2fs_debugfs_root = debugfs_create_dir("f2fs", NULL);
- if (!f2fs_debugfs_root)
- return -ENOMEM;
- file = debugfs_create_file("status", S_IRUGO, f2fs_debugfs_root,
- NULL, &stat_fops);
- if (!file) {
- debugfs_remove(f2fs_debugfs_root);
- f2fs_debugfs_root = NULL;
- return -ENOMEM;
- }
-
- return 0;
+ debugfs_create_file("status", S_IRUGO, f2fs_debugfs_root, NULL,
+ &stat_fops);
}
void f2fs_destroy_root_stats(void)
{
- if (!f2fs_debugfs_root)
- return;
-
debugfs_remove_recursive(f2fs_debugfs_root);
f2fs_debugfs_root = NULL;
}
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 12fabd6735dd..8f23ee6e8eb9 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -3328,7 +3328,7 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
int f2fs_build_stats(struct f2fs_sb_info *sbi);
void f2fs_destroy_stats(struct f2fs_sb_info *sbi);
-int __init f2fs_create_root_stats(void);
+void __init f2fs_create_root_stats(void);
void f2fs_destroy_root_stats(void);
#else
#define stat_inc_cp_count(si) do { } while (0)
@@ -3366,7 +3366,7 @@ void f2fs_destroy_root_stats(void);
static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; }
static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { }
-static inline int __init f2fs_create_root_stats(void) { return 0; }
+static inline void __init f2fs_create_root_stats(void) { }
static inline void f2fs_destroy_root_stats(void) { }
#endif
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index c46a1d4318d4..3d3ce9eb6d13 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -3545,9 +3545,7 @@ static int __init init_f2fs_fs(void)
err = register_filesystem(&f2fs_fs_type);
if (err)
goto free_shrinker;
- err = f2fs_create_root_stats();
- if (err)
- goto free_filesystem;
+ f2fs_create_root_stats();
err = f2fs_init_post_read_processing();
if (err)
goto free_root_stats;
@@ -3555,7 +3553,6 @@ static int __init init_f2fs_fs(void)
free_root_stats:
f2fs_destroy_root_stats();
-free_filesystem:
unregister_filesystem(&f2fs_fs_type);
free_shrinker:
unregister_shrinker(&f2fs_shrinker_info);
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 13935ee99e1e..b3bed32946b1 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -214,6 +214,7 @@ const struct file_operations fat_file_operations = {
#endif
.fsync = fat_file_fsync,
.splice_read = generic_file_splice_read,
+ .splice_write = iter_file_splice_write,
.fallocate = fat_fallocate,
};
diff --git a/fs/file.c b/fs/file.c
index 3209ee271c41..a10487aa0a84 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -457,6 +457,7 @@ struct files_struct init_files = {
.full_fds_bits = init_files.full_fds_bits_init,
},
.file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock),
+ .resize_wait = __WAIT_QUEUE_HEAD_INITIALIZER(init_files.resize_wait),
};
static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start)
diff --git a/fs/fs_types.c b/fs/fs_types.c
new file mode 100644
index 000000000000..78365e5dc08c
--- /dev/null
+++ b/fs/fs_types.c
@@ -0,0 +1,105 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/fs.h>
+#include <linux/export.h>
+
+/*
+ * fs on-disk file type to dirent file type conversion
+ */
+static const unsigned char fs_dtype_by_ftype[FT_MAX] = {
+ [FT_UNKNOWN] = DT_UNKNOWN,
+ [FT_REG_FILE] = DT_REG,
+ [FT_DIR] = DT_DIR,
+ [FT_CHRDEV] = DT_CHR,
+ [FT_BLKDEV] = DT_BLK,
+ [FT_FIFO] = DT_FIFO,
+ [FT_SOCK] = DT_SOCK,
+ [FT_SYMLINK] = DT_LNK
+};
+
+/**
+ * fs_ftype_to_dtype() - fs on-disk file type to dirent type.
+ * @filetype: The on-disk file type to convert.
+ *
+ * This function converts the on-disk file type value (FT_*) to the directory
+ * entry type (DT_*).
+ *
+ * Context: Any context.
+ * Return:
+ * * DT_UNKNOWN - Unknown type
+ * * DT_FIFO - FIFO
+ * * DT_CHR - Character device
+ * * DT_DIR - Directory
+ * * DT_BLK - Block device
+ * * DT_REG - Regular file
+ * * DT_LNK - Symbolic link
+ * * DT_SOCK - Local-domain socket
+ */
+unsigned char fs_ftype_to_dtype(unsigned int filetype)
+{
+ if (filetype >= FT_MAX)
+ return DT_UNKNOWN;
+
+ return fs_dtype_by_ftype[filetype];
+}
+EXPORT_SYMBOL_GPL(fs_ftype_to_dtype);
+
+/*
+ * dirent file type to fs on-disk file type conversion
+ * Values not initialized explicitly are FT_UNKNOWN (0).
+ */
+static const unsigned char fs_ftype_by_dtype[DT_MAX] = {
+ [DT_REG] = FT_REG_FILE,
+ [DT_DIR] = FT_DIR,
+ [DT_LNK] = FT_SYMLINK,
+ [DT_CHR] = FT_CHRDEV,
+ [DT_BLK] = FT_BLKDEV,
+ [DT_FIFO] = FT_FIFO,
+ [DT_SOCK] = FT_SOCK,
+};
+
+/**
+ * fs_umode_to_ftype() - file mode to on-disk file type.
+ * @mode: The file mode to convert.
+ *
+ * This function converts the file mode value to the on-disk file type (FT_*).
+ *
+ * Context: Any context.
+ * Return:
+ * * FT_UNKNOWN - Unknown type
+ * * FT_REG_FILE - Regular file
+ * * FT_DIR - Directory
+ * * FT_CHRDEV - Character device
+ * * FT_BLKDEV - Block device
+ * * FT_FIFO - FIFO
+ * * FT_SOCK - Local-domain socket
+ * * FT_SYMLINK - Symbolic link
+ */
+unsigned char fs_umode_to_ftype(umode_t mode)
+{
+ return fs_ftype_by_dtype[S_DT(mode)];
+}
+EXPORT_SYMBOL_GPL(fs_umode_to_ftype);
+
+/**
+ * fs_umode_to_dtype() - file mode to dirent file type.
+ * @mode: The file mode to convert.
+ *
+ * This function converts the file mode value to the directory
+ * entry type (DT_*).
+ *
+ * Context: Any context.
+ * Return:
+ * * DT_UNKNOWN - Unknown type
+ * * DT_FIFO - FIFO
+ * * DT_CHR - Character device
+ * * DT_DIR - Directory
+ * * DT_BLK - Block device
+ * * DT_REG - Regular file
+ * * DT_LNK - Symbolic link
+ * * DT_SOCK - Local-domain socket
+ */
+unsigned char fs_umode_to_dtype(umode_t mode)
+{
+ return fs_ftype_to_dtype(fs_umode_to_ftype(mode));
+}
+EXPORT_SYMBOL_GPL(fs_umode_to_dtype);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index f15b4c57c4bd..78510ab91835 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -28,7 +28,6 @@
#include "util.h"
#include "trans.h"
#include "dir.h"
-#include "lops.h"
struct workqueue_struct *gfs2_freeze_wq;
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 5bfaf381921a..b8830fda51e8 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -733,7 +733,7 @@ void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd,
lh->lh_crc = cpu_to_be32(crc);
gfs2_log_write(sdp, page, sb->s_blocksize, 0, addr);
- gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE | op_flags);
+ gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE, op_flags);
log_flush_wait(sdp);
}
@@ -810,7 +810,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags)
gfs2_ordered_write(sdp);
lops_before_commit(sdp, tr);
- gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE);
+ gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE, 0);
if (sdp->sd_log_head != sdp->sd_log_flush_head) {
log_flush_wait(sdp);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 15deefeaafd0..8722c60b11fe 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -17,9 +17,7 @@
#include <linux/bio.h>
#include <linux/fs.h>
#include <linux/list_sort.h>
-#include <linux/blkdev.h>
-#include "bmap.h"
#include "dir.h"
#include "gfs2.h"
#include "incore.h"
@@ -196,6 +194,7 @@ static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp,
/**
* gfs2_end_log_write - end of i/o to the log
* @bio: The bio
+ * @error: Status of i/o request
*
* Each bio_vec contains either data from the pagecache or data
* relating to the log itself. Here we iterate over the bio_vec
@@ -233,19 +232,20 @@ static void gfs2_end_log_write(struct bio *bio)
/**
* gfs2_log_submit_bio - Submit any pending log bio
* @biop: Address of the bio pointer
- * @opf: REQ_OP | op_flags
+ * @op: REQ_OP
+ * @op_flags: req_flag_bits
*
* Submit any pending part-built or full bio to the block device. If
* there is no pending bio, then this is a no-op.
*/
-void gfs2_log_submit_bio(struct bio **biop, int opf)
+void gfs2_log_submit_bio(struct bio **biop, int op, int op_flags)
{
struct bio *bio = *biop;
if (bio) {
struct gfs2_sbd *sdp = bio->bi_private;
atomic_inc(&sdp->sd_log_in_flight);
- bio->bi_opf = opf;
+ bio_set_op_attrs(bio, op, op_flags);
submit_bio(bio);
*biop = NULL;
}
@@ -306,7 +306,7 @@ static struct bio *gfs2_log_get_bio(struct gfs2_sbd *sdp, u64 blkno,
nblk >>= sdp->sd_fsb2bb_shift;
if (blkno == nblk && !flush)
return bio;
- gfs2_log_submit_bio(biop, op);
+ gfs2_log_submit_bio(biop, op, 0);
}
*biop = gfs2_log_alloc_bio(sdp, blkno, end_io);
@@ -377,185 +377,6 @@ void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page)
gfs2_log_bmap(sdp));
}
-/**
- * gfs2_end_log_read - end I/O callback for reads from the log
- * @bio: The bio
- *
- * Simply unlock the pages in the bio. The main thread will wait on them and
- * process them in order as necessary.
- */
-
-static void gfs2_end_log_read(struct bio *bio)
-{
- struct page *page;
- struct bio_vec *bvec;
- int i;
- struct bvec_iter_all iter_all;
-
- bio_for_each_segment_all(bvec, bio, i, iter_all) {
- page = bvec->bv_page;
- if (bio->bi_status) {
- int err = blk_status_to_errno(bio->bi_status);
-
- SetPageError(page);
- mapping_set_error(page->mapping, err);
- }
- unlock_page(page);
- }
-
- bio_put(bio);
-}
-
-/**
- * gfs2_jhead_pg_srch - Look for the journal head in a given page.
- * @jd: The journal descriptor
- * @page: The page to look in
- *
- * Returns: 1 if found, 0 otherwise.
- */
-
-static bool gfs2_jhead_pg_srch(struct gfs2_jdesc *jd,
- struct gfs2_log_header_host *head,
- struct page *page)
-{
- struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
- struct gfs2_log_header_host uninitialized_var(lh);
- void *kaddr = kmap_atomic(page);
- unsigned int offset;
- bool ret = false;
-
- for (offset = 0; offset < PAGE_SIZE; offset += sdp->sd_sb.sb_bsize) {
- if (!__get_log_header(sdp, kaddr + offset, 0, &lh)) {
- if (lh.lh_sequence > head->lh_sequence)
- *head = lh;
- else {
- ret = true;
- break;
- }
- }
- }
- kunmap_atomic(kaddr);
- return ret;
-}
-
-/**
- * gfs2_jhead_process_page - Search/cleanup a page
- * @jd: The journal descriptor
- * @index: Index of the page to look into
- * @done: If set, perform only cleanup, else search and set if found.
- *
- * Find the page with 'index' in the journal's mapping. Search the page for
- * the journal head if requested (cleanup == false). Release refs on the
- * page so the page cache can reclaim it (put_page() twice). We grabbed a
- * reference on this page two times, first when we did a find_or_create_page()
- * to obtain the page to add it to the bio and second when we do a
- * find_get_page() here to get the page to wait on while I/O on it is being
- * completed.
- * This function is also used to free up a page we might've grabbed but not
- * used. Maybe we added it to a bio, but not submitted it for I/O. Or we
- * submitted the I/O, but we already found the jhead so we only need to drop
- * our references to the page.
- */
-
-static void gfs2_jhead_process_page(struct gfs2_jdesc *jd, unsigned long index,
- struct gfs2_log_header_host *head,
- bool *done)
-{
- struct page *page;
-
- page = find_get_page(jd->jd_inode->i_mapping, index);
- wait_on_page_locked(page);
-
- if (PageError(page))
- *done = true;
-
- if (!*done)
- *done = gfs2_jhead_pg_srch(jd, head, page);
-
- put_page(page); /* Once for find_get_page */
- put_page(page); /* Once more for find_or_create_page */
-}
-
-/**
- * gfs2_find_jhead - find the head of a log
- * @jd: The journal descriptor
- * @head: The log descriptor for the head of the log is returned here
- *
- * Do a search of a journal by reading it in large chunks using bios and find
- * the valid log entry with the highest sequence number. (i.e. the log head)
- *
- * Returns: 0 on success, errno otherwise
- */
-
-int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head)
-{
- struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
- struct address_space *mapping = jd->jd_inode->i_mapping;
- struct gfs2_journal_extent *je;
- u32 block, read_idx = 0, submit_idx = 0, index = 0;
- int shift = PAGE_SHIFT - sdp->sd_sb.sb_bsize_shift;
- int blocks_per_page = 1 << shift, sz, ret = 0;
- struct bio *bio = NULL;
- struct page *page;
- bool done = false;
- errseq_t since;
-
- memset(head, 0, sizeof(*head));
- if (list_empty(&jd->extent_list))
- gfs2_map_journal_extents(sdp, jd);
-
- since = filemap_sample_wb_err(mapping);
- list_for_each_entry(je, &jd->extent_list, list) {
- for (block = 0; block < je->blocks; block += blocks_per_page) {
- index = (je->lblock + block) >> shift;
-
- page = find_or_create_page(mapping, index, GFP_NOFS);
- if (!page) {
- ret = -ENOMEM;
- done = true;
- goto out;
- }
-
- if (bio) {
- sz = bio_add_page(bio, page, PAGE_SIZE, 0);
- if (sz == PAGE_SIZE)
- goto page_added;
- submit_idx = index;
- submit_bio(bio);
- bio = NULL;
- }
-
- bio = gfs2_log_alloc_bio(sdp,
- je->dblock + (index << shift),
- gfs2_end_log_read);
- bio->bi_opf = REQ_OP_READ;
- sz = bio_add_page(bio, page, PAGE_SIZE, 0);
- gfs2_assert_warn(sdp, sz == PAGE_SIZE);
-
-page_added:
- if (submit_idx <= read_idx + BIO_MAX_PAGES) {
- /* Keep at least one bio in flight */
- continue;
- }
-
- gfs2_jhead_process_page(jd, read_idx++, head, &done);
- if (done)
- goto out; /* found */
- }
- }
-
-out:
- if (bio)
- submit_bio(bio);
- while (read_idx <= index)
- gfs2_jhead_process_page(jd, read_idx++, head, &done);
-
- if (!ret)
- ret = filemap_check_wb_err(mapping, since);
-
- return ret;
-}
-
static struct page *gfs2_get_log_desc(struct gfs2_sbd *sdp, u32 ld_type,
u32 ld_length, u32 ld_data1)
{
diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h
index 331160fc568b..711c4d89c063 100644
--- a/fs/gfs2/lops.h
+++ b/fs/gfs2/lops.h
@@ -30,10 +30,8 @@ extern u64 gfs2_log_bmap(struct gfs2_sbd *sdp);
extern void gfs2_log_write(struct gfs2_sbd *sdp, struct page *page,
unsigned size, unsigned offset, u64 blkno);
extern void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page);
-extern void gfs2_log_submit_bio(struct bio **biop, int opf);
+extern void gfs2_log_submit_bio(struct bio **biop, int op, int op_flags);
extern void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh);
-extern int gfs2_find_jhead(struct gfs2_jdesc *jd,
- struct gfs2_log_header_host *head);
static inline unsigned int buf_limit(struct gfs2_sbd *sdp)
{
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 1179763f6370..b041cb8ae383 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -41,7 +41,6 @@
#include "dir.h"
#include "meta_io.h"
#include "trace_gfs2.h"
-#include "lops.h"
#define DO 0
#define UNDO 1
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 7389e445a7a7..2dac43065382 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -182,6 +182,129 @@ static int get_log_header(struct gfs2_jdesc *jd, unsigned int blk,
}
/**
+ * find_good_lh - find a good log header
+ * @jd: the journal
+ * @blk: the segment to start searching from
+ * @lh: the log header to fill in
+ * @forward: if true search forward in the log, else search backward
+ *
+ * Call get_log_header() to get a log header for a segment, but if the
+ * segment is bad, either scan forward or backward until we find a good one.
+ *
+ * Returns: errno
+ */
+
+static int find_good_lh(struct gfs2_jdesc *jd, unsigned int *blk,
+ struct gfs2_log_header_host *head)
+{
+ unsigned int orig_blk = *blk;
+ int error;
+
+ for (;;) {
+ error = get_log_header(jd, *blk, head);
+ if (error <= 0)
+ return error;
+
+ if (++*blk == jd->jd_blocks)
+ *blk = 0;
+
+ if (*blk == orig_blk) {
+ gfs2_consist_inode(GFS2_I(jd->jd_inode));
+ return -EIO;
+ }
+ }
+}
+
+/**
+ * jhead_scan - make sure we've found the head of the log
+ * @jd: the journal
+ * @head: this is filled in with the log descriptor of the head
+ *
+ * At this point, seg and lh should be either the head of the log or just
+ * before. Scan forward until we find the head.
+ *
+ * Returns: errno
+ */
+
+static int jhead_scan(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head)
+{
+ unsigned int blk = head->lh_blkno;
+ struct gfs2_log_header_host lh;
+ int error;
+
+ for (;;) {
+ if (++blk == jd->jd_blocks)
+ blk = 0;
+
+ error = get_log_header(jd, blk, &lh);
+ if (error < 0)
+ return error;
+ if (error == 1)
+ continue;
+
+ if (lh.lh_sequence == head->lh_sequence) {
+ gfs2_consist_inode(GFS2_I(jd->jd_inode));
+ return -EIO;
+ }
+ if (lh.lh_sequence < head->lh_sequence)
+ break;
+
+ *head = lh;
+ }
+
+ return 0;
+}
+
+/**
+ * gfs2_find_jhead - find the head of a log
+ * @jd: the journal
+ * @head: the log descriptor for the head of the log is returned here
+ *
+ * Do a binary search of a journal and find the valid log entry with the
+ * highest sequence number. (i.e. the log head)
+ *
+ * Returns: errno
+ */
+
+int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head)
+{
+ struct gfs2_log_header_host lh_1, lh_m;
+ u32 blk_1, blk_2, blk_m;
+ int error;
+
+ blk_1 = 0;
+ blk_2 = jd->jd_blocks - 1;
+
+ for (;;) {
+ blk_m = (blk_1 + blk_2) / 2;
+
+ error = find_good_lh(jd, &blk_1, &lh_1);
+ if (error)
+ return error;
+
+ error = find_good_lh(jd, &blk_m, &lh_m);
+ if (error)
+ return error;
+
+ if (blk_1 == blk_m || blk_m == blk_2)
+ break;
+
+ if (lh_1.lh_sequence <= lh_m.lh_sequence)
+ blk_1 = blk_m;
+ else
+ blk_2 = blk_m;
+ }
+
+ error = jhead_scan(jd, &lh_1);
+ if (error)
+ return error;
+
+ *head = lh_1;
+
+ return error;
+}
+
+/**
* foreach_descriptor - go through the active part of the log
* @jd: the journal
* @start: the first log header in the active region
diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h
index 99575ab81202..11d81248be85 100644
--- a/fs/gfs2/recovery.h
+++ b/fs/gfs2/recovery.h
@@ -27,6 +27,8 @@ extern int gfs2_revoke_add(struct gfs2_jdesc *jd, u64 blkno, unsigned int where)
extern int gfs2_revoke_check(struct gfs2_jdesc *jd, u64 blkno, unsigned int where);
extern void gfs2_revoke_clean(struct gfs2_jdesc *jd);
+extern int gfs2_find_jhead(struct gfs2_jdesc *jd,
+ struct gfs2_log_header_host *head);
extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd, bool wait);
extern void gfs2_recover_func(struct work_struct *work);
extern int __get_log_header(struct gfs2_sbd *sdp,
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index d4b11c903971..ca71163ff7cf 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -45,7 +45,6 @@
#include "util.h"
#include "sys.h"
#include "xattr.h"
-#include "lops.h"
#define args_neq(a1, a2, x) ((a1)->ar_##x != (a2)->ar_##x)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 32920a10100e..b0eef008de67 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -530,7 +530,7 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
inode_lock(inode);
/* protected by i_mutex */
- if (info->seals & F_SEAL_WRITE) {
+ if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
inode_unlock(inode);
return -EPERM;
}
@@ -859,6 +859,18 @@ static int hugetlbfs_migrate_page(struct address_space *mapping,
rc = migrate_huge_page_move_mapping(mapping, newpage, page);
if (rc != MIGRATEPAGE_SUCCESS)
return rc;
+
+ /*
+ * page_private is subpool pointer in hugetlb pages. Transfer to
+ * new page. PagePrivate is not associated with page_private for
+ * hugetlb pages and can not be set here as only page_huge_active
+ * pages can be migrated.
+ */
+ if (page_private(page)) {
+ set_page_private(newpage, page_private(page));
+ set_page_private(page, 0);
+ }
+
if (mode != MIGRATE_SYNC_NO_COPY)
migrate_page_copy(newpage, page);
else
diff --git a/fs/inode.c b/fs/inode.c
index 0cd47fe0dbe5..e9d97add2b36 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -730,11 +730,8 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
return LRU_REMOVED;
}
- /*
- * Recently referenced inodes and inodes with many attached pages
- * get one more pass.
- */
- if (inode->i_state & I_REFERENCED || inode->i_data.nrpages > 1) {
+ /* recently referenced inodes get one more pass */
+ if (inode->i_state & I_REFERENCED) {
inode->i_state &= ~I_REFERENCED;
spin_unlock(&inode->i_lock);
return LRU_ROTATE;
@@ -2096,14 +2093,8 @@ EXPORT_SYMBOL(inode_dio_wait);
void inode_set_flags(struct inode *inode, unsigned int flags,
unsigned int mask)
{
- unsigned int old_flags, new_flags;
-
WARN_ON_ONCE(flags & ~mask);
- do {
- old_flags = READ_ONCE(inode->i_flags);
- new_flags = (old_flags & ~mask) | flags;
- } while (unlikely(cmpxchg(&inode->i_flags, old_flags,
- new_flags) != old_flags));
+ set_mask_bits(&inode->i_flags, mask, flags);
}
EXPORT_SYMBOL(inode_set_flags);
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 4ca0b5c18192..b84d635567d3 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -536,8 +536,8 @@ void kernfs_put(struct kernfs_node *kn)
security_release_secctx(kn->iattr->ia_secdata,
kn->iattr->ia_secdata_len);
simple_xattrs_free(&kn->iattr->xattrs);
+ kmem_cache_free(kernfs_iattrs_cache, kn->iattr);
}
- kfree(kn->iattr);
spin_lock(&kernfs_idr_lock);
idr_remove(&root->ino_idr, kn->id.ino);
spin_unlock(&kernfs_idr_lock);
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index f8d5021a652e..ae948aaa4c53 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -832,26 +832,35 @@ void kernfs_drain_open_files(struct kernfs_node *kn)
* to see if it supports poll (Neither 'poll' nor 'select' return
* an appropriate error code). When in doubt, set a suitable timeout value.
*/
+__poll_t kernfs_generic_poll(struct kernfs_open_file *of, poll_table *wait)
+{
+ struct kernfs_node *kn = kernfs_dentry_node(of->file->f_path.dentry);
+ struct kernfs_open_node *on = kn->attr.open;
+
+ poll_wait(of->file, &on->poll, wait);
+
+ if (of->event != atomic_read(&on->event))
+ return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI;
+
+ return DEFAULT_POLLMASK;
+}
+
static __poll_t kernfs_fop_poll(struct file *filp, poll_table *wait)
{
struct kernfs_open_file *of = kernfs_of(filp);
struct kernfs_node *kn = kernfs_dentry_node(filp->f_path.dentry);
- struct kernfs_open_node *on = kn->attr.open;
+ __poll_t ret;
if (!kernfs_get_active(kn))
- goto trigger;
+ return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI;
- poll_wait(filp, &on->poll, wait);
+ if (kn->attr.ops->poll)
+ ret = kn->attr.ops->poll(of, wait);
+ else
+ ret = kernfs_generic_poll(of, wait);
kernfs_put_active(kn);
-
- if (of->event != atomic_read(&on->event))
- goto trigger;
-
- return DEFAULT_POLLMASK;
-
- trigger:
- return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI;
+ return ret;
}
static void kernfs_notify_workfn(struct work_struct *work)
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index 80cebcd94c90..0c1fd945ce42 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -42,7 +42,7 @@ static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn)
if (kn->iattr)
goto out_unlock;
- kn->iattr = kzalloc(sizeof(struct kernfs_iattrs), GFP_KERNEL);
+ kn->iattr = kmem_cache_zalloc(kernfs_iattrs_cache, GFP_KERNEL);
if (!kn->iattr)
goto out_unlock;
iattrs = &kn->iattr->ia_iattr;
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index 3d83b114bb08..dba810cd83b1 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -78,7 +78,7 @@ static inline struct kernfs_node *kernfs_dentry_node(struct dentry *dentry)
}
extern const struct super_operations kernfs_sops;
-extern struct kmem_cache *kernfs_node_cache;
+extern struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache;
/*
* inode.c
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index fdf527b6d79c..f3ac352699cf 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -20,7 +20,7 @@
#include "kernfs-internal.h"
-struct kmem_cache *kernfs_node_cache;
+struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache;
static int kernfs_sop_remount_fs(struct super_block *sb, int *flags, char *data)
{
@@ -196,8 +196,10 @@ struct dentry *kernfs_node_dentry(struct kernfs_node *kn,
return dentry;
knparent = find_next_ancestor(kn, NULL);
- if (WARN_ON(!knparent))
+ if (WARN_ON(!knparent)) {
+ dput(dentry);
return ERR_PTR(-EINVAL);
+ }
do {
struct dentry *dtmp;
@@ -206,8 +208,10 @@ struct dentry *kernfs_node_dentry(struct kernfs_node *kn,
if (kn == knparent)
return dentry;
kntmp = find_next_ancestor(kn, knparent);
- if (WARN_ON(!kntmp))
+ if (WARN_ON(!kntmp)) {
+ dput(dentry);
return ERR_PTR(-EINVAL);
+ }
dtmp = lookup_one_len_unlocked(kntmp->name, dentry,
strlen(kntmp->name));
dput(dentry);
@@ -417,4 +421,9 @@ void __init kernfs_init(void)
0,
SLAB_PANIC | SLAB_TYPESAFE_BY_RCU,
NULL);
+
+ /* Creates slab cache for kernfs inode attributes */
+ kernfs_iattrs_cache = kmem_cache_create("kernfs_iattrs_cache",
+ sizeof(struct kernfs_iattrs),
+ 0, SLAB_PANIC, NULL);
}
diff --git a/fs/locks.c b/fs/locks.c
index ff6af2c32601..eaa1cfaf73b0 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1058,7 +1058,7 @@ static int flock_lock_inode(struct inode *inode, struct file_lock *request)
return -ENOMEM;
}
- percpu_down_read_preempt_disable(&file_rwsem);
+ percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
if (request->fl_flags & FL_ACCESS)
goto find_conflict;
@@ -1100,7 +1100,7 @@ find_conflict:
out:
spin_unlock(&ctx->flc_lock);
- percpu_up_read_preempt_enable(&file_rwsem);
+ percpu_up_read(&file_rwsem);
if (new_fl)
locks_free_lock(new_fl);
locks_dispose_list(&dispose);
@@ -1138,7 +1138,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request,
new_fl2 = locks_alloc_lock();
}
- percpu_down_read_preempt_disable(&file_rwsem);
+ percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
/*
* New lock request. Walk all POSIX locks and look for conflicts. If
@@ -1312,7 +1312,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request,
}
out:
spin_unlock(&ctx->flc_lock);
- percpu_up_read_preempt_enable(&file_rwsem);
+ percpu_up_read(&file_rwsem);
/*
* Free any unused locks.
*/
@@ -1584,7 +1584,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
return error;
}
- percpu_down_read_preempt_disable(&file_rwsem);
+ percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
time_out_leases(inode, &dispose);
@@ -1636,13 +1636,13 @@ restart:
locks_insert_block(fl, new_fl, leases_conflict);
trace_break_lease_block(inode, new_fl);
spin_unlock(&ctx->flc_lock);
- percpu_up_read_preempt_enable(&file_rwsem);
+ percpu_up_read(&file_rwsem);
locks_dispose_list(&dispose);
error = wait_event_interruptible_timeout(new_fl->fl_wait,
!new_fl->fl_blocker, break_time);
- percpu_down_read_preempt_disable(&file_rwsem);
+ percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
trace_break_lease_unblock(inode, new_fl);
locks_delete_block(new_fl);
@@ -1659,7 +1659,7 @@ restart:
}
out:
spin_unlock(&ctx->flc_lock);
- percpu_up_read_preempt_enable(&file_rwsem);
+ percpu_up_read(&file_rwsem);
locks_dispose_list(&dispose);
locks_free_lock(new_fl);
return error;
@@ -1729,7 +1729,7 @@ int fcntl_getlease(struct file *filp)
ctx = smp_load_acquire(&inode->i_flctx);
if (ctx && !list_empty_careful(&ctx->flc_lease)) {
- percpu_down_read_preempt_disable(&file_rwsem);
+ percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
time_out_leases(inode, &dispose);
list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
@@ -1739,7 +1739,7 @@ int fcntl_getlease(struct file *filp)
break;
}
spin_unlock(&ctx->flc_lock);
- percpu_up_read_preempt_enable(&file_rwsem);
+ percpu_up_read(&file_rwsem);
locks_dispose_list(&dispose);
}
@@ -1813,7 +1813,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
return -EINVAL;
}
- percpu_down_read_preempt_disable(&file_rwsem);
+ percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
time_out_leases(inode, &dispose);
error = check_conflicting_open(dentry, arg, lease->fl_flags);
@@ -1884,7 +1884,7 @@ out_setup:
lease->fl_lmops->lm_setup(lease, priv);
out:
spin_unlock(&ctx->flc_lock);
- percpu_up_read_preempt_enable(&file_rwsem);
+ percpu_up_read(&file_rwsem);
locks_dispose_list(&dispose);
if (is_deleg)
inode_unlock(inode);
@@ -1907,7 +1907,7 @@ static int generic_delete_lease(struct file *filp, void *owner)
return error;
}
- percpu_down_read_preempt_disable(&file_rwsem);
+ percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
if (fl->fl_file == filp &&
@@ -1920,7 +1920,7 @@ static int generic_delete_lease(struct file *filp, void *owner)
if (victim)
error = fl->fl_lmops->lm_change(victim, F_UNLCK, &dispose);
spin_unlock(&ctx->flc_lock);
- percpu_up_read_preempt_enable(&file_rwsem);
+ percpu_up_read(&file_rwsem);
locks_dispose_list(&dispose);
return error;
}
@@ -2643,13 +2643,13 @@ locks_remove_lease(struct file *filp, struct file_lock_context *ctx)
if (list_empty(&ctx->flc_lease))
return;
- percpu_down_read_preempt_disable(&file_rwsem);
+ percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list)
if (filp == fl->fl_file)
lease_modify(fl, F_UNLCK, &dispose);
spin_unlock(&ctx->flc_lock);
- percpu_up_read_preempt_enable(&file_rwsem);
+ percpu_up_read(&file_rwsem);
locks_dispose_list(&dispose);
}
diff --git a/fs/namei.c b/fs/namei.c
index 914178cdbe94..0a8c5c27f90e 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -39,7 +39,6 @@
#include <linux/bitops.h>
#include <linux/init_task.h>
#include <linux/uaccess.h>
-#include <linux/build_bug.h>
#include "internal.h"
#include "mount.h"
@@ -131,7 +130,6 @@ getname_flags(const char __user *filename, int flags, int *empty)
struct filename *result;
char *kname;
int len;
- BUILD_BUG_ON(offsetof(struct filename, iname) % sizeof(long) != 0);
result = audit_reusename(filename);
if (result)
@@ -2720,7 +2718,7 @@ filename_mountpoint(int dfd, struct filename *name, struct path *path,
if (unlikely(error == -ESTALE))
error = path_mountpoint(&nd, flags | LOOKUP_REVAL, path);
if (likely(!error))
- audit_inode(name, path->dentry, 0);
+ audit_inode(name, path->dentry, flags & LOOKUP_NO_EVAL);
restore_nameidata();
putname(name);
return error;
diff --git a/fs/namespace.c b/fs/namespace.c
index a677b59efd74..98a8c182af4f 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1640,6 +1640,8 @@ int ksys_umount(char __user *name, int flags)
if (!(flags & UMOUNT_NOFOLLOW))
lookup_flags |= LOOKUP_FOLLOW;
+ lookup_flags |= LOOKUP_NO_EVAL;
+
retval = user_path_mountpoint_at(AT_FDCWD, name, lookup_flags, &path);
if (retval)
goto out;
@@ -2698,7 +2700,6 @@ static long exact_copy_from_user(void *to, const void __user * from,
if (!access_ok(from, n))
return n;
- current->kernel_uaccess_faults_ok++;
while (n) {
if (__get_user(c, f)) {
memset(t, 0, n);
@@ -2708,7 +2709,6 @@ static long exact_copy_from_user(void *to, const void __user * from,
f++;
n--;
}
- current->kernel_uaccess_faults_ok--;
return n;
}
@@ -2746,7 +2746,7 @@ void *copy_mount_options(const void __user * data)
char *copy_mount_string(const void __user *data)
{
- return data ? strndup_user(data, PAGE_SIZE) : NULL;
+ return data ? strndup_user(data, PATH_MAX) : NULL;
}
/*
diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c
index 3f23b6840547..bf34ddaa2ad7 100644
--- a/fs/nfs/nfs4idmap.c
+++ b/fs/nfs/nfs4idmap.c
@@ -44,6 +44,7 @@
#include <linux/keyctl.h>
#include <linux/key-type.h>
#include <keys/user-type.h>
+#include <keys/request_key_auth-type.h>
#include <linux/module.h>
#include "internal.h"
@@ -59,7 +60,7 @@ static struct key_type key_type_id_resolver_legacy;
struct idmap_legacy_upcalldata {
struct rpc_pipe_msg pipe_msg;
struct idmap_msg idmap_msg;
- struct key_construction *key_cons;
+ struct key *authkey;
struct idmap *idmap;
};
@@ -384,7 +385,7 @@ static const match_table_t nfs_idmap_tokens = {
{ Opt_find_err, NULL }
};
-static int nfs_idmap_legacy_upcall(struct key_construction *, const char *, void *);
+static int nfs_idmap_legacy_upcall(struct key *, void *);
static ssize_t idmap_pipe_downcall(struct file *, const char __user *,
size_t);
static void idmap_release_pipe(struct inode *);
@@ -549,11 +550,12 @@ nfs_idmap_prepare_pipe_upcall(struct idmap *idmap,
static void
nfs_idmap_complete_pipe_upcall_locked(struct idmap *idmap, int ret)
{
- struct key_construction *cons = idmap->idmap_upcall_data->key_cons;
+ struct key *authkey = idmap->idmap_upcall_data->authkey;
kfree(idmap->idmap_upcall_data);
idmap->idmap_upcall_data = NULL;
- complete_request_key(cons, ret);
+ complete_request_key(authkey, ret);
+ key_put(authkey);
}
static void
@@ -563,15 +565,14 @@ nfs_idmap_abort_pipe_upcall(struct idmap *idmap, int ret)
nfs_idmap_complete_pipe_upcall_locked(idmap, ret);
}
-static int nfs_idmap_legacy_upcall(struct key_construction *cons,
- const char *op,
- void *aux)
+static int nfs_idmap_legacy_upcall(struct key *authkey, void *aux)
{
struct idmap_legacy_upcalldata *data;
+ struct request_key_auth *rka = get_request_key_auth(authkey);
struct rpc_pipe_msg *msg;
struct idmap_msg *im;
struct idmap *idmap = (struct idmap *)aux;
- struct key *key = cons->key;
+ struct key *key = rka->target_key;
int ret = -ENOKEY;
if (!aux)
@@ -586,7 +587,7 @@ static int nfs_idmap_legacy_upcall(struct key_construction *cons,
msg = &data->pipe_msg;
im = &data->idmap_msg;
data->idmap = idmap;
- data->key_cons = cons;
+ data->authkey = key_get(authkey);
ret = nfs_idmap_prepare_message(key->description, idmap, im, msg);
if (ret < 0)
@@ -604,7 +605,7 @@ static int nfs_idmap_legacy_upcall(struct key_construction *cons,
out2:
kfree(data);
out1:
- complete_request_key(cons, ret);
+ complete_request_key(authkey, ret);
return ret;
}
@@ -651,9 +652,10 @@ out:
static ssize_t
idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
{
+ struct request_key_auth *rka;
struct rpc_inode *rpci = RPC_I(file_inode(filp));
struct idmap *idmap = (struct idmap *)rpci->private;
- struct key_construction *cons;
+ struct key *authkey;
struct idmap_msg im;
size_t namelen_in;
int ret = -ENOKEY;
@@ -665,7 +667,8 @@ idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
if (idmap->idmap_upcall_data == NULL)
goto out_noupcall;
- cons = idmap->idmap_upcall_data->key_cons;
+ authkey = idmap->idmap_upcall_data->authkey;
+ rka = get_request_key_auth(authkey);
if (mlen != sizeof(im)) {
ret = -ENOSPC;
@@ -690,9 +693,9 @@ idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
ret = nfs_idmap_read_and_verify_message(&im,
&idmap->idmap_upcall_data->idmap_msg,
- cons->key, cons->authkey);
+ rka->target_key, authkey);
if (ret >= 0) {
- key_set_timeout(cons->key, nfs_idmap_cache_timeout);
+ key_set_timeout(rka->target_key, nfs_idmap_cache_timeout);
ret = mlen;
}
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index f12cb31a41e5..d09c9f878141 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -238,9 +238,9 @@ out:
}
/* A writeback failed: mark the page as bad, and invalidate the page cache */
-static void nfs_set_pageerror(struct page *page)
+static void nfs_set_pageerror(struct address_space *mapping)
{
- nfs_zap_mapping(page_file_mapping(page)->host, page_file_mapping(page));
+ nfs_zap_mapping(mapping->host, mapping);
}
/*
@@ -994,7 +994,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr)
nfs_list_remove_request(req);
if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) &&
(hdr->good_bytes < bytes)) {
- nfs_set_pageerror(req->wb_page);
+ nfs_set_pageerror(page_file_mapping(req->wb_page));
nfs_context_set_write_error(req->wb_context, hdr->error);
goto remove_req;
}
@@ -1348,7 +1348,8 @@ int nfs_updatepage(struct file *file, struct page *page,
unsigned int offset, unsigned int count)
{
struct nfs_open_context *ctx = nfs_file_open_context(file);
- struct inode *inode = page_file_mapping(page)->host;
+ struct address_space *mapping = page_file_mapping(page);
+ struct inode *inode = mapping->host;
int status = 0;
nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE);
@@ -1366,7 +1367,7 @@ int nfs_updatepage(struct file *file, struct page *page,
status = nfs_writepage_setup(ctx, page, offset, count);
if (status < 0)
- nfs_set_pageerror(page);
+ nfs_set_pageerror(mapping);
else
__set_page_dirty_nobuffers(page);
out:
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index b33f9785b756..72a7681f4046 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1239,8 +1239,8 @@ static __net_init int nfsd_init_net(struct net *net)
retval = nfsd_idmap_init(net);
if (retval)
goto out_idmap_error;
- nn->nfsd4_lease = 45; /* default lease time */
- nn->nfsd4_grace = 45;
+ nn->nfsd4_lease = 90; /* default lease time */
+ nn->nfsd4_grace = 90;
nn->somebody_reclaimed = false;
nn->clverifier_counter = prandom_u32();
nn->clientid_counter = prandom_u32();
diff --git a/fs/notify/fanotify/Kconfig b/fs/notify/fanotify/Kconfig
index 41355ce74ac0..735bfb2e9190 100644
--- a/fs/notify/fanotify/Kconfig
+++ b/fs/notify/fanotify/Kconfig
@@ -2,6 +2,7 @@ config FANOTIFY
bool "Filesystem wide access notification"
select FSNOTIFY
select ANON_INODES
+ select EXPORTFS
default n
---help---
Say Y here to enable fanotify support. fanotify is a file access
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 3723f3d18d20..6b9c27548997 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -13,22 +13,40 @@
#include <linux/wait.h>
#include <linux/audit.h>
#include <linux/sched/mm.h>
+#include <linux/statfs.h>
#include "fanotify.h"
static bool should_merge(struct fsnotify_event *old_fsn,
struct fsnotify_event *new_fsn)
{
- struct fanotify_event_info *old, *new;
+ struct fanotify_event *old, *new;
pr_debug("%s: old=%p new=%p\n", __func__, old_fsn, new_fsn);
old = FANOTIFY_E(old_fsn);
new = FANOTIFY_E(new_fsn);
- if (old_fsn->inode == new_fsn->inode && old->pid == new->pid &&
- old->path.mnt == new->path.mnt &&
- old->path.dentry == new->path.dentry)
- return true;
+ if (old_fsn->inode != new_fsn->inode || old->pid != new->pid ||
+ old->fh_type != new->fh_type || old->fh_len != new->fh_len)
+ return false;
+
+ if (fanotify_event_has_path(old)) {
+ return old->path.mnt == new->path.mnt &&
+ old->path.dentry == new->path.dentry;
+ } else if (fanotify_event_has_fid(old)) {
+ /*
+ * We want to merge many dirent events in the same dir (i.e.
+ * creates/unlinks/renames), but we do not want to merge dirent
+ * events referring to subdirs with dirent events referring to
+ * non subdirs, otherwise, user won't be able to tell from a
+ * mask FAN_CREATE|FAN_DELETE|FAN_ONDIR if it describes mkdir+
+ * unlink pair or rmdir+create pair of events.
+ */
+ return (old->mask & FS_ISDIR) == (new->mask & FS_ISDIR) &&
+ fanotify_fid_equal(&old->fid, &new->fid, old->fh_len);
+ }
+
+ /* Do not merge events if we failed to encode fid */
return false;
}
@@ -36,20 +54,22 @@ static bool should_merge(struct fsnotify_event *old_fsn,
static int fanotify_merge(struct list_head *list, struct fsnotify_event *event)
{
struct fsnotify_event *test_event;
+ struct fanotify_event *new;
pr_debug("%s: list=%p event=%p\n", __func__, list, event);
+ new = FANOTIFY_E(event);
/*
* Don't merge a permission event with any other event so that we know
* the event structure we have created in fanotify_handle_event() is the
* one we should check for permission response.
*/
- if (fanotify_is_perm_event(event->mask))
+ if (fanotify_is_perm_event(new->mask))
return 0;
list_for_each_entry_reverse(test_event, list, list) {
if (should_merge(test_event, event)) {
- test_event->mask |= event->mask;
+ FANOTIFY_E(test_event)->mask |= new->mask;
return 1;
}
}
@@ -57,15 +77,44 @@ static int fanotify_merge(struct list_head *list, struct fsnotify_event *event)
return 0;
}
+/*
+ * Wait for response to permission event. The function also takes care of
+ * freeing the permission event (or offloads that in case the wait is canceled
+ * by a signal). The function returns 0 in case access got allowed by userspace,
+ * -EPERM in case userspace disallowed the access, and -ERESTARTSYS in case
+ * the wait got interrupted by a signal.
+ */
static int fanotify_get_response(struct fsnotify_group *group,
- struct fanotify_perm_event_info *event,
+ struct fanotify_perm_event *event,
struct fsnotify_iter_info *iter_info)
{
int ret;
pr_debug("%s: group=%p event=%p\n", __func__, group, event);
- wait_event(group->fanotify_data.access_waitq, event->response);
+ ret = wait_event_killable(group->fanotify_data.access_waitq,
+ event->state == FAN_EVENT_ANSWERED);
+ /* Signal pending? */
+ if (ret < 0) {
+ spin_lock(&group->notification_lock);
+ /* Event reported to userspace and no answer yet? */
+ if (event->state == FAN_EVENT_REPORTED) {
+ /* Event will get freed once userspace answers to it */
+ event->state = FAN_EVENT_CANCELED;
+ spin_unlock(&group->notification_lock);
+ return ret;
+ }
+ /* Event not yet reported? Just remove it. */
+ if (event->state == FAN_EVENT_INIT)
+ fsnotify_remove_queued_event(group, &event->fae.fse);
+ /*
+ * Event may be also answered in case signal delivery raced
+ * with wakeup. In that case we have nothing to do besides
+ * freeing the event and reporting error.
+ */
+ spin_unlock(&group->notification_lock);
+ goto out;
+ }
/* userspace responded, convert to something usable */
switch (event->response & ~FAN_AUDIT) {
@@ -81,11 +130,11 @@ static int fanotify_get_response(struct fsnotify_group *group,
if (event->response & FAN_AUDIT)
audit_fanotify(event->response & ~FAN_AUDIT);
- event->response = 0;
-
pr_debug("%s: group=%p event=%p about to return ret=%d\n", __func__,
group, event, ret);
-
+out:
+ fsnotify_destroy_event(group, &event->fae.fse);
+
return ret;
}
@@ -95,11 +144,13 @@ static int fanotify_get_response(struct fsnotify_group *group,
* been included within the event mask, but have not been explicitly
* requested by the user, will not be present in the returned mask.
*/
-static u32 fanotify_group_event_mask(struct fsnotify_iter_info *iter_info,
- u32 event_mask, const void *data,
- int data_type)
+static u32 fanotify_group_event_mask(struct fsnotify_group *group,
+ struct fsnotify_iter_info *iter_info,
+ u32 event_mask, const void *data,
+ int data_type)
{
__u32 marks_mask = 0, marks_ignored_mask = 0;
+ __u32 test_mask, user_mask = FANOTIFY_OUTGOING_EVENTS;
const struct path *path = data;
struct fsnotify_mark *mark;
int type;
@@ -107,14 +158,14 @@ static u32 fanotify_group_event_mask(struct fsnotify_iter_info *iter_info,
pr_debug("%s: report_mask=%x mask=%x data=%p data_type=%d\n",
__func__, iter_info->report_mask, event_mask, data, data_type);
- /* If we don't have enough info to send an event to userspace say no */
- if (data_type != FSNOTIFY_EVENT_PATH)
- return 0;
-
- /* Sorry, fanotify only gives a damn about files and dirs */
- if (!d_is_reg(path->dentry) &&
- !d_can_lookup(path->dentry))
- return 0;
+ if (!FAN_GROUP_FLAG(group, FAN_REPORT_FID)) {
+ /* Do we have path to open a file descriptor? */
+ if (data_type != FSNOTIFY_EVENT_PATH)
+ return 0;
+ /* Path type events are only relevant for files and dirs */
+ if (!d_is_reg(path->dentry) && !d_can_lookup(path->dentry))
+ return 0;
+ }
fsnotify_foreach_obj_type(type) {
if (!fsnotify_iter_should_report_type(iter_info, type))
@@ -133,20 +184,106 @@ static u32 fanotify_group_event_mask(struct fsnotify_iter_info *iter_info,
marks_ignored_mask |= mark->ignored_mask;
}
- if (d_is_dir(path->dentry) &&
+ test_mask = event_mask & marks_mask & ~marks_ignored_mask;
+
+ /*
+ * dirent modification events (create/delete/move) do not carry the
+ * child entry name/inode information. Instead, we report FAN_ONDIR
+ * for mkdir/rmdir so user can differentiate them from creat/unlink.
+ *
+ * For backward compatibility and consistency, do not report FAN_ONDIR
+ * to user in legacy fanotify mode (reporting fd) and report FAN_ONDIR
+ * to user in FAN_REPORT_FID mode for all event types.
+ */
+ if (FAN_GROUP_FLAG(group, FAN_REPORT_FID)) {
+ /* Do not report FAN_ONDIR without any event */
+ if (!(test_mask & ~FAN_ONDIR))
+ return 0;
+ } else {
+ user_mask &= ~FAN_ONDIR;
+ }
+
+ if (event_mask & FS_ISDIR &&
!(marks_mask & FS_ISDIR & ~marks_ignored_mask))
return 0;
- return event_mask & FANOTIFY_OUTGOING_EVENTS & marks_mask &
- ~marks_ignored_mask;
+ return test_mask & user_mask;
+}
+
+static int fanotify_encode_fid(struct fanotify_event *event,
+ struct inode *inode, gfp_t gfp,
+ __kernel_fsid_t *fsid)
+{
+ struct fanotify_fid *fid = &event->fid;
+ int dwords, bytes = 0;
+ int err, type;
+
+ fid->ext_fh = NULL;
+ dwords = 0;
+ err = -ENOENT;
+ type = exportfs_encode_inode_fh(inode, NULL, &dwords, NULL);
+ if (!dwords)
+ goto out_err;
+
+ bytes = dwords << 2;
+ if (bytes > FANOTIFY_INLINE_FH_LEN) {
+ /* Treat failure to allocate fh as failure to allocate event */
+ err = -ENOMEM;
+ fid->ext_fh = kmalloc(bytes, gfp);
+ if (!fid->ext_fh)
+ goto out_err;
+ }
+
+ type = exportfs_encode_inode_fh(inode, fanotify_fid_fh(fid, bytes),
+ &dwords, NULL);
+ err = -EINVAL;
+ if (!type || type == FILEID_INVALID || bytes != dwords << 2)
+ goto out_err;
+
+ fid->fsid = *fsid;
+ event->fh_len = bytes;
+
+ return type;
+
+out_err:
+ pr_warn_ratelimited("fanotify: failed to encode fid (fsid=%x.%x, "
+ "type=%d, bytes=%d, err=%i)\n",
+ fsid->val[0], fsid->val[1], type, bytes, err);
+ kfree(fid->ext_fh);
+ fid->ext_fh = NULL;
+ event->fh_len = 0;
+
+ return FILEID_INVALID;
}
-struct fanotify_event_info *fanotify_alloc_event(struct fsnotify_group *group,
- struct inode *inode, u32 mask,
- const struct path *path)
+/*
+ * The inode to use as identifier when reporting fid depends on the event.
+ * Report the modified directory inode on dirent modification events.
+ * Report the "victim" inode otherwise.
+ * For example:
+ * FS_ATTRIB reports the child inode even if reported on a watched parent.
+ * FS_CREATE reports the modified dir inode and not the created inode.
+ */
+static struct inode *fanotify_fid_inode(struct inode *to_tell, u32 event_mask,
+ const void *data, int data_type)
{
- struct fanotify_event_info *event = NULL;
+ if (event_mask & ALL_FSNOTIFY_DIRENT_EVENTS)
+ return to_tell;
+ else if (data_type == FSNOTIFY_EVENT_INODE)
+ return (struct inode *)data;
+ else if (data_type == FSNOTIFY_EVENT_PATH)
+ return d_inode(((struct path *)data)->dentry);
+ return NULL;
+}
+
+struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group,
+ struct inode *inode, u32 mask,
+ const void *data, int data_type,
+ __kernel_fsid_t *fsid)
+{
+ struct fanotify_event *event = NULL;
gfp_t gfp = GFP_KERNEL_ACCOUNT;
+ struct inode *id = fanotify_fid_inode(inode, mask, data, data_type);
/*
* For queues with unlimited length lost events are not expected and
@@ -160,28 +297,36 @@ struct fanotify_event_info *fanotify_alloc_event(struct fsnotify_group *group,
memalloc_use_memcg(group->memcg);
if (fanotify_is_perm_event(mask)) {
- struct fanotify_perm_event_info *pevent;
+ struct fanotify_perm_event *pevent;
pevent = kmem_cache_alloc(fanotify_perm_event_cachep, gfp);
if (!pevent)
goto out;
event = &pevent->fae;
pevent->response = 0;
+ pevent->state = FAN_EVENT_INIT;
goto init;
}
event = kmem_cache_alloc(fanotify_event_cachep, gfp);
if (!event)
goto out;
init: __maybe_unused
- fsnotify_init_event(&event->fse, inode, mask);
+ fsnotify_init_event(&event->fse, inode);
+ event->mask = mask;
if (FAN_GROUP_FLAG(group, FAN_REPORT_TID))
event->pid = get_pid(task_pid(current));
else
event->pid = get_pid(task_tgid(current));
- if (path) {
- event->path = *path;
+ event->fh_len = 0;
+ if (id && FAN_GROUP_FLAG(group, FAN_REPORT_FID)) {
+ /* Report the event without a file identifier on encode error */
+ event->fh_type = fanotify_encode_fid(event, id, gfp, fsid);
+ } else if (data_type == FSNOTIFY_EVENT_PATH) {
+ event->fh_type = FILEID_ROOT;
+ event->path = *((struct path *)data);
path_get(&event->path);
} else {
+ event->fh_type = FILEID_INVALID;
event->path.mnt = NULL;
event->path.dentry = NULL;
}
@@ -190,6 +335,29 @@ out:
return event;
}
+/*
+ * Get cached fsid of the filesystem containing the object from any connector.
+ * All connectors are supposed to have the same fsid, but we do not verify that
+ * here.
+ */
+static __kernel_fsid_t fanotify_get_fsid(struct fsnotify_iter_info *iter_info)
+{
+ int type;
+ __kernel_fsid_t fsid = {};
+
+ fsnotify_foreach_obj_type(type) {
+ if (!fsnotify_iter_should_report_type(iter_info, type))
+ continue;
+
+ fsid = iter_info->marks[type]->connector->fsid;
+ if (WARN_ON_ONCE(!fsid.val[0] && !fsid.val[1]))
+ continue;
+ return fsid;
+ }
+
+ return fsid;
+}
+
static int fanotify_handle_event(struct fsnotify_group *group,
struct inode *inode,
u32 mask, const void *data, int data_type,
@@ -197,14 +365,22 @@ static int fanotify_handle_event(struct fsnotify_group *group,
struct fsnotify_iter_info *iter_info)
{
int ret = 0;
- struct fanotify_event_info *event;
+ struct fanotify_event *event;
struct fsnotify_event *fsn_event;
+ __kernel_fsid_t fsid = {};
BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS);
BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY);
+ BUILD_BUG_ON(FAN_ATTRIB != FS_ATTRIB);
BUILD_BUG_ON(FAN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE);
BUILD_BUG_ON(FAN_CLOSE_WRITE != FS_CLOSE_WRITE);
BUILD_BUG_ON(FAN_OPEN != FS_OPEN);
+ BUILD_BUG_ON(FAN_MOVED_TO != FS_MOVED_TO);
+ BUILD_BUG_ON(FAN_MOVED_FROM != FS_MOVED_FROM);
+ BUILD_BUG_ON(FAN_CREATE != FS_CREATE);
+ BUILD_BUG_ON(FAN_DELETE != FS_DELETE);
+ BUILD_BUG_ON(FAN_DELETE_SELF != FS_DELETE_SELF);
+ BUILD_BUG_ON(FAN_MOVE_SELF != FS_MOVE_SELF);
BUILD_BUG_ON(FAN_EVENT_ON_CHILD != FS_EVENT_ON_CHILD);
BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW);
BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM);
@@ -213,9 +389,10 @@ static int fanotify_handle_event(struct fsnotify_group *group,
BUILD_BUG_ON(FAN_OPEN_EXEC != FS_OPEN_EXEC);
BUILD_BUG_ON(FAN_OPEN_EXEC_PERM != FS_OPEN_EXEC_PERM);
- BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 12);
+ BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 19);
- mask = fanotify_group_event_mask(iter_info, mask, data, data_type);
+ mask = fanotify_group_event_mask(group, iter_info, mask, data,
+ data_type);
if (!mask)
return 0;
@@ -231,7 +408,11 @@ static int fanotify_handle_event(struct fsnotify_group *group,
return 0;
}
- event = fanotify_alloc_event(group, inode, mask, data);
+ if (FAN_GROUP_FLAG(group, FAN_REPORT_FID))
+ fsid = fanotify_get_fsid(iter_info);
+
+ event = fanotify_alloc_event(group, inode, mask, data, data_type,
+ &fsid);
ret = -ENOMEM;
if (unlikely(!event)) {
/*
@@ -255,7 +436,6 @@ static int fanotify_handle_event(struct fsnotify_group *group,
} else if (fanotify_is_perm_event(mask)) {
ret = fanotify_get_response(group, FANOTIFY_PE(fsn_event),
iter_info);
- fsnotify_destroy_event(group, fsn_event);
}
finish:
if (fanotify_is_perm_event(mask))
@@ -275,12 +455,15 @@ static void fanotify_free_group_priv(struct fsnotify_group *group)
static void fanotify_free_event(struct fsnotify_event *fsn_event)
{
- struct fanotify_event_info *event;
+ struct fanotify_event *event;
event = FANOTIFY_E(fsn_event);
- path_put(&event->path);
+ if (fanotify_event_has_path(event))
+ path_put(&event->path);
+ else if (fanotify_event_has_ext_fh(event))
+ kfree(event->fid.ext_fh);
put_pid(event->pid);
- if (fanotify_is_perm_event(fsn_event->mask)) {
+ if (fanotify_is_perm_event(event->mask)) {
kmem_cache_free(fanotify_perm_event_cachep,
FANOTIFY_PE(fsn_event));
return;
diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
index ea05b8a401e7..68b30504284c 100644
--- a/fs/notify/fanotify/fanotify.h
+++ b/fs/notify/fanotify/fanotify.h
@@ -2,26 +2,112 @@
#include <linux/fsnotify_backend.h>
#include <linux/path.h>
#include <linux/slab.h>
+#include <linux/exportfs.h>
extern struct kmem_cache *fanotify_mark_cache;
extern struct kmem_cache *fanotify_event_cachep;
extern struct kmem_cache *fanotify_perm_event_cachep;
+/* Possible states of the permission event */
+enum {
+ FAN_EVENT_INIT,
+ FAN_EVENT_REPORTED,
+ FAN_EVENT_ANSWERED,
+ FAN_EVENT_CANCELED,
+};
+
+/*
+ * 3 dwords are sufficient for most local fs (64bit ino, 32bit generation).
+ * For 32bit arch, fid increases the size of fanotify_event by 12 bytes and
+ * fh_* fields increase the size of fanotify_event by another 4 bytes.
+ * For 64bit arch, fid increases the size of fanotify_fid by 8 bytes and
+ * fh_* fields are packed in a hole after mask.
+ */
+#if BITS_PER_LONG == 32
+#define FANOTIFY_INLINE_FH_LEN (3 << 2)
+#else
+#define FANOTIFY_INLINE_FH_LEN (4 << 2)
+#endif
+
+struct fanotify_fid {
+ __kernel_fsid_t fsid;
+ union {
+ unsigned char fh[FANOTIFY_INLINE_FH_LEN];
+ unsigned char *ext_fh;
+ };
+};
+
+static inline void *fanotify_fid_fh(struct fanotify_fid *fid,
+ unsigned int fh_len)
+{
+ return fh_len <= FANOTIFY_INLINE_FH_LEN ? fid->fh : fid->ext_fh;
+}
+
+static inline bool fanotify_fid_equal(struct fanotify_fid *fid1,
+ struct fanotify_fid *fid2,
+ unsigned int fh_len)
+{
+ return fid1->fsid.val[0] == fid2->fsid.val[0] &&
+ fid1->fsid.val[1] == fid2->fsid.val[1] &&
+ !memcmp(fanotify_fid_fh(fid1, fh_len),
+ fanotify_fid_fh(fid2, fh_len), fh_len);
+}
+
/*
* Structure for normal fanotify events. It gets allocated in
* fanotify_handle_event() and freed when the information is retrieved by
* userspace
*/
-struct fanotify_event_info {
+struct fanotify_event {
struct fsnotify_event fse;
+ u32 mask;
/*
- * We hold ref to this path so it may be dereferenced at any point
- * during this object's lifetime
+ * Those fields are outside fanotify_fid to pack fanotify_event nicely
+ * on 64bit arch and to use fh_type as an indication of whether path
+ * or fid are used in the union:
+ * FILEID_ROOT (0) for path, > 0 for fid, FILEID_INVALID for neither.
*/
- struct path path;
+ u8 fh_type;
+ u8 fh_len;
+ u16 pad;
+ union {
+ /*
+ * We hold ref to this path so it may be dereferenced at any
+ * point during this object's lifetime
+ */
+ struct path path;
+ /*
+ * With FAN_REPORT_FID, we do not hold any reference on the
+ * victim object. Instead we store its NFS file handle and its
+ * filesystem's fsid as a unique identifier.
+ */
+ struct fanotify_fid fid;
+ };
struct pid *pid;
};
+static inline bool fanotify_event_has_path(struct fanotify_event *event)
+{
+ return event->fh_type == FILEID_ROOT;
+}
+
+static inline bool fanotify_event_has_fid(struct fanotify_event *event)
+{
+ return event->fh_type != FILEID_ROOT &&
+ event->fh_type != FILEID_INVALID;
+}
+
+static inline bool fanotify_event_has_ext_fh(struct fanotify_event *event)
+{
+ return fanotify_event_has_fid(event) &&
+ event->fh_len > FANOTIFY_INLINE_FH_LEN;
+}
+
+static inline void *fanotify_event_fh(struct fanotify_event *event)
+{
+ return fanotify_fid_fh(&event->fid, event->fh_len);
+}
+
/*
* Structure for permission fanotify events. It gets allocated and freed in
* fanotify_handle_event() since we wait there for user response. When the
@@ -29,16 +115,17 @@ struct fanotify_event_info {
* group->notification_list to group->fanotify_data.access_list to wait for
* user response.
*/
-struct fanotify_perm_event_info {
- struct fanotify_event_info fae;
- int response; /* userspace answer to question */
+struct fanotify_perm_event {
+ struct fanotify_event fae;
+ unsigned short response; /* userspace answer to the event */
+ unsigned short state; /* state of the event */
int fd; /* fd we passed to userspace for this event */
};
-static inline struct fanotify_perm_event_info *
+static inline struct fanotify_perm_event *
FANOTIFY_PE(struct fsnotify_event *fse)
{
- return container_of(fse, struct fanotify_perm_event_info, fae.fse);
+ return container_of(fse, struct fanotify_perm_event, fae.fse);
}
static inline bool fanotify_is_perm_event(u32 mask)
@@ -47,11 +134,12 @@ static inline bool fanotify_is_perm_event(u32 mask)
mask & FANOTIFY_PERM_EVENTS;
}
-static inline struct fanotify_event_info *FANOTIFY_E(struct fsnotify_event *fse)
+static inline struct fanotify_event *FANOTIFY_E(struct fsnotify_event *fse)
{
- return container_of(fse, struct fanotify_event_info, fse);
+ return container_of(fse, struct fanotify_event, fse);
}
-struct fanotify_event_info *fanotify_alloc_event(struct fsnotify_group *group,
- struct inode *inode, u32 mask,
- const struct path *path);
+struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group,
+ struct inode *inode, u32 mask,
+ const void *data, int data_type,
+ __kernel_fsid_t *fsid);
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 9c870b0d2b56..56992b32c6bb 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -17,6 +17,8 @@
#include <linux/compat.h>
#include <linux/sched/signal.h>
#include <linux/memcontrol.h>
+#include <linux/statfs.h>
+#include <linux/exportfs.h>
#include <asm/ioctls.h>
@@ -47,33 +49,55 @@ struct kmem_cache *fanotify_mark_cache __read_mostly;
struct kmem_cache *fanotify_event_cachep __read_mostly;
struct kmem_cache *fanotify_perm_event_cachep __read_mostly;
+#define FANOTIFY_EVENT_ALIGN 4
+
+static int fanotify_event_info_len(struct fanotify_event *event)
+{
+ if (!fanotify_event_has_fid(event))
+ return 0;
+
+ return roundup(sizeof(struct fanotify_event_info_fid) +
+ sizeof(struct file_handle) + event->fh_len,
+ FANOTIFY_EVENT_ALIGN);
+}
+
/*
* Get an fsnotify notification event if one exists and is small
* enough to fit in "count". Return an error pointer if the count
- * is not large enough.
- *
- * Called with the group->notification_lock held.
+ * is not large enough. When permission event is dequeued, its state is
+ * updated accordingly.
*/
static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
size_t count)
{
- assert_spin_locked(&group->notification_lock);
+ size_t event_size = FAN_EVENT_METADATA_LEN;
+ struct fsnotify_event *fsn_event = NULL;
pr_debug("%s: group=%p count=%zd\n", __func__, group, count);
+ spin_lock(&group->notification_lock);
if (fsnotify_notify_queue_is_empty(group))
- return NULL;
+ goto out;
- if (FAN_EVENT_METADATA_LEN > count)
- return ERR_PTR(-EINVAL);
+ if (FAN_GROUP_FLAG(group, FAN_REPORT_FID)) {
+ event_size += fanotify_event_info_len(
+ FANOTIFY_E(fsnotify_peek_first_event(group)));
+ }
- /* held the notification_lock the whole time, so this is the
- * same event we peeked above */
- return fsnotify_remove_first_event(group);
+ if (event_size > count) {
+ fsn_event = ERR_PTR(-EINVAL);
+ goto out;
+ }
+ fsn_event = fsnotify_remove_first_event(group);
+ if (fanotify_is_perm_event(FANOTIFY_E(fsn_event)->mask))
+ FANOTIFY_PE(fsn_event)->state = FAN_EVENT_REPORTED;
+out:
+ spin_unlock(&group->notification_lock);
+ return fsn_event;
}
static int create_fd(struct fsnotify_group *group,
- struct fanotify_event_info *event,
+ struct fanotify_event *event,
struct file **file)
{
int client_fd;
@@ -114,62 +138,32 @@ static int create_fd(struct fsnotify_group *group,
return client_fd;
}
-static int fill_event_metadata(struct fsnotify_group *group,
- struct fanotify_event_metadata *metadata,
- struct fsnotify_event *fsn_event,
- struct file **file)
-{
- int ret = 0;
- struct fanotify_event_info *event;
-
- pr_debug("%s: group=%p metadata=%p event=%p\n", __func__,
- group, metadata, fsn_event);
-
- *file = NULL;
- event = container_of(fsn_event, struct fanotify_event_info, fse);
- metadata->event_len = FAN_EVENT_METADATA_LEN;
- metadata->metadata_len = FAN_EVENT_METADATA_LEN;
- metadata->vers = FANOTIFY_METADATA_VERSION;
- metadata->reserved = 0;
- metadata->mask = fsn_event->mask & FANOTIFY_OUTGOING_EVENTS;
- metadata->pid = pid_vnr(event->pid);
- if (unlikely(fsn_event->mask & FAN_Q_OVERFLOW))
- metadata->fd = FAN_NOFD;
- else {
- metadata->fd = create_fd(group, event, file);
- if (metadata->fd < 0)
- ret = metadata->fd;
- }
-
- return ret;
-}
-
-static struct fanotify_perm_event_info *dequeue_event(
- struct fsnotify_group *group, int fd)
+/*
+ * Finish processing of permission event by setting it to ANSWERED state and
+ * drop group->notification_lock.
+ */
+static void finish_permission_event(struct fsnotify_group *group,
+ struct fanotify_perm_event *event,
+ unsigned int response)
+ __releases(&group->notification_lock)
{
- struct fanotify_perm_event_info *event, *return_e = NULL;
-
- spin_lock(&group->notification_lock);
- list_for_each_entry(event, &group->fanotify_data.access_list,
- fae.fse.list) {
- if (event->fd != fd)
- continue;
+ bool destroy = false;
- list_del_init(&event->fae.fse.list);
- return_e = event;
- break;
- }
+ assert_spin_locked(&group->notification_lock);
+ event->response = response;
+ if (event->state == FAN_EVENT_CANCELED)
+ destroy = true;
+ else
+ event->state = FAN_EVENT_ANSWERED;
spin_unlock(&group->notification_lock);
-
- pr_debug("%s: found return_re=%p\n", __func__, return_e);
-
- return return_e;
+ if (destroy)
+ fsnotify_destroy_event(group, &event->fae.fse);
}
static int process_access_response(struct fsnotify_group *group,
struct fanotify_response *response_struct)
{
- struct fanotify_perm_event_info *event;
+ struct fanotify_perm_event *event;
int fd = response_struct->fd;
int response = response_struct->response;
@@ -194,48 +188,115 @@ static int process_access_response(struct fsnotify_group *group,
if ((response & FAN_AUDIT) && !FAN_GROUP_FLAG(group, FAN_ENABLE_AUDIT))
return -EINVAL;
- event = dequeue_event(group, fd);
- if (!event)
- return -ENOENT;
+ spin_lock(&group->notification_lock);
+ list_for_each_entry(event, &group->fanotify_data.access_list,
+ fae.fse.list) {
+ if (event->fd != fd)
+ continue;
- event->response = response;
- wake_up(&group->fanotify_data.access_waitq);
+ list_del_init(&event->fae.fse.list);
+ finish_permission_event(group, event, response);
+ wake_up(&group->fanotify_data.access_waitq);
+ return 0;
+ }
+ spin_unlock(&group->notification_lock);
+
+ return -ENOENT;
+}
+
+static int copy_fid_to_user(struct fanotify_event *event, char __user *buf)
+{
+ struct fanotify_event_info_fid info = { };
+ struct file_handle handle = { };
+ size_t fh_len = event->fh_len;
+ size_t len = fanotify_event_info_len(event);
+
+ if (!len)
+ return 0;
+
+ if (WARN_ON_ONCE(len < sizeof(info) + sizeof(handle) + fh_len))
+ return -EFAULT;
+
+ /* Copy event info fid header followed by vaiable sized file handle */
+ info.hdr.info_type = FAN_EVENT_INFO_TYPE_FID;
+ info.hdr.len = len;
+ info.fsid = event->fid.fsid;
+ if (copy_to_user(buf, &info, sizeof(info)))
+ return -EFAULT;
+
+ buf += sizeof(info);
+ len -= sizeof(info);
+ handle.handle_type = event->fh_type;
+ handle.handle_bytes = fh_len;
+ if (copy_to_user(buf, &handle, sizeof(handle)))
+ return -EFAULT;
+
+ buf += sizeof(handle);
+ len -= sizeof(handle);
+ if (copy_to_user(buf, fanotify_event_fh(event), fh_len))
+ return -EFAULT;
+
+ /* Pad with 0's */
+ buf += fh_len;
+ len -= fh_len;
+ WARN_ON_ONCE(len < 0 || len >= FANOTIFY_EVENT_ALIGN);
+ if (len > 0 && clear_user(buf, len))
+ return -EFAULT;
return 0;
}
static ssize_t copy_event_to_user(struct fsnotify_group *group,
- struct fsnotify_event *event,
+ struct fsnotify_event *fsn_event,
char __user *buf, size_t count)
{
- struct fanotify_event_metadata fanotify_event_metadata;
- struct file *f;
- int fd, ret;
-
- pr_debug("%s: group=%p event=%p\n", __func__, group, event);
-
- ret = fill_event_metadata(group, &fanotify_event_metadata, event, &f);
- if (ret < 0)
- return ret;
+ struct fanotify_event_metadata metadata;
+ struct fanotify_event *event;
+ struct file *f = NULL;
+ int ret, fd = FAN_NOFD;
+
+ pr_debug("%s: group=%p event=%p\n", __func__, group, fsn_event);
+
+ event = container_of(fsn_event, struct fanotify_event, fse);
+ metadata.event_len = FAN_EVENT_METADATA_LEN;
+ metadata.metadata_len = FAN_EVENT_METADATA_LEN;
+ metadata.vers = FANOTIFY_METADATA_VERSION;
+ metadata.reserved = 0;
+ metadata.mask = event->mask & FANOTIFY_OUTGOING_EVENTS;
+ metadata.pid = pid_vnr(event->pid);
+
+ if (fanotify_event_has_path(event)) {
+ fd = create_fd(group, event, &f);
+ if (fd < 0)
+ return fd;
+ } else if (fanotify_event_has_fid(event)) {
+ metadata.event_len += fanotify_event_info_len(event);
+ }
+ metadata.fd = fd;
- fd = fanotify_event_metadata.fd;
ret = -EFAULT;
/*
* Sanity check copy size in case get_one_event() and
* fill_event_metadata() event_len sizes ever get out of sync.
*/
- if (WARN_ON_ONCE(fanotify_event_metadata.event_len > count))
+ if (WARN_ON_ONCE(metadata.event_len > count))
goto out_close_fd;
- if (copy_to_user(buf, &fanotify_event_metadata,
- fanotify_event_metadata.event_len))
+
+ if (copy_to_user(buf, &metadata, FAN_EVENT_METADATA_LEN))
goto out_close_fd;
if (fanotify_is_perm_event(event->mask))
- FANOTIFY_PE(event)->fd = fd;
+ FANOTIFY_PE(fsn_event)->fd = fd;
- if (fd != FAN_NOFD)
+ if (fanotify_event_has_path(event)) {
fd_install(fd, f);
- return fanotify_event_metadata.event_len;
+ } else if (fanotify_event_has_fid(event)) {
+ ret = copy_fid_to_user(event, buf + FAN_EVENT_METADATA_LEN);
+ if (ret < 0)
+ return ret;
+ }
+
+ return metadata.event_len;
out_close_fd:
if (fd != FAN_NOFD) {
@@ -276,10 +337,7 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
add_wait_queue(&group->notification_waitq, &wait);
while (1) {
- spin_lock(&group->notification_lock);
kevent = get_one_event(group, count);
- spin_unlock(&group->notification_lock);
-
if (IS_ERR(kevent)) {
ret = PTR_ERR(kevent);
break;
@@ -316,11 +374,13 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
* Permission events get queued to wait for response. Other
* events can be destroyed now.
*/
- if (!fanotify_is_perm_event(kevent->mask)) {
+ if (!fanotify_is_perm_event(FANOTIFY_E(kevent)->mask)) {
fsnotify_destroy_event(group, kevent);
} else {
if (ret <= 0) {
- FANOTIFY_PE(kevent)->response = FAN_DENY;
+ spin_lock(&group->notification_lock);
+ finish_permission_event(group,
+ FANOTIFY_PE(kevent), FAN_DENY);
wake_up(&group->fanotify_data.access_waitq);
} else {
spin_lock(&group->notification_lock);
@@ -370,7 +430,7 @@ static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t
static int fanotify_release(struct inode *ignored, struct file *file)
{
struct fsnotify_group *group = file->private_data;
- struct fanotify_perm_event_info *event, *next;
+ struct fanotify_perm_event *event;
struct fsnotify_event *fsn_event;
/*
@@ -385,13 +445,12 @@ static int fanotify_release(struct inode *ignored, struct file *file)
* and simulate reply from userspace.
*/
spin_lock(&group->notification_lock);
- list_for_each_entry_safe(event, next, &group->fanotify_data.access_list,
- fae.fse.list) {
- pr_debug("%s: found group=%p event=%p\n", __func__, group,
- event);
-
+ while (!list_empty(&group->fanotify_data.access_list)) {
+ event = list_first_entry(&group->fanotify_data.access_list,
+ struct fanotify_perm_event, fae.fse.list);
list_del_init(&event->fae.fse.list);
- event->response = FAN_ALLOW;
+ finish_permission_event(group, event, FAN_ALLOW);
+ spin_lock(&group->notification_lock);
}
/*
@@ -401,13 +460,14 @@ static int fanotify_release(struct inode *ignored, struct file *file)
*/
while (!fsnotify_notify_queue_is_empty(group)) {
fsn_event = fsnotify_remove_first_event(group);
- if (!(fsn_event->mask & FANOTIFY_PERM_EVENTS)) {
+ if (!(FANOTIFY_E(fsn_event)->mask & FANOTIFY_PERM_EVENTS)) {
spin_unlock(&group->notification_lock);
fsnotify_destroy_event(group, fsn_event);
- spin_lock(&group->notification_lock);
} else {
- FANOTIFY_PE(fsn_event)->response = FAN_ALLOW;
+ finish_permission_event(group, FANOTIFY_PE(fsn_event),
+ FAN_ALLOW);
}
+ spin_lock(&group->notification_lock);
}
spin_unlock(&group->notification_lock);
@@ -598,7 +658,8 @@ static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
fsnotify_connp_t *connp,
- unsigned int type)
+ unsigned int type,
+ __kernel_fsid_t *fsid)
{
struct fsnotify_mark *mark;
int ret;
@@ -611,7 +672,7 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
return ERR_PTR(-ENOMEM);
fsnotify_init_mark(mark, group);
- ret = fsnotify_add_mark_locked(mark, connp, type, 0);
+ ret = fsnotify_add_mark_locked(mark, connp, type, 0, fsid);
if (ret) {
fsnotify_put_mark(mark);
return ERR_PTR(ret);
@@ -623,7 +684,8 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
static int fanotify_add_mark(struct fsnotify_group *group,
fsnotify_connp_t *connp, unsigned int type,
- __u32 mask, unsigned int flags)
+ __u32 mask, unsigned int flags,
+ __kernel_fsid_t *fsid)
{
struct fsnotify_mark *fsn_mark;
__u32 added;
@@ -631,7 +693,7 @@ static int fanotify_add_mark(struct fsnotify_group *group,
mutex_lock(&group->mark_mutex);
fsn_mark = fsnotify_find_mark(connp, group);
if (!fsn_mark) {
- fsn_mark = fanotify_add_new_mark(group, connp, type);
+ fsn_mark = fanotify_add_new_mark(group, connp, type, fsid);
if (IS_ERR(fsn_mark)) {
mutex_unlock(&group->mark_mutex);
return PTR_ERR(fsn_mark);
@@ -648,23 +710,23 @@ static int fanotify_add_mark(struct fsnotify_group *group,
static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
struct vfsmount *mnt, __u32 mask,
- unsigned int flags)
+ unsigned int flags, __kernel_fsid_t *fsid)
{
return fanotify_add_mark(group, &real_mount(mnt)->mnt_fsnotify_marks,
- FSNOTIFY_OBJ_TYPE_VFSMOUNT, mask, flags);
+ FSNOTIFY_OBJ_TYPE_VFSMOUNT, mask, flags, fsid);
}
static int fanotify_add_sb_mark(struct fsnotify_group *group,
- struct super_block *sb, __u32 mask,
- unsigned int flags)
+ struct super_block *sb, __u32 mask,
+ unsigned int flags, __kernel_fsid_t *fsid)
{
return fanotify_add_mark(group, &sb->s_fsnotify_marks,
- FSNOTIFY_OBJ_TYPE_SB, mask, flags);
+ FSNOTIFY_OBJ_TYPE_SB, mask, flags, fsid);
}
static int fanotify_add_inode_mark(struct fsnotify_group *group,
struct inode *inode, __u32 mask,
- unsigned int flags)
+ unsigned int flags, __kernel_fsid_t *fsid)
{
pr_debug("%s: group=%p inode=%p\n", __func__, group, inode);
@@ -679,7 +741,7 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
return 0;
return fanotify_add_mark(group, &inode->i_fsnotify_marks,
- FSNOTIFY_OBJ_TYPE_INODE, mask, flags);
+ FSNOTIFY_OBJ_TYPE_INODE, mask, flags, fsid);
}
/* fanotify syscalls */
@@ -688,7 +750,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
struct fsnotify_group *group;
int f_flags, fd;
struct user_struct *user;
- struct fanotify_event_info *oevent;
+ struct fanotify_event *oevent;
pr_debug("%s: flags=%x event_f_flags=%x\n",
__func__, flags, event_f_flags);
@@ -715,6 +777,10 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
return -EINVAL;
}
+ if ((flags & FAN_REPORT_FID) &&
+ (flags & FANOTIFY_CLASS_BITS) != FAN_CLASS_NOTIF)
+ return -EINVAL;
+
user = get_current_user();
if (atomic_read(&user->fanotify_listeners) > FANOTIFY_DEFAULT_MAX_LISTENERS) {
free_uid(user);
@@ -739,7 +805,8 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
atomic_inc(&user->fanotify_listeners);
group->memcg = get_mem_cgroup_from_mm(current->mm);
- oevent = fanotify_alloc_event(group, NULL, FS_Q_OVERFLOW, NULL);
+ oevent = fanotify_alloc_event(group, NULL, FS_Q_OVERFLOW, NULL,
+ FSNOTIFY_EVENT_NONE, NULL);
if (unlikely(!oevent)) {
fd = -ENOMEM;
goto out_destroy_group;
@@ -801,6 +868,48 @@ out_destroy_group:
return fd;
}
+/* Check if filesystem can encode a unique fid */
+static int fanotify_test_fid(struct path *path, __kernel_fsid_t *fsid)
+{
+ __kernel_fsid_t root_fsid;
+ int err;
+
+ /*
+ * Make sure path is not in filesystem with zero fsid (e.g. tmpfs).
+ */
+ err = vfs_get_fsid(path->dentry, fsid);
+ if (err)
+ return err;
+
+ if (!fsid->val[0] && !fsid->val[1])
+ return -ENODEV;
+
+ /*
+ * Make sure path is not inside a filesystem subvolume (e.g. btrfs)
+ * which uses a different fsid than sb root.
+ */
+ err = vfs_get_fsid(path->dentry->d_sb->s_root, &root_fsid);
+ if (err)
+ return err;
+
+ if (root_fsid.val[0] != fsid->val[0] ||
+ root_fsid.val[1] != fsid->val[1])
+ return -EXDEV;
+
+ /*
+ * We need to make sure that the file system supports at least
+ * encoding a file handle so user can use name_to_handle_at() to
+ * compare fid returned with event to the file handle of watched
+ * objects. However, name_to_handle_at() requires that the
+ * filesystem also supports decoding file handles.
+ */
+ if (!path->dentry->d_sb->s_export_op ||
+ !path->dentry->d_sb->s_export_op->fh_to_dentry)
+ return -EOPNOTSUPP;
+
+ return 0;
+}
+
static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
int dfd, const char __user *pathname)
{
@@ -809,6 +918,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
struct fsnotify_group *group;
struct fd f;
struct path path;
+ __kernel_fsid_t __fsid, *fsid = NULL;
u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS;
unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
int ret;
@@ -871,6 +981,18 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
group->priority == FS_PRIO_0)
goto fput_and_out;
+ /*
+ * Events with data type inode do not carry enough information to report
+ * event->fd, so we do not allow setting a mask for inode events unless
+ * group supports reporting fid.
+ * inode events are not supported on a mount mark, because they do not
+ * carry enough information (i.e. path) to be filtered by mount point.
+ */
+ if (mask & FANOTIFY_INODE_EVENTS &&
+ (!FAN_GROUP_FLAG(group, FAN_REPORT_FID) ||
+ mark_type == FAN_MARK_MOUNT))
+ goto fput_and_out;
+
if (flags & FAN_MARK_FLUSH) {
ret = 0;
if (mark_type == FAN_MARK_MOUNT)
@@ -886,6 +1008,14 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
if (ret)
goto fput_and_out;
+ if (FAN_GROUP_FLAG(group, FAN_REPORT_FID)) {
+ ret = fanotify_test_fid(&path, &__fsid);
+ if (ret)
+ goto path_put_and_out;
+
+ fsid = &__fsid;
+ }
+
/* inode held in place by reference to path; group by fget on fd */
if (mark_type == FAN_MARK_INODE)
inode = path.dentry->d_inode;
@@ -896,24 +1026,31 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE)) {
case FAN_MARK_ADD:
if (mark_type == FAN_MARK_MOUNT)
- ret = fanotify_add_vfsmount_mark(group, mnt, mask, flags);
+ ret = fanotify_add_vfsmount_mark(group, mnt, mask,
+ flags, fsid);
else if (mark_type == FAN_MARK_FILESYSTEM)
- ret = fanotify_add_sb_mark(group, mnt->mnt_sb, mask, flags);
+ ret = fanotify_add_sb_mark(group, mnt->mnt_sb, mask,
+ flags, fsid);
else
- ret = fanotify_add_inode_mark(group, inode, mask, flags);
+ ret = fanotify_add_inode_mark(group, inode, mask,
+ flags, fsid);
break;
case FAN_MARK_REMOVE:
if (mark_type == FAN_MARK_MOUNT)
- ret = fanotify_remove_vfsmount_mark(group, mnt, mask, flags);
+ ret = fanotify_remove_vfsmount_mark(group, mnt, mask,
+ flags);
else if (mark_type == FAN_MARK_FILESYSTEM)
- ret = fanotify_remove_sb_mark(group, mnt->mnt_sb, mask, flags);
+ ret = fanotify_remove_sb_mark(group, mnt->mnt_sb, mask,
+ flags);
else
- ret = fanotify_remove_inode_mark(group, inode, mask, flags);
+ ret = fanotify_remove_inode_mark(group, inode, mask,
+ flags);
break;
default:
ret = -EINVAL;
}
+path_put_and_out:
path_put(&path);
fput_and_out:
fdput(f);
@@ -950,15 +1087,15 @@ COMPAT_SYSCALL_DEFINE6(fanotify_mark,
*/
static int __init fanotify_user_setup(void)
{
- BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 7);
+ BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 8);
BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9);
fanotify_mark_cache = KMEM_CACHE(fsnotify_mark,
SLAB_PANIC|SLAB_ACCOUNT);
- fanotify_event_cachep = KMEM_CACHE(fanotify_event_info, SLAB_PANIC);
+ fanotify_event_cachep = KMEM_CACHE(fanotify_event, SLAB_PANIC);
if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) {
fanotify_perm_event_cachep =
- KMEM_CACHE(fanotify_perm_event_info, SLAB_PANIC);
+ KMEM_CACHE(fanotify_perm_event, SLAB_PANIC);
}
return 0;
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index ecf09b6243d9..df06f3da166c 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -328,16 +328,15 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is,
const unsigned char *file_name, u32 cookie)
{
struct fsnotify_iter_info iter_info = {};
- struct super_block *sb = NULL;
+ struct super_block *sb = to_tell->i_sb;
struct mount *mnt = NULL;
- __u32 mnt_or_sb_mask = 0;
+ __u32 mnt_or_sb_mask = sb->s_fsnotify_mask;
int ret = 0;
__u32 test_mask = (mask & ALL_FSNOTIFY_EVENTS);
if (data_is == FSNOTIFY_EVENT_PATH) {
mnt = real_mount(((const struct path *)data)->mnt);
- sb = mnt->mnt.mnt_sb;
- mnt_or_sb_mask = mnt->mnt_fsnotify_mask | sb->s_fsnotify_mask;
+ mnt_or_sb_mask |= mnt->mnt_fsnotify_mask;
}
/* An event "on child" is not intended for a mount/sb mark */
if (mask & FS_EVENT_ON_CHILD)
@@ -350,8 +349,8 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is,
* SRCU because we have no references to any objects and do not
* need SRCU to keep them "alive".
*/
- if (!to_tell->i_fsnotify_marks &&
- (!mnt || (!mnt->mnt_fsnotify_marks && !sb->s_fsnotify_marks)))
+ if (!to_tell->i_fsnotify_marks && !sb->s_fsnotify_marks &&
+ (!mnt || !mnt->mnt_fsnotify_marks))
return 0;
/*
* if this is a modify event we may need to clear the ignored masks
@@ -366,11 +365,11 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is,
iter_info.marks[FSNOTIFY_OBJ_TYPE_INODE] =
fsnotify_first_mark(&to_tell->i_fsnotify_marks);
+ iter_info.marks[FSNOTIFY_OBJ_TYPE_SB] =
+ fsnotify_first_mark(&sb->s_fsnotify_marks);
if (mnt) {
iter_info.marks[FSNOTIFY_OBJ_TYPE_VFSMOUNT] =
fsnotify_first_mark(&mnt->mnt_fsnotify_marks);
- iter_info.marks[FSNOTIFY_OBJ_TYPE_SB] =
- fsnotify_first_mark(&sb->s_fsnotify_marks);
}
/*
diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h
index 7e4578d35b61..74ae60305189 100644
--- a/fs/notify/inotify/inotify.h
+++ b/fs/notify/inotify/inotify.h
@@ -5,6 +5,7 @@
struct inotify_event_info {
struct fsnotify_event fse;
+ u32 mask;
int wd;
u32 sync_cookie;
int name_len;
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index f4184b4f3815..ff30abd6a49b 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -43,11 +43,11 @@ static bool event_compare(struct fsnotify_event *old_fsn,
{
struct inotify_event_info *old, *new;
- if (old_fsn->mask & FS_IN_IGNORED)
- return false;
old = INOTIFY_E(old_fsn);
new = INOTIFY_E(new_fsn);
- if ((old_fsn->mask == new_fsn->mask) &&
+ if (old->mask & FS_IN_IGNORED)
+ return false;
+ if ((old->mask == new->mask) &&
(old_fsn->inode == new_fsn->inode) &&
(old->name_len == new->name_len) &&
(!old->name_len || !strcmp(old->name, new->name)))
@@ -113,8 +113,18 @@ int inotify_handle_event(struct fsnotify_group *group,
return -ENOMEM;
}
+ /*
+ * We now report FS_ISDIR flag with MOVE_SELF and DELETE_SELF events
+ * for fanotify. inotify never reported IN_ISDIR with those events.
+ * It looks like an oversight, but to avoid the risk of breaking
+ * existing inotify programs, mask the flag out from those events.
+ */
+ if (mask & (IN_MOVE_SELF | IN_DELETE_SELF))
+ mask &= ~IN_ISDIR;
+
fsn_event = &event->fse;
- fsnotify_init_event(fsn_event, inode, mask);
+ fsnotify_init_event(fsn_event, inode);
+ event->mask = mask;
event->wd = i_mark->wd;
event->sync_cookie = cookie;
event->name_len = len;
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 798f1253141a..e2901fbb9f76 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -189,7 +189,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
*/
pad_name_len = round_event_name_len(fsn_event);
inotify_event.len = pad_name_len;
- inotify_event.mask = inotify_mask_to_arg(fsn_event->mask);
+ inotify_event.mask = inotify_mask_to_arg(event->mask);
inotify_event.wd = event->wd;
inotify_event.cookie = event->sync_cookie;
@@ -634,7 +634,8 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events)
return ERR_PTR(-ENOMEM);
}
group->overflow_event = &oevent->fse;
- fsnotify_init_event(group->overflow_event, NULL, FS_Q_OVERFLOW);
+ fsnotify_init_event(group->overflow_event, NULL);
+ oevent->mask = FS_Q_OVERFLOW;
oevent->wd = -1;
oevent->sync_cookie = 0;
oevent->name_len = 0;
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index d2dd16cb5989..d593d4269561 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -82,6 +82,7 @@
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/srcu.h>
+#include <linux/ratelimit.h>
#include <linux/atomic.h>
@@ -481,7 +482,8 @@ int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b)
}
static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
- unsigned int type)
+ unsigned int type,
+ __kernel_fsid_t *fsid)
{
struct inode *inode = NULL;
struct fsnotify_mark_connector *conn;
@@ -493,6 +495,11 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
INIT_HLIST_HEAD(&conn->list);
conn->type = type;
conn->obj = connp;
+ /* Cache fsid of filesystem containing the object */
+ if (fsid)
+ conn->fsid = *fsid;
+ else
+ conn->fsid.val[0] = conn->fsid.val[1] = 0;
if (conn->type == FSNOTIFY_OBJ_TYPE_INODE)
inode = igrab(fsnotify_conn_inode(conn));
/*
@@ -544,7 +551,7 @@ out:
*/
static int fsnotify_add_mark_list(struct fsnotify_mark *mark,
fsnotify_connp_t *connp, unsigned int type,
- int allow_dups)
+ int allow_dups, __kernel_fsid_t *fsid)
{
struct fsnotify_mark *lmark, *last = NULL;
struct fsnotify_mark_connector *conn;
@@ -553,15 +560,36 @@ static int fsnotify_add_mark_list(struct fsnotify_mark *mark,
if (WARN_ON(!fsnotify_valid_obj_type(type)))
return -EINVAL;
+
+ /* Backend is expected to check for zero fsid (e.g. tmpfs) */
+ if (fsid && WARN_ON_ONCE(!fsid->val[0] && !fsid->val[1]))
+ return -ENODEV;
+
restart:
spin_lock(&mark->lock);
conn = fsnotify_grab_connector(connp);
if (!conn) {
spin_unlock(&mark->lock);
- err = fsnotify_attach_connector_to_object(connp, type);
+ err = fsnotify_attach_connector_to_object(connp, type, fsid);
if (err)
return err;
goto restart;
+ } else if (fsid && (conn->fsid.val[0] || conn->fsid.val[1]) &&
+ (fsid->val[0] != conn->fsid.val[0] ||
+ fsid->val[1] != conn->fsid.val[1])) {
+ /*
+ * Backend is expected to check for non uniform fsid
+ * (e.g. btrfs), but maybe we missed something?
+ * Only allow setting conn->fsid once to non zero fsid.
+ * inotify and non-fid fanotify groups do not set nor test
+ * conn->fsid.
+ */
+ pr_warn_ratelimited("%s: fsid mismatch on object of type %u: "
+ "%x.%x != %x.%x\n", __func__, conn->type,
+ fsid->val[0], fsid->val[1],
+ conn->fsid.val[0], conn->fsid.val[1]);
+ err = -EXDEV;
+ goto out_err;
}
/* is mark the first mark? */
@@ -606,7 +634,7 @@ out_err:
*/
int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
fsnotify_connp_t *connp, unsigned int type,
- int allow_dups)
+ int allow_dups, __kernel_fsid_t *fsid)
{
struct fsnotify_group *group = mark->group;
int ret = 0;
@@ -627,7 +655,7 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
fsnotify_get_mark(mark); /* for g_list */
spin_unlock(&mark->lock);
- ret = fsnotify_add_mark_list(mark, connp, type, allow_dups);
+ ret = fsnotify_add_mark_list(mark, connp, type, allow_dups, fsid);
if (ret)
goto err;
@@ -648,13 +676,13 @@ err:
}
int fsnotify_add_mark(struct fsnotify_mark *mark, fsnotify_connp_t *connp,
- unsigned int type, int allow_dups)
+ unsigned int type, int allow_dups, __kernel_fsid_t *fsid)
{
int ret;
struct fsnotify_group *group = mark->group;
mutex_lock(&group->mark_mutex);
- ret = fsnotify_add_mark_locked(mark, connp, type, allow_dups);
+ ret = fsnotify_add_mark_locked(mark, connp, type, allow_dups, fsid);
mutex_unlock(&group->mark_mutex);
return ret;
}
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index 3c3e36745f59..5f3a54d444b5 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -71,7 +71,7 @@ void fsnotify_destroy_event(struct fsnotify_group *group,
struct fsnotify_event *event)
{
/* Overflow events are per-group and we don't want to free them */
- if (!event || event->mask == FS_Q_OVERFLOW)
+ if (!event || event == group->overflow_event)
return;
/*
* If the event is still queued, we have a problem... Do an unreliable
@@ -141,6 +141,18 @@ queue:
return ret;
}
+void fsnotify_remove_queued_event(struct fsnotify_group *group,
+ struct fsnotify_event *event)
+{
+ assert_spin_locked(&group->notification_lock);
+ /*
+ * We need to init list head for the case of overflow event so that
+ * check in fsnotify_add_event() works
+ */
+ list_del_init(&event->list);
+ group->q_len--;
+}
+
/*
* Remove and return the first event from the notification list. It is the
* responsibility of the caller to destroy the obtained event
@@ -155,13 +167,7 @@ struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group)
event = list_first_entry(&group->notification_list,
struct fsnotify_event, list);
- /*
- * We need to init list head for the case of overflow event so that
- * check in fsnotify_add_event() works
- */
- list_del_init(&event->list);
- group->q_len--;
-
+ fsnotify_remove_queued_event(group, event);
return event;
}
@@ -194,23 +200,3 @@ void fsnotify_flush_notify(struct fsnotify_group *group)
}
spin_unlock(&group->notification_lock);
}
-
-/*
- * fsnotify_create_event - Allocate a new event which will be sent to each
- * group's handle_event function if the group was interested in this
- * particular event.
- *
- * @inode the inode which is supposed to receive the event (sometimes a
- * parent of the inode to which the event happened.
- * @mask what actually happened.
- * @data pointer to the object which was actually affected
- * @data_type flag indication if the data is a file, path, inode, nothing...
- * @name the filename, if available
- */
-void fsnotify_init_event(struct fsnotify_event *event, struct inode *inode,
- u32 mask)
-{
- INIT_LIST_HEAD(&event->list);
- event->inode = inode;
- event->mask = mask;
-}
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index d1cbb27808e2..6f0999015a44 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -7532,10 +7532,11 @@ static int ocfs2_trim_group(struct super_block *sb,
return count;
}
-int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
+static
+int ocfs2_trim_mainbm(struct super_block *sb, struct fstrim_range *range)
{
struct ocfs2_super *osb = OCFS2_SB(sb);
- u64 start, len, trimmed, first_group, last_group, group;
+ u64 start, len, trimmed = 0, first_group, last_group = 0, group = 0;
int ret, cnt;
u32 first_bit, last_bit, minlen;
struct buffer_head *main_bm_bh = NULL;
@@ -7543,7 +7544,6 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
struct buffer_head *gd_bh = NULL;
struct ocfs2_dinode *main_bm;
struct ocfs2_group_desc *gd = NULL;
- struct ocfs2_trim_fs_info info, *pinfo = NULL;
start = range->start >> osb->s_clustersize_bits;
len = range->len >> osb->s_clustersize_bits;
@@ -7552,6 +7552,9 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
if (minlen >= osb->bitmap_cpg || range->len < sb->s_blocksize)
return -EINVAL;
+ trace_ocfs2_trim_mainbm(start, len, minlen);
+
+next_group:
main_bm_inode = ocfs2_get_system_file_inode(osb,
GLOBAL_BITMAP_SYSTEM_INODE,
OCFS2_INVALID_SLOT);
@@ -7570,64 +7573,34 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
}
main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data;
- if (start >= le32_to_cpu(main_bm->i_clusters)) {
- ret = -EINVAL;
- goto out_unlock;
- }
-
- len = range->len >> osb->s_clustersize_bits;
- if (start + len > le32_to_cpu(main_bm->i_clusters))
- len = le32_to_cpu(main_bm->i_clusters) - start;
-
- trace_ocfs2_trim_fs(start, len, minlen);
-
- ocfs2_trim_fs_lock_res_init(osb);
- ret = ocfs2_trim_fs_lock(osb, NULL, 1);
- if (ret < 0) {
- if (ret != -EAGAIN) {
- mlog_errno(ret);
- ocfs2_trim_fs_lock_res_uninit(osb);
+ /*
+ * Do some check before trim the first group.
+ */
+ if (!group) {
+ if (start >= le32_to_cpu(main_bm->i_clusters)) {
+ ret = -EINVAL;
goto out_unlock;
}
- mlog(ML_NOTICE, "Wait for trim on device (%s) to "
- "finish, which is running from another node.\n",
- osb->dev_str);
- ret = ocfs2_trim_fs_lock(osb, &info, 0);
- if (ret < 0) {
- mlog_errno(ret);
- ocfs2_trim_fs_lock_res_uninit(osb);
- goto out_unlock;
- }
+ if (start + len > le32_to_cpu(main_bm->i_clusters))
+ len = le32_to_cpu(main_bm->i_clusters) - start;
- if (info.tf_valid && info.tf_success &&
- info.tf_start == start && info.tf_len == len &&
- info.tf_minlen == minlen) {
- /* Avoid sending duplicated trim to a shared device */
- mlog(ML_NOTICE, "The same trim on device (%s) was "
- "just done from node (%u), return.\n",
- osb->dev_str, info.tf_nodenum);
- range->len = info.tf_trimlen;
- goto out_trimunlock;
- }
+ /*
+ * Determine first and last group to examine based on
+ * start and len
+ */
+ first_group = ocfs2_which_cluster_group(main_bm_inode, start);
+ if (first_group == osb->first_cluster_group_blkno)
+ first_bit = start;
+ else
+ first_bit = start - ocfs2_blocks_to_clusters(sb,
+ first_group);
+ last_group = ocfs2_which_cluster_group(main_bm_inode,
+ start + len - 1);
+ group = first_group;
}
- info.tf_nodenum = osb->node_num;
- info.tf_start = start;
- info.tf_len = len;
- info.tf_minlen = minlen;
-
- /* Determine first and last group to examine based on start and len */
- first_group = ocfs2_which_cluster_group(main_bm_inode, start);
- if (first_group == osb->first_cluster_group_blkno)
- first_bit = start;
- else
- first_bit = start - ocfs2_blocks_to_clusters(sb, first_group);
- last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1);
- last_bit = osb->bitmap_cpg;
-
- trimmed = 0;
- for (group = first_group; group <= last_group;) {
+ do {
if (first_bit + len >= osb->bitmap_cpg)
last_bit = osb->bitmap_cpg;
else
@@ -7659,21 +7632,81 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
else
group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
- }
- range->len = trimmed * sb->s_blocksize;
+ } while (0);
- info.tf_trimlen = range->len;
- info.tf_success = (ret ? 0 : 1);
- pinfo = &info;
-out_trimunlock:
- ocfs2_trim_fs_unlock(osb, pinfo);
- ocfs2_trim_fs_lock_res_uninit(osb);
out_unlock:
ocfs2_inode_unlock(main_bm_inode, 0);
brelse(main_bm_bh);
+ main_bm_bh = NULL;
out_mutex:
inode_unlock(main_bm_inode);
iput(main_bm_inode);
+
+ /*
+ * If all the groups trim are not done or failed, but we should release
+ * main_bm related locks for avoiding the current IO starve, then go to
+ * trim the next group
+ */
+ if (ret >= 0 && group <= last_group)
+ goto next_group;
out:
+ range->len = trimmed * sb->s_blocksize;
+ return ret;
+}
+
+int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
+{
+ int ret;
+ struct ocfs2_super *osb = OCFS2_SB(sb);
+ struct ocfs2_trim_fs_info info, *pinfo = NULL;
+
+ ocfs2_trim_fs_lock_res_init(osb);
+
+ trace_ocfs2_trim_fs(range->start, range->len, range->minlen);
+
+ ret = ocfs2_trim_fs_lock(osb, NULL, 1);
+ if (ret < 0) {
+ if (ret != -EAGAIN) {
+ mlog_errno(ret);
+ ocfs2_trim_fs_lock_res_uninit(osb);
+ return ret;
+ }
+
+ mlog(ML_NOTICE, "Wait for trim on device (%s) to "
+ "finish, which is running from another node.\n",
+ osb->dev_str);
+ ret = ocfs2_trim_fs_lock(osb, &info, 0);
+ if (ret < 0) {
+ mlog_errno(ret);
+ ocfs2_trim_fs_lock_res_uninit(osb);
+ return ret;
+ }
+
+ if (info.tf_valid && info.tf_success &&
+ info.tf_start == range->start &&
+ info.tf_len == range->len &&
+ info.tf_minlen == range->minlen) {
+ /* Avoid sending duplicated trim to a shared device */
+ mlog(ML_NOTICE, "The same trim on device (%s) was "
+ "just done from node (%u), return.\n",
+ osb->dev_str, info.tf_nodenum);
+ range->len = info.tf_trimlen;
+ goto out;
+ }
+ }
+
+ info.tf_nodenum = osb->node_num;
+ info.tf_start = range->start;
+ info.tf_len = range->len;
+ info.tf_minlen = range->minlen;
+
+ ret = ocfs2_trim_mainbm(sb, range);
+
+ info.tf_trimlen = range->len;
+ info.tf_success = (ret < 0 ? 0 : 1);
+ pinfo = &info;
+out:
+ ocfs2_trim_fs_unlock(osb, pinfo);
+ ocfs2_trim_fs_lock_res_uninit(osb);
return ret;
}
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index 0e4166cc23a0..4ac775e32240 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -621,13 +621,15 @@ static void o2nm_node_group_drop_item(struct config_group *group,
struct o2nm_node *node = to_o2nm_node(item);
struct o2nm_cluster *cluster = to_o2nm_cluster(group->cg_item.ci_parent);
- o2net_disconnect_node(node);
+ if (cluster->cl_nodes[node->nd_num] == node) {
+ o2net_disconnect_node(node);
- if (cluster->cl_has_local &&
- (cluster->cl_local_node == node->nd_num)) {
- cluster->cl_has_local = 0;
- cluster->cl_local_node = O2NM_INVALID_NODE_NUM;
- o2net_stop_listening(node);
+ if (cluster->cl_has_local &&
+ (cluster->cl_local_node == node->nd_num)) {
+ cluster->cl_has_local = 0;
+ cluster->cl_local_node = O2NM_INVALID_NODE_NUM;
+ o2net_stop_listening(node);
+ }
}
/* XXX call into net to stop this node from trading messages */
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 7c835824247e..af405586c5b1 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -686,6 +686,9 @@ void ocfs2_trim_fs_lock_res_init(struct ocfs2_super *osb)
{
struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres;
+ /* Only one trimfs thread are allowed to work at the same time. */
+ mutex_lock(&osb->obs_trim_fs_mutex);
+
ocfs2_lock_res_init_once(lockres);
ocfs2_build_lock_name(OCFS2_LOCK_TYPE_TRIM_FS, 0, 0, lockres->l_name);
ocfs2_lock_res_init_common(osb, lockres, OCFS2_LOCK_TYPE_TRIM_FS,
@@ -698,6 +701,8 @@ void ocfs2_trim_fs_lock_res_uninit(struct ocfs2_super *osb)
ocfs2_simple_drop_lockres(osb, lockres);
ocfs2_lock_res_free(lockres);
+
+ mutex_unlock(&osb->obs_trim_fs_mutex);
}
static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res,
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 4f86ac0027b5..1f029fbe8b8d 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -407,6 +407,7 @@ struct ocfs2_super
struct ocfs2_lock_res osb_rename_lockres;
struct ocfs2_lock_res osb_nfs_sync_lockres;
struct ocfs2_lock_res osb_trim_fs_lockres;
+ struct mutex obs_trim_fs_mutex;
struct ocfs2_dlm_debug *osb_dlm_debug;
struct dentry *osb_debug_root;
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index 2ee76a90ba8f..dc4bce1649c1 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -712,6 +712,8 @@ TRACE_EVENT(ocfs2_trim_extent,
DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_trim_group);
+DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_mainbm);
+
DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_fs);
/* End of trace events for fs/ocfs2/alloc.c. */
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index d7407994f308..ea0756d83250 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -55,7 +55,7 @@ struct ocfs2_slot_info {
unsigned int si_blocks;
struct buffer_head **si_bh;
unsigned int si_num_slots;
- struct ocfs2_slot *si_slots;
+ struct ocfs2_slot si_slots[];
};
@@ -420,9 +420,7 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb)
struct inode *inode = NULL;
struct ocfs2_slot_info *si;
- si = kzalloc(sizeof(struct ocfs2_slot_info) +
- (sizeof(struct ocfs2_slot) * osb->max_slots),
- GFP_KERNEL);
+ si = kzalloc(struct_size(si, si_slots, osb->max_slots), GFP_KERNEL);
if (!si) {
status = -ENOMEM;
mlog_errno(status);
@@ -431,8 +429,6 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb)
si->si_extended = ocfs2_uses_extended_slot_map(osb);
si->si_num_slots = osb->max_slots;
- si->si_slots = (struct ocfs2_slot *)((char *)si +
- sizeof(struct ocfs2_slot_info));
inode = ocfs2_get_system_file_inode(osb, SLOT_MAP_SYSTEM_INODE,
OCFS2_INVALID_SLOT);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 3415e0b09398..96ae7cedd487 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1847,6 +1847,8 @@ static int ocfs2_mount_volume(struct super_block *sb)
if (ocfs2_is_hard_readonly(osb))
goto leave;
+ mutex_init(&osb->obs_trim_fs_mutex);
+
status = ocfs2_dlm_init(osb);
if (status < 0) {
mlog_errno(status);
diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c
index a5a2fe76568f..b094d3d79354 100644
--- a/fs/orangefs/file.c
+++ b/fs/orangefs/file.c
@@ -398,8 +398,6 @@ static ssize_t orangefs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter
loff_t pos = iocb->ki_pos;
ssize_t rc = 0;
- BUG_ON(iocb->private);
-
gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_file_read_iter\n");
orangefs_stats.reads++;
@@ -416,8 +414,6 @@ static ssize_t orangefs_file_write_iter(struct kiocb *iocb, struct iov_iter *ite
loff_t pos;
ssize_t rc;
- BUG_ON(iocb->private);
-
gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_file_write_iter\n");
inode_lock(file->f_mapping->host);
diff --git a/fs/pipe.c b/fs/pipe.c
index bdc5d3c0977d..51d5fd8840ab 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -140,8 +140,7 @@ static int anon_pipe_buf_steal(struct pipe_inode_info *pipe,
struct page *page = buf->page;
if (page_count(page) == 1) {
- if (memcg_kmem_enabled())
- memcg_kmem_uncharge(page, 0);
+ memcg_kmem_uncharge(page, 0);
__SetPageLocked(page);
return 0;
}
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 9d428d5a0ac8..2edbb657f859 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -343,28 +343,28 @@ static inline void task_seccomp(struct seq_file *m, struct task_struct *p)
#ifdef CONFIG_SECCOMP
seq_put_decimal_ull(m, "\nSeccomp:\t", p->seccomp.mode);
#endif
- seq_printf(m, "\nSpeculation_Store_Bypass:\t");
+ seq_puts(m, "\nSpeculation_Store_Bypass:\t");
switch (arch_prctl_spec_ctrl_get(p, PR_SPEC_STORE_BYPASS)) {
case -EINVAL:
- seq_printf(m, "unknown");
+ seq_puts(m, "unknown");
break;
case PR_SPEC_NOT_AFFECTED:
- seq_printf(m, "not vulnerable");
+ seq_puts(m, "not vulnerable");
break;
case PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE:
- seq_printf(m, "thread force mitigated");
+ seq_puts(m, "thread force mitigated");
break;
case PR_SPEC_PRCTL | PR_SPEC_DISABLE:
- seq_printf(m, "thread mitigated");
+ seq_puts(m, "thread mitigated");
break;
case PR_SPEC_PRCTL | PR_SPEC_ENABLE:
- seq_printf(m, "thread vulnerable");
+ seq_puts(m, "thread vulnerable");
break;
case PR_SPEC_DISABLE:
- seq_printf(m, "globally mitigated");
+ seq_puts(m, "globally mitigated");
break;
default:
- seq_printf(m, "vulnerable");
+ seq_puts(m, "vulnerable");
break;
}
seq_putc(m, '\n');
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 633a63462573..5ab1849971b4 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -140,9 +140,13 @@ struct pid_entry {
#define REG(NAME, MODE, fops) \
NOD(NAME, (S_IFREG|(MODE)), NULL, &fops, {})
#define ONE(NAME, MODE, show) \
- NOD(NAME, (S_IFREG|(MODE)), \
+ NOD(NAME, (S_IFREG|(MODE)), \
NULL, &proc_single_file_operations, \
{ .proc_show = show } )
+#define ATTR(LSM, NAME, MODE) \
+ NOD(NAME, (S_IFREG|(MODE)), \
+ NULL, &proc_pid_attr_operations, \
+ { .lsm = LSM })
/*
* Count the number of hardlinks for the pid_entry table, excluding the .
@@ -456,7 +460,7 @@ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
{
if (unlikely(!sched_info_on()))
- seq_printf(m, "0 0 0\n");
+ seq_puts(m, "0 0 0\n");
else
seq_printf(m, "%llu %llu %lu\n",
(unsigned long long)task->se.sum_exec_runtime,
@@ -1086,10 +1090,6 @@ static int __set_oom_adj(struct file *file, int oom_adj, bool legacy)
task_lock(p);
if (!p->vfork_done && process_shares_mm(p, mm)) {
- pr_info("updating oom_score_adj for %d (%s) from %d to %d because it shares mm with %d (%s). Report if this is unexpected.\n",
- task_pid_nr(p), p->comm,
- p->signal->oom_score_adj, oom_adj,
- task_pid_nr(task), task->comm);
p->signal->oom_score_adj = oom_adj;
if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE))
p->signal->oom_score_adj_min = (short)oom_adj;
@@ -1210,7 +1210,7 @@ static const struct file_operations proc_oom_score_adj_operations = {
.llseek = default_llseek,
};
-#ifdef CONFIG_AUDITSYSCALL
+#ifdef CONFIG_AUDIT
#define TMPBUFLEN 11
static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
size_t count, loff_t *ppos)
@@ -2525,7 +2525,7 @@ static ssize_t proc_pid_attr_read(struct file * file, char __user * buf,
if (!task)
return -ESRCH;
- length = security_getprocattr(task,
+ length = security_getprocattr(task, PROC_I(inode)->op.lsm,
(char*)file->f_path.dentry->d_name.name,
&p);
put_task_struct(task);
@@ -2574,7 +2574,9 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
if (rv < 0)
goto out_free;
- rv = security_setprocattr(file->f_path.dentry->d_name.name, page, count);
+ rv = security_setprocattr(PROC_I(inode)->op.lsm,
+ file->f_path.dentry->d_name.name, page,
+ count);
mutex_unlock(&current->signal->cred_guard_mutex);
out_free:
kfree(page);
@@ -2588,13 +2590,53 @@ static const struct file_operations proc_pid_attr_operations = {
.llseek = generic_file_llseek,
};
+#define LSM_DIR_OPS(LSM) \
+static int proc_##LSM##_attr_dir_iterate(struct file *filp, \
+ struct dir_context *ctx) \
+{ \
+ return proc_pident_readdir(filp, ctx, \
+ LSM##_attr_dir_stuff, \
+ ARRAY_SIZE(LSM##_attr_dir_stuff)); \
+} \
+\
+static const struct file_operations proc_##LSM##_attr_dir_ops = { \
+ .read = generic_read_dir, \
+ .iterate = proc_##LSM##_attr_dir_iterate, \
+ .llseek = default_llseek, \
+}; \
+\
+static struct dentry *proc_##LSM##_attr_dir_lookup(struct inode *dir, \
+ struct dentry *dentry, unsigned int flags) \
+{ \
+ return proc_pident_lookup(dir, dentry, \
+ LSM##_attr_dir_stuff, \
+ ARRAY_SIZE(LSM##_attr_dir_stuff)); \
+} \
+\
+static const struct inode_operations proc_##LSM##_attr_dir_inode_ops = { \
+ .lookup = proc_##LSM##_attr_dir_lookup, \
+ .getattr = pid_getattr, \
+ .setattr = proc_setattr, \
+}
+
+#ifdef CONFIG_SECURITY_SMACK
+static const struct pid_entry smack_attr_dir_stuff[] = {
+ ATTR("smack", "current", 0666),
+};
+LSM_DIR_OPS(smack);
+#endif
+
static const struct pid_entry attr_dir_stuff[] = {
- REG("current", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
- REG("prev", S_IRUGO, proc_pid_attr_operations),
- REG("exec", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
- REG("fscreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
- REG("keycreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
- REG("sockcreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
+ ATTR(NULL, "current", 0666),
+ ATTR(NULL, "prev", 0444),
+ ATTR(NULL, "exec", 0666),
+ ATTR(NULL, "fscreate", 0666),
+ ATTR(NULL, "keycreate", 0666),
+ ATTR(NULL, "sockcreate", 0666),
+#ifdef CONFIG_SECURITY_SMACK
+ DIR("smack", 0555,
+ proc_smack_attr_dir_inode_ops, proc_smack_attr_dir_ops),
+#endif
};
static int proc_attr_dir_readdir(struct file *file, struct dir_context *ctx)
@@ -3002,7 +3044,7 @@ static const struct pid_entry tgid_base_stuff[] = {
ONE("oom_score", S_IRUGO, proc_oom_score),
REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations),
REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
-#ifdef CONFIG_AUDITSYSCALL
+#ifdef CONFIG_AUDIT
REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
REG("sessionid", S_IRUGO, proc_sessionid_operations),
#endif
@@ -3165,7 +3207,7 @@ static struct dentry *proc_pid_instantiate(struct dentry * dentry,
return d_splice_alias(inode, dentry);
}
-struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
+struct dentry *proc_pid_lookup(struct dentry *dentry, unsigned int flags)
{
struct task_struct *task;
unsigned tgid;
@@ -3390,7 +3432,7 @@ static const struct pid_entry tid_base_stuff[] = {
ONE("oom_score", S_IRUGO, proc_oom_score),
REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations),
REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
-#ifdef CONFIG_AUDITSYSCALL
+#ifdef CONFIG_AUDIT
REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
REG("sessionid", S_IRUGO, proc_sessionid_operations),
#endif
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 95b14196f284..ea575375f210 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -82,6 +82,7 @@ union proc_op {
int (*proc_show)(struct seq_file *m,
struct pid_namespace *ns, struct pid *pid,
struct task_struct *task);
+ const char *lsm;
};
struct proc_inode {
@@ -162,7 +163,7 @@ extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struc
extern void pid_update_inode(struct task_struct *, struct inode *);
extern int pid_delete_dentry(const struct dentry *);
extern int proc_pid_readdir(struct file *, struct dir_context *);
-extern struct dentry *proc_pid_lookup(struct inode *, struct dentry *, unsigned int);
+struct dentry *proc_pid_lookup(struct dentry *, unsigned int);
extern loff_t mem_lseek(struct file *, loff_t, int);
/* Lookups */
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 40b05e0d4274..544d1ee15aee 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -152,8 +152,8 @@ u64 stable_page_flags(struct page *page)
else if (page_count(page) == 0 && is_free_buddy_page(page))
u |= 1 << KPF_BUDDY;
- if (PageBalloon(page))
- u |= 1 << KPF_BALLOON;
+ if (PageOffline(page))
+ u |= 1 << KPF_OFFLINE;
if (PageTable(page))
u |= 1 << KPF_PGTABLE;
diff --git a/fs/proc/root.c b/fs/proc/root.c
index f4b1a9d2eca6..621e6ec322ca 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -154,7 +154,7 @@ static int proc_root_getattr(const struct path *path, struct kstat *stat,
static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, unsigned int flags)
{
- if (!proc_pid_lookup(dir, dentry, flags))
+ if (!proc_pid_lookup(dentry, flags))
return NULL;
return proc_lookup(dir, dentry, flags);
diff --git a/fs/proc/self.c b/fs/proc/self.c
index 127265e5c55f..57c0a1047250 100644
--- a/fs/proc/self.c
+++ b/fs/proc/self.c
@@ -38,6 +38,7 @@ int proc_setup_self(struct super_block *s)
struct inode *root_inode = d_inode(s->s_root);
struct pid_namespace *ns = proc_pid_ns(root_inode);
struct dentry *self;
+ int ret = -ENOMEM;
inode_lock(root_inode);
self = d_alloc_name(s->s_root, "self");
@@ -51,20 +52,19 @@ int proc_setup_self(struct super_block *s)
inode->i_gid = GLOBAL_ROOT_GID;
inode->i_op = &proc_self_inode_operations;
d_add(self, inode);
+ ret = 0;
} else {
dput(self);
- self = ERR_PTR(-ENOMEM);
}
- } else {
- self = ERR_PTR(-ENOMEM);
}
inode_unlock(root_inode);
- if (IS_ERR(self)) {
+
+ if (ret)
pr_err("proc_fill_super: can't allocate /proc/self\n");
- return PTR_ERR(self);
- }
- ns->proc_self = self;
- return 0;
+ else
+ ns->proc_self = self;
+
+ return ret;
}
void __init proc_self_init(void)
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 535eda7857cf..80c305f206bb 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -23,21 +23,21 @@
#ifdef arch_idle_time
-static u64 get_idle_time(int cpu)
+static u64 get_idle_time(struct kernel_cpustat *kcs, int cpu)
{
u64 idle;
- idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE];
+ idle = kcs->cpustat[CPUTIME_IDLE];
if (cpu_online(cpu) && !nr_iowait_cpu(cpu))
idle += arch_idle_time(cpu);
return idle;
}
-static u64 get_iowait_time(int cpu)
+static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu)
{
u64 iowait;
- iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT];
+ iowait = kcs->cpustat[CPUTIME_IOWAIT];
if (cpu_online(cpu) && nr_iowait_cpu(cpu))
iowait += arch_idle_time(cpu);
return iowait;
@@ -45,7 +45,7 @@ static u64 get_iowait_time(int cpu)
#else
-static u64 get_idle_time(int cpu)
+static u64 get_idle_time(struct kernel_cpustat *kcs, int cpu)
{
u64 idle, idle_usecs = -1ULL;
@@ -54,14 +54,14 @@ static u64 get_idle_time(int cpu)
if (idle_usecs == -1ULL)
/* !NO_HZ or cpu offline so we can rely on cpustat.idle */
- idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE];
+ idle = kcs->cpustat[CPUTIME_IDLE];
else
idle = idle_usecs * NSEC_PER_USEC;
return idle;
}
-static u64 get_iowait_time(int cpu)
+static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu)
{
u64 iowait, iowait_usecs = -1ULL;
@@ -70,7 +70,7 @@ static u64 get_iowait_time(int cpu)
if (iowait_usecs == -1ULL)
/* !NO_HZ or cpu offline so we can rely on cpustat.iowait */
- iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT];
+ iowait = kcs->cpustat[CPUTIME_IOWAIT];
else
iowait = iowait_usecs * NSEC_PER_USEC;
@@ -79,6 +79,31 @@ static u64 get_iowait_time(int cpu)
#endif
+static void show_irq_gap(struct seq_file *p, unsigned int gap)
+{
+ static const char zeros[] = " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0";
+
+ while (gap > 0) {
+ unsigned int inc;
+
+ inc = min_t(unsigned int, gap, ARRAY_SIZE(zeros) / 2);
+ seq_write(p, zeros, 2 * inc);
+ gap -= inc;
+ }
+}
+
+static void show_all_irqs(struct seq_file *p)
+{
+ unsigned int i, next = 0;
+
+ for_each_active_irq(i) {
+ show_irq_gap(p, i - next);
+ seq_put_decimal_ull(p, " ", kstat_irqs_usr(i));
+ next = i + 1;
+ }
+ show_irq_gap(p, nr_irqs - next);
+}
+
static int show_stat(struct seq_file *p, void *v)
{
int i, j;
@@ -95,16 +120,18 @@ static int show_stat(struct seq_file *p, void *v)
getboottime64(&boottime);
for_each_possible_cpu(i) {
- user += kcpustat_cpu(i).cpustat[CPUTIME_USER];
- nice += kcpustat_cpu(i).cpustat[CPUTIME_NICE];
- system += kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM];
- idle += get_idle_time(i);
- iowait += get_iowait_time(i);
- irq += kcpustat_cpu(i).cpustat[CPUTIME_IRQ];
- softirq += kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ];
- steal += kcpustat_cpu(i).cpustat[CPUTIME_STEAL];
- guest += kcpustat_cpu(i).cpustat[CPUTIME_GUEST];
- guest_nice += kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE];
+ struct kernel_cpustat *kcs = &kcpustat_cpu(i);
+
+ user += kcs->cpustat[CPUTIME_USER];
+ nice += kcs->cpustat[CPUTIME_NICE];
+ system += kcs->cpustat[CPUTIME_SYSTEM];
+ idle += get_idle_time(kcs, i);
+ iowait += get_iowait_time(kcs, i);
+ irq += kcs->cpustat[CPUTIME_IRQ];
+ softirq += kcs->cpustat[CPUTIME_SOFTIRQ];
+ steal += kcs->cpustat[CPUTIME_STEAL];
+ guest += kcs->cpustat[CPUTIME_GUEST];
+ guest_nice += kcs->cpustat[CPUTIME_GUEST_NICE];
sum += kstat_cpu_irqs_sum(i);
sum += arch_irq_stat_cpu(i);
@@ -130,17 +157,19 @@ static int show_stat(struct seq_file *p, void *v)
seq_putc(p, '\n');
for_each_online_cpu(i) {
+ struct kernel_cpustat *kcs = &kcpustat_cpu(i);
+
/* Copy values here to work around gcc-2.95.3, gcc-2.96 */
- user = kcpustat_cpu(i).cpustat[CPUTIME_USER];
- nice = kcpustat_cpu(i).cpustat[CPUTIME_NICE];
- system = kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM];
- idle = get_idle_time(i);
- iowait = get_iowait_time(i);
- irq = kcpustat_cpu(i).cpustat[CPUTIME_IRQ];
- softirq = kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ];
- steal = kcpustat_cpu(i).cpustat[CPUTIME_STEAL];
- guest = kcpustat_cpu(i).cpustat[CPUTIME_GUEST];
- guest_nice = kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE];
+ user = kcs->cpustat[CPUTIME_USER];
+ nice = kcs->cpustat[CPUTIME_NICE];
+ system = kcs->cpustat[CPUTIME_SYSTEM];
+ idle = get_idle_time(kcs, i);
+ iowait = get_iowait_time(kcs, i);
+ irq = kcs->cpustat[CPUTIME_IRQ];
+ softirq = kcs->cpustat[CPUTIME_SOFTIRQ];
+ steal = kcs->cpustat[CPUTIME_STEAL];
+ guest = kcs->cpustat[CPUTIME_GUEST];
+ guest_nice = kcs->cpustat[CPUTIME_GUEST_NICE];
seq_printf(p, "cpu%d", i);
seq_put_decimal_ull(p, " ", nsec_to_clock_t(user));
seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice));
@@ -156,9 +185,7 @@ static int show_stat(struct seq_file *p, void *v)
}
seq_put_decimal_ull(p, "intr ", (unsigned long long)sum);
- /* sum again ? it could be updated? */
- for_each_irq_nr(j)
- seq_put_decimal_ull(p, " ", kstat_irqs_usr(j));
+ show_all_irqs(p);
seq_printf(p,
"\nctxt %llu\n"
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index f0ec9edab2f3..beccb0b1d57c 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -423,7 +423,7 @@ struct mem_size_stats {
};
static void smaps_account(struct mem_size_stats *mss, struct page *page,
- bool compound, bool young, bool dirty)
+ bool compound, bool young, bool dirty, bool locked)
{
int i, nr = compound ? 1 << compound_order(page) : 1;
unsigned long size = nr * PAGE_SIZE;
@@ -450,24 +450,31 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
else
mss->private_clean += size;
mss->pss += (u64)size << PSS_SHIFT;
+ if (locked)
+ mss->pss_locked += (u64)size << PSS_SHIFT;
return;
}
for (i = 0; i < nr; i++, page++) {
int mapcount = page_mapcount(page);
+ unsigned long pss = (PAGE_SIZE << PSS_SHIFT);
if (mapcount >= 2) {
if (dirty || PageDirty(page))
mss->shared_dirty += PAGE_SIZE;
else
mss->shared_clean += PAGE_SIZE;
- mss->pss += (PAGE_SIZE << PSS_SHIFT) / mapcount;
+ mss->pss += pss / mapcount;
+ if (locked)
+ mss->pss_locked += pss / mapcount;
} else {
if (dirty || PageDirty(page))
mss->private_dirty += PAGE_SIZE;
else
mss->private_clean += PAGE_SIZE;
- mss->pss += PAGE_SIZE << PSS_SHIFT;
+ mss->pss += pss;
+ if (locked)
+ mss->pss_locked += pss;
}
}
}
@@ -490,6 +497,7 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
{
struct mem_size_stats *mss = walk->private;
struct vm_area_struct *vma = walk->vma;
+ bool locked = !!(vma->vm_flags & VM_LOCKED);
struct page *page = NULL;
if (pte_present(*pte)) {
@@ -532,7 +540,7 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
if (!page)
return;
- smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte));
+ smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte), locked);
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -541,6 +549,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
{
struct mem_size_stats *mss = walk->private;
struct vm_area_struct *vma = walk->vma;
+ bool locked = !!(vma->vm_flags & VM_LOCKED);
struct page *page;
/* FOLL_DUMP will return -EFAULT on huge zero page */
@@ -555,7 +564,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
/* pass */;
else
VM_BUG_ON_PAGE(1, page);
- smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd));
+ smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd), locked);
}
#else
static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
@@ -737,11 +746,8 @@ static void smap_gather_stats(struct vm_area_struct *vma,
}
}
#endif
-
/* mmap_sem is held in m_start */
walk_page_vma(vma, &smaps_walk);
- if (vma->vm_flags & VM_LOCKED)
- mss->pss_locked += mss->pss;
}
#define SEQ_PUT_DEC(str, val) \
@@ -942,10 +948,12 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
pte_t ptent = *pte;
if (pte_present(ptent)) {
- ptent = ptep_modify_prot_start(vma->vm_mm, addr, pte);
- ptent = pte_wrprotect(ptent);
+ pte_t old_pte;
+
+ old_pte = ptep_modify_prot_start(vma, addr, pte);
+ ptent = pte_wrprotect(old_pte);
ptent = pte_clear_soft_dirty(ptent);
- ptep_modify_prot_commit(vma->vm_mm, addr, pte, ptent);
+ ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent);
} else if (is_swap_pte(ptent)) {
ptent = pte_swp_clear_soft_dirty(ptent);
set_pte_at(vma->vm_mm, addr, pte, ptent);
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 0b63d68dedb2..36bf0f2e102e 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -64,7 +64,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
else
bytes += kobjsize(current->files);
- if (current->sighand && atomic_read(&current->sighand->count) > 1)
+ if (current->sighand && refcount_read(&current->sighand->count) > 1)
sbytes += kobjsize(current->sighand);
else
bytes += kobjsize(current->sighand);
@@ -178,7 +178,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
seq_file_path(m, file, "");
} else if (mm && is_stack(vma)) {
seq_pad(m, ' ');
- seq_printf(m, "[stack]");
+ seq_puts(m, "[stack]");
}
seq_putc(m, '\n');
diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c
index b905010ca9eb..f61ae53533f5 100644
--- a/fs/proc/thread_self.c
+++ b/fs/proc/thread_self.c
@@ -38,6 +38,7 @@ int proc_setup_thread_self(struct super_block *s)
struct inode *root_inode = d_inode(s->s_root);
struct pid_namespace *ns = proc_pid_ns(root_inode);
struct dentry *thread_self;
+ int ret = -ENOMEM;
inode_lock(root_inode);
thread_self = d_alloc_name(s->s_root, "thread-self");
@@ -51,20 +52,19 @@ int proc_setup_thread_self(struct super_block *s)
inode->i_gid = GLOBAL_ROOT_GID;
inode->i_op = &proc_thread_self_inode_operations;
d_add(thread_self, inode);
+ ret = 0;
} else {
dput(thread_self);
- thread_self = ERR_PTR(-ENOMEM);
}
- } else {
- thread_self = ERR_PTR(-ENOMEM);
}
inode_unlock(root_inode);
- if (IS_ERR(thread_self)) {
+
+ if (ret)
pr_err("proc_fill_super: can't allocate /proc/thread_self\n");
- return PTR_ERR(thread_self);
- }
- ns->proc_thread_self = thread_self;
- return 0;
+ else
+ ns->proc_thread_self = thread_self;
+
+ return ret;
}
void __init proc_thread_self_init(void)
diff --git a/fs/read_write.c b/fs/read_write.c
index ff3c5e6f87cf..30df848b7451 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -426,7 +426,7 @@ ssize_t kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
ssize_t result;
old_fs = get_fs();
- set_fs(get_ds());
+ set_fs(KERNEL_DS);
/* The cast to a user pointer is valid due to the set_fs() */
result = vfs_read(file, (void __user *)buf, count, pos);
set_fs(old_fs);
@@ -499,7 +499,7 @@ ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t
return -EINVAL;
old_fs = get_fs();
- set_fs(get_ds());
+ set_fs(KERNEL_DS);
p = (__force const char __user *)buf;
if (count > MAX_RW_COUNT)
count = MAX_RW_COUNT;
@@ -521,7 +521,7 @@ ssize_t kernel_write(struct file *file, const void *buf, size_t count,
ssize_t res;
old_fs = get_fs();
- set_fs(get_ds());
+ set_fs(KERNEL_DS);
/* The cast to a user pointer is valid due to the set_fs() */
res = vfs_write(file, (__force const char __user *)buf, count, pos);
set_fs(old_fs);
diff --git a/fs/select.c b/fs/select.c
index d0f35dbc0e8f..6cbc9ff56ba0 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -1379,7 +1379,7 @@ COMPAT_SYSCALL_DEFINE6(pselect6_time64, int, n, compat_ulong_t __user *, inp,
#if defined(CONFIG_COMPAT_32BIT_TIME)
-COMPAT_SYSCALL_DEFINE6(pselect6, int, n, compat_ulong_t __user *, inp,
+COMPAT_SYSCALL_DEFINE6(pselect6_time32, int, n, compat_ulong_t __user *, inp,
compat_ulong_t __user *, outp, compat_ulong_t __user *, exp,
struct old_timespec32 __user *, tsp, void __user *, sig)
{
@@ -1402,7 +1402,7 @@ COMPAT_SYSCALL_DEFINE6(pselect6, int, n, compat_ulong_t __user *, inp,
#endif
#if defined(CONFIG_COMPAT_32BIT_TIME)
-COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds,
+COMPAT_SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds,
unsigned int, nfds, struct old_timespec32 __user *, tsp,
const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize)
{
diff --git a/fs/splice.c b/fs/splice.c
index de2ede048473..6489fb9436e4 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -357,7 +357,7 @@ static ssize_t kernel_readv(struct file *file, const struct kvec *vec,
ssize_t res;
old_fs = get_fs();
- set_fs(get_ds());
+ set_fs(KERNEL_DS);
/* The cast to a user pointer is valid due to the set_fs() */
res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos, 0);
set_fs(old_fs);
@@ -1123,6 +1123,9 @@ static long do_splice(struct file *in, loff_t __user *off_in,
if (ipipe == opipe)
return -EINVAL;
+ if ((in->f_flags | out->f_flags) & O_NONBLOCK)
+ flags |= SPLICE_F_NONBLOCK;
+
return splice_pipe_to_pipe(ipipe, opipe, len, flags);
}
@@ -1148,6 +1151,9 @@ static long do_splice(struct file *in, loff_t __user *off_in,
if (unlikely(ret < 0))
return ret;
+ if (in->f_flags & O_NONBLOCK)
+ flags |= SPLICE_F_NONBLOCK;
+
file_start_write(out);
ret = do_splice_from(ipipe, out, &offset, len, flags);
file_end_write(out);
@@ -1172,6 +1178,9 @@ static long do_splice(struct file *in, loff_t __user *off_in,
offset = in->f_pos;
}
+ if (out->f_flags & O_NONBLOCK)
+ flags |= SPLICE_F_NONBLOCK;
+
pipe_lock(opipe);
ret = wait_for_space(opipe, flags);
if (!ret)
@@ -1717,6 +1726,9 @@ static long do_tee(struct file *in, struct file *out, size_t len,
* copying the data.
*/
if (ipipe && opipe && ipipe != opipe) {
+ if ((in->f_flags | out->f_flags) & O_NONBLOCK)
+ flags |= SPLICE_F_NONBLOCK;
+
/*
* Keep going, unless we encounter an error. The ipipe/opipe
* ordering doesn't really matter.
diff --git a/fs/statfs.c b/fs/statfs.c
index f0216629621d..eea7af6f2f22 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -67,6 +67,20 @@ static int statfs_by_dentry(struct dentry *dentry, struct kstatfs *buf)
return retval;
}
+int vfs_get_fsid(struct dentry *dentry, __kernel_fsid_t *fsid)
+{
+ struct kstatfs st;
+ int error;
+
+ error = statfs_by_dentry(dentry, &st);
+ if (error)
+ return error;
+
+ *fsid = st.f_fsid;
+ return 0;
+}
+EXPORT_SYMBOL(vfs_get_fsid);
+
int vfs_statfs(const struct path *path, struct kstatfs *buf)
{
int error;
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 51398457fe00..130fc6fbcc03 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -17,7 +17,6 @@
#include <linux/seq_file.h>
#include "sysfs.h"
-#include "../kernfs/kernfs-internal.h"
/*
* Determine ktype->sysfs_ops for the given kernfs_node. This function
@@ -497,6 +496,7 @@ bool sysfs_remove_file_self(struct kobject *kobj, const struct attribute *attr)
void sysfs_remove_files(struct kobject *kobj, const struct attribute * const *ptr)
{
int i;
+
for (i = 0; ptr[i]; i++)
sysfs_remove_file(kobj, ptr[i]);
}
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 803ca070d42e..6a6fc8aa1de7 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -560,7 +560,7 @@ SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct __kernel_itimerspec __user *,
}
#ifdef CONFIG_COMPAT_32BIT_TIME
-COMPAT_SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
+SYSCALL_DEFINE4(timerfd_settime32, int, ufd, int, flags,
const struct old_itimerspec32 __user *, utmr,
struct old_itimerspec32 __user *, otmr)
{
@@ -577,7 +577,7 @@ COMPAT_SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
return ret;
}
-COMPAT_SYSCALL_DEFINE2(timerfd_gettime, int, ufd,
+SYSCALL_DEFINE2(timerfd_gettime32, int, ufd,
struct old_itimerspec32 __user *, otmr)
{
struct itimerspec64 kotmr;
diff --git a/fs/udf/super.c b/fs/udf/super.c
index e3d684ea3203..ffd8038ff728 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -1474,6 +1474,17 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block,
if (lvd->integritySeqExt.extLength)
udf_load_logicalvolint(sb, leea_to_cpu(lvd->integritySeqExt));
ret = 0;
+
+ if (!sbi->s_lvid_bh) {
+ /* We can't generate unique IDs without a valid LVID */
+ if (sb_rdonly(sb)) {
+ UDF_SET_FLAG(sb, UDF_FLAG_RW_INCOMPAT);
+ } else {
+ udf_warn(sb, "Damaged or missing LVID, forcing "
+ "readonly mount\n");
+ ret = -EACCES;
+ }
+ }
out_bh:
brelse(bh);
return ret;
@@ -1943,13 +1954,24 @@ static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt,
return 0;
}
+static void udf_finalize_lvid(struct logicalVolIntegrityDesc *lvid)
+{
+ struct timespec64 ts;
+
+ ktime_get_real_ts64(&ts);
+ udf_time_to_disk_stamp(&lvid->recordingDateAndTime, ts);
+ lvid->descTag.descCRC = cpu_to_le16(
+ crc_itu_t(0, (char *)lvid + sizeof(struct tag),
+ le16_to_cpu(lvid->descTag.descCRCLength)));
+ lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
+}
+
static void udf_open_lvid(struct super_block *sb)
{
struct udf_sb_info *sbi = UDF_SB(sb);
struct buffer_head *bh = sbi->s_lvid_bh;
struct logicalVolIntegrityDesc *lvid;
struct logicalVolIntegrityDescImpUse *lvidiu;
- struct timespec64 ts;
if (!bh)
return;
@@ -1961,18 +1983,12 @@ static void udf_open_lvid(struct super_block *sb)
mutex_lock(&sbi->s_alloc_mutex);
lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
- ktime_get_real_ts64(&ts);
- udf_time_to_disk_stamp(&lvid->recordingDateAndTime, ts);
if (le32_to_cpu(lvid->integrityType) == LVID_INTEGRITY_TYPE_CLOSE)
lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_OPEN);
else
UDF_SET_FLAG(sb, UDF_FLAG_INCONSISTENT);
- lvid->descTag.descCRC = cpu_to_le16(
- crc_itu_t(0, (char *)lvid + sizeof(struct tag),
- le16_to_cpu(lvid->descTag.descCRCLength)));
-
- lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
+ udf_finalize_lvid(lvid);
mark_buffer_dirty(bh);
sbi->s_lvid_dirty = 0;
mutex_unlock(&sbi->s_alloc_mutex);
@@ -1986,7 +2002,6 @@ static void udf_close_lvid(struct super_block *sb)
struct buffer_head *bh = sbi->s_lvid_bh;
struct logicalVolIntegrityDesc *lvid;
struct logicalVolIntegrityDescImpUse *lvidiu;
- struct timespec64 ts;
if (!bh)
return;
@@ -1998,8 +2013,6 @@ static void udf_close_lvid(struct super_block *sb)
mutex_lock(&sbi->s_alloc_mutex);
lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
- ktime_get_real_ts64(&ts);
- udf_time_to_disk_stamp(&lvid->recordingDateAndTime, ts);
if (UDF_MAX_WRITE_VERSION > le16_to_cpu(lvidiu->maxUDFWriteRev))
lvidiu->maxUDFWriteRev = cpu_to_le16(UDF_MAX_WRITE_VERSION);
if (sbi->s_udfrev > le16_to_cpu(lvidiu->minUDFReadRev))
@@ -2009,17 +2022,13 @@ static void udf_close_lvid(struct super_block *sb)
if (!UDF_QUERY_FLAG(sb, UDF_FLAG_INCONSISTENT))
lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_CLOSE);
- lvid->descTag.descCRC = cpu_to_le16(
- crc_itu_t(0, (char *)lvid + sizeof(struct tag),
- le16_to_cpu(lvid->descTag.descCRCLength)));
-
- lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
/*
* We set buffer uptodate unconditionally here to avoid spurious
* warnings from mark_buffer_dirty() when previous EIO has marked
* the buffer as !uptodate
*/
set_buffer_uptodate(bh);
+ udf_finalize_lvid(lvid);
mark_buffer_dirty(bh);
sbi->s_lvid_dirty = 0;
mutex_unlock(&sbi->s_alloc_mutex);
@@ -2048,8 +2057,8 @@ u64 lvid_get_unique_id(struct super_block *sb)
if (!(++uniqueID & 0xFFFFFFFF))
uniqueID += 16;
lvhd->uniqueID = cpu_to_le64(uniqueID);
+ udf_updated_lvid(sb);
mutex_unlock(&sbi->s_alloc_mutex);
- mark_buffer_dirty(bh);
return ret;
}
@@ -2320,11 +2329,17 @@ static int udf_sync_fs(struct super_block *sb, int wait)
mutex_lock(&sbi->s_alloc_mutex);
if (sbi->s_lvid_dirty) {
+ struct buffer_head *bh = sbi->s_lvid_bh;
+ struct logicalVolIntegrityDesc *lvid;
+
+ lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
+ udf_finalize_lvid(lvid);
+
/*
* Blockdevice will be synced later so we don't have to submit
* the buffer for IO
*/
- mark_buffer_dirty(sbi->s_lvid_bh);
+ mark_buffer_dirty(bh);
sbi->s_lvid_dirty = 0;
}
mutex_unlock(&sbi->s_alloc_mutex);
diff --git a/fs/utimes.c b/fs/utimes.c
index bdcf2daf39c1..350c9c16ace1 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -224,8 +224,8 @@ SYSCALL_DEFINE2(utime, char __user *, filename, struct utimbuf __user *, times)
* of sys_utimes.
*/
#ifdef __ARCH_WANT_SYS_UTIME32
-COMPAT_SYSCALL_DEFINE2(utime, const char __user *, filename,
- struct old_utimbuf32 __user *, t)
+SYSCALL_DEFINE2(utime32, const char __user *, filename,
+ struct old_utimbuf32 __user *, t)
{
struct timespec64 tv[2];
@@ -240,7 +240,7 @@ COMPAT_SYSCALL_DEFINE2(utime, const char __user *, filename,
}
#endif
-COMPAT_SYSCALL_DEFINE4(utimensat, unsigned int, dfd, const char __user *, filename, struct old_timespec32 __user *, t, int, flags)
+SYSCALL_DEFINE4(utimensat_time32, unsigned int, dfd, const char __user *, filename, struct old_timespec32 __user *, t, int, flags)
{
struct timespec64 tv[2];
@@ -276,14 +276,14 @@ static long do_compat_futimesat(unsigned int dfd, const char __user *filename,
return do_utimes(dfd, filename, t ? tv : NULL, 0);
}
-COMPAT_SYSCALL_DEFINE3(futimesat, unsigned int, dfd,
+SYSCALL_DEFINE3(futimesat_time32, unsigned int, dfd,
const char __user *, filename,
struct old_timeval32 __user *, t)
{
return do_compat_futimesat(dfd, filename, t);
}
-COMPAT_SYSCALL_DEFINE2(utimes, const char __user *, filename, struct old_timeval32 __user *, t)
+SYSCALL_DEFINE2(utimes_time32, const char __user *, filename, struct old_timeval32 __user *, t)
{
return do_compat_futimesat(AT_FDCWD, filename, t);
}
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index 999ad8d00d43..1ef8acf35e7d 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -339,14 +339,14 @@ xfs_ag_init_headers(
{ /* BNO root block */
.daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_BNO_BLOCK(mp)),
.numblks = BTOBB(mp->m_sb.sb_blocksize),
- .ops = &xfs_allocbt_buf_ops,
+ .ops = &xfs_bnobt_buf_ops,
.work = &xfs_bnoroot_init,
.need_init = true
},
{ /* CNT root block */
.daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_CNT_BLOCK(mp)),
.numblks = BTOBB(mp->m_sb.sb_blocksize),
- .ops = &xfs_allocbt_buf_ops,
+ .ops = &xfs_cntbt_buf_ops,
.work = &xfs_cntroot_init,
.need_init = true
},
@@ -361,7 +361,7 @@ xfs_ag_init_headers(
{ /* FINO root block */
.daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_FIBT_BLOCK(mp)),
.numblks = BTOBB(mp->m_sb.sb_blocksize),
- .ops = &xfs_inobt_buf_ops,
+ .ops = &xfs_finobt_buf_ops,
.work = &xfs_btroot_init,
.type = XFS_BTNUM_FINO,
.need_init = xfs_sb_version_hasfinobt(&mp->m_sb)
diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c
index e701ebc36c06..e2ba2a3b63b2 100644
--- a/fs/xfs/libxfs/xfs_ag_resv.c
+++ b/fs/xfs/libxfs/xfs_ag_resv.c
@@ -281,7 +281,7 @@ xfs_ag_resv_init(
*/
ask = used = 0;
- mp->m_inotbt_nores = true;
+ mp->m_finobt_nores = true;
error = xfs_refcountbt_calc_reserves(mp, tp, agno, &ask,
&used);
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index b715668886a4..bc3367b8b7bb 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -568,9 +568,9 @@ xfs_agfl_verify(
if (!xfs_sb_version_hascrc(&mp->m_sb))
return NULL;
- if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid))
+ if (!xfs_verify_magic(bp, agfl->agfl_magicnum))
return __this_address;
- if (be32_to_cpu(agfl->agfl_magicnum) != XFS_AGFL_MAGIC)
+ if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
/*
* during growfs operations, the perag is not fully initialised,
@@ -643,6 +643,7 @@ xfs_agfl_write_verify(
const struct xfs_buf_ops xfs_agfl_buf_ops = {
.name = "xfs_agfl",
+ .magic = { cpu_to_be32(XFS_AGFL_MAGIC), cpu_to_be32(XFS_AGFL_MAGIC) },
.verify_read = xfs_agfl_read_verify,
.verify_write = xfs_agfl_write_verify,
.verify_struct = xfs_agfl_verify,
@@ -2587,8 +2588,10 @@ xfs_agf_verify(
return __this_address;
}
- if (!(agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) &&
- XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
+ if (!xfs_verify_magic(bp, agf->agf_magicnum))
+ return __this_address;
+
+ if (!(XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
be32_to_cpu(agf->agf_flfirst) < xfs_agfl_size(mp) &&
be32_to_cpu(agf->agf_fllast) < xfs_agfl_size(mp) &&
@@ -2670,6 +2673,7 @@ xfs_agf_write_verify(
const struct xfs_buf_ops xfs_agf_buf_ops = {
.name = "xfs_agf",
+ .magic = { cpu_to_be32(XFS_AGF_MAGIC), cpu_to_be32(XFS_AGF_MAGIC) },
.verify_read = xfs_agf_read_verify,
.verify_write = xfs_agf_write_verify,
.verify_struct = xfs_agf_verify,
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index 4e59cc8a2802..9fe949f6055e 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -297,48 +297,34 @@ xfs_allocbt_verify(
struct xfs_perag *pag = bp->b_pag;
xfs_failaddr_t fa;
unsigned int level;
+ xfs_btnum_t btnum = XFS_BTNUM_BNOi;
+
+ if (!xfs_verify_magic(bp, block->bb_magic))
+ return __this_address;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ fa = xfs_btree_sblock_v5hdr_verify(bp);
+ if (fa)
+ return fa;
+ }
/*
- * magic number and level verification
- *
- * During growfs operations, we can't verify the exact level or owner as
- * the perag is not fully initialised and hence not attached to the
- * buffer. In this case, check against the maximum tree depth.
+ * The perag may not be attached during grow operations or fully
+ * initialized from the AGF during log recovery. Therefore we can only
+ * check against maximum tree depth from those contexts.
*
- * Similarly, during log recovery we will have a perag structure
- * attached, but the agf information will not yet have been initialised
- * from the on disk AGF. Again, we can only check against maximum limits
- * in this case.
+ * Otherwise check against the per-tree limit. Peek at one of the
+ * verifier magic values to determine the type of tree we're verifying
+ * against.
*/
level = be16_to_cpu(block->bb_level);
- switch (block->bb_magic) {
- case cpu_to_be32(XFS_ABTB_CRC_MAGIC):
- fa = xfs_btree_sblock_v5hdr_verify(bp);
- if (fa)
- return fa;
- /* fall through */
- case cpu_to_be32(XFS_ABTB_MAGIC):
- if (pag && pag->pagf_init) {
- if (level >= pag->pagf_levels[XFS_BTNUM_BNOi])
- return __this_address;
- } else if (level >= mp->m_ag_maxlevels)
+ if (bp->b_ops->magic[0] == cpu_to_be32(XFS_ABTC_MAGIC))
+ btnum = XFS_BTNUM_CNTi;
+ if (pag && pag->pagf_init) {
+ if (level >= pag->pagf_levels[btnum])
return __this_address;
- break;
- case cpu_to_be32(XFS_ABTC_CRC_MAGIC):
- fa = xfs_btree_sblock_v5hdr_verify(bp);
- if (fa)
- return fa;
- /* fall through */
- case cpu_to_be32(XFS_ABTC_MAGIC):
- if (pag && pag->pagf_init) {
- if (level >= pag->pagf_levels[XFS_BTNUM_CNTi])
- return __this_address;
- } else if (level >= mp->m_ag_maxlevels)
- return __this_address;
- break;
- default:
+ } else if (level >= mp->m_ag_maxlevels)
return __this_address;
- }
return xfs_btree_sblock_verify(bp, mp->m_alloc_mxr[level != 0]);
}
@@ -377,13 +363,23 @@ xfs_allocbt_write_verify(
}
-const struct xfs_buf_ops xfs_allocbt_buf_ops = {
- .name = "xfs_allocbt",
+const struct xfs_buf_ops xfs_bnobt_buf_ops = {
+ .name = "xfs_bnobt",
+ .magic = { cpu_to_be32(XFS_ABTB_MAGIC),
+ cpu_to_be32(XFS_ABTB_CRC_MAGIC) },
.verify_read = xfs_allocbt_read_verify,
.verify_write = xfs_allocbt_write_verify,
.verify_struct = xfs_allocbt_verify,
};
+const struct xfs_buf_ops xfs_cntbt_buf_ops = {
+ .name = "xfs_cntbt",
+ .magic = { cpu_to_be32(XFS_ABTC_MAGIC),
+ cpu_to_be32(XFS_ABTC_CRC_MAGIC) },
+ .verify_read = xfs_allocbt_read_verify,
+ .verify_write = xfs_allocbt_write_verify,
+ .verify_struct = xfs_allocbt_verify,
+};
STATIC int
xfs_bnobt_keys_inorder(
@@ -448,7 +444,7 @@ static const struct xfs_btree_ops xfs_bnobt_ops = {
.init_rec_from_cur = xfs_allocbt_init_rec_from_cur,
.init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur,
.key_diff = xfs_bnobt_key_diff,
- .buf_ops = &xfs_allocbt_buf_ops,
+ .buf_ops = &xfs_bnobt_buf_ops,
.diff_two_keys = xfs_bnobt_diff_two_keys,
.keys_inorder = xfs_bnobt_keys_inorder,
.recs_inorder = xfs_bnobt_recs_inorder,
@@ -470,7 +466,7 @@ static const struct xfs_btree_ops xfs_cntbt_ops = {
.init_rec_from_cur = xfs_allocbt_init_rec_from_cur,
.init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur,
.key_diff = xfs_cntbt_key_diff,
- .buf_ops = &xfs_allocbt_buf_ops,
+ .buf_ops = &xfs_cntbt_buf_ops,
.diff_two_keys = xfs_cntbt_diff_two_keys,
.keys_inorder = xfs_cntbt_keys_inorder,
.recs_inorder = xfs_cntbt_recs_inorder,
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index 844ed87b1900..2dd9ee2a2e08 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -1336,3 +1336,20 @@ xfs_attr_node_get(xfs_da_args_t *args)
xfs_da_state_free(state);
return retval;
}
+
+/* Returns true if the attribute entry name is valid. */
+bool
+xfs_attr_namecheck(
+ const void *name,
+ size_t length)
+{
+ /*
+ * MAXNAMELEN includes the trailing null, but (name/length) leave it
+ * out, so use >= for the length check.
+ */
+ if (length >= MAXNAMELEN)
+ return false;
+
+ /* There shouldn't be any nulls here */
+ return !memchr(name, 0, length);
+}
diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h
index bdf52a333f3f..2297d8467666 100644
--- a/fs/xfs/libxfs/xfs_attr.h
+++ b/fs/xfs/libxfs/xfs_attr.h
@@ -145,6 +145,6 @@ int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags);
int xfs_attr_remove_args(struct xfs_da_args *args);
int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize,
int flags, struct attrlist_cursor_kern *cursor);
-
+bool xfs_attr_namecheck(const void *name, size_t length);
#endif /* __XFS_ATTR_H__ */
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 2652d00842d6..1f6e3965ff74 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -245,25 +245,14 @@ xfs_attr3_leaf_verify(
struct xfs_attr_leaf_entry *entries;
uint32_t end; /* must be 32bit - see below */
int i;
+ xfs_failaddr_t fa;
xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf);
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
- struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
-
- if (ichdr.magic != XFS_ATTR3_LEAF_MAGIC)
- return __this_address;
+ fa = xfs_da3_blkinfo_verify(bp, bp->b_addr);
+ if (fa)
+ return fa;
- if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid))
- return __this_address;
- if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn)
- return __this_address;
- if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->info.lsn)))
- return __this_address;
- } else {
- if (ichdr.magic != XFS_ATTR_LEAF_MAGIC)
- return __this_address;
- }
/*
* In recovery there is a transient state where count == 0 is valid
* because we may have transitioned an empty shortform attr to a leaf
@@ -369,6 +358,8 @@ xfs_attr3_leaf_read_verify(
const struct xfs_buf_ops xfs_attr3_leaf_buf_ops = {
.name = "xfs_attr3_leaf",
+ .magic16 = { cpu_to_be16(XFS_ATTR_LEAF_MAGIC),
+ cpu_to_be16(XFS_ATTR3_LEAF_MAGIC) },
.verify_read = xfs_attr3_leaf_read_verify,
.verify_write = xfs_attr3_leaf_write_verify,
.verify_struct = xfs_attr3_leaf_verify,
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index d89363c6b523..65ff600a8067 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -79,6 +79,7 @@ xfs_attr3_rmt_hdr_ok(
static xfs_failaddr_t
xfs_attr3_rmt_verify(
struct xfs_mount *mp,
+ struct xfs_buf *bp,
void *ptr,
int fsbsize,
xfs_daddr_t bno)
@@ -87,7 +88,7 @@ xfs_attr3_rmt_verify(
if (!xfs_sb_version_hascrc(&mp->m_sb))
return __this_address;
- if (rmt->rm_magic != cpu_to_be32(XFS_ATTR3_RMT_MAGIC))
+ if (!xfs_verify_magic(bp, rmt->rm_magic))
return __this_address;
if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
@@ -131,7 +132,7 @@ __xfs_attr3_rmt_read_verify(
*failaddr = __this_address;
return -EFSBADCRC;
}
- *failaddr = xfs_attr3_rmt_verify(mp, ptr, blksize, bno);
+ *failaddr = xfs_attr3_rmt_verify(mp, bp, ptr, blksize, bno);
if (*failaddr)
return -EFSCORRUPTED;
len -= blksize;
@@ -193,7 +194,7 @@ xfs_attr3_rmt_write_verify(
while (len > 0) {
struct xfs_attr3_rmt_hdr *rmt = (struct xfs_attr3_rmt_hdr *)ptr;
- fa = xfs_attr3_rmt_verify(mp, ptr, blksize, bno);
+ fa = xfs_attr3_rmt_verify(mp, bp, ptr, blksize, bno);
if (fa) {
xfs_verifier_error(bp, -EFSCORRUPTED, fa);
return;
@@ -220,6 +221,7 @@ xfs_attr3_rmt_write_verify(
const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = {
.name = "xfs_attr3_rmt",
+ .magic = { 0, cpu_to_be32(XFS_ATTR3_RMT_MAGIC) },
.verify_read = xfs_attr3_rmt_read_verify,
.verify_write = xfs_attr3_rmt_write_verify,
.verify_struct = xfs_attr3_rmt_verify_struct,
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 332eefa2700b..48502cb9990f 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -577,42 +577,44 @@ __xfs_bmap_add_free(
*/
/*
- * Transform a btree format file with only one leaf node, where the
- * extents list will fit in the inode, into an extents format file.
- * Since the file extents are already in-core, all we have to do is
- * give up the space for the btree root and pitch the leaf block.
+ * Convert the inode format to extent format if it currently is in btree format,
+ * but the extent list is small enough that it fits into the extent format.
+ *
+ * Since the extents are already in-core, all we have to do is give up the space
+ * for the btree root and pitch the leaf block.
*/
STATIC int /* error */
xfs_bmap_btree_to_extents(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_btree_cur_t *cur, /* btree cursor */
+ struct xfs_trans *tp, /* transaction pointer */
+ struct xfs_inode *ip, /* incore inode pointer */
+ struct xfs_btree_cur *cur, /* btree cursor */
int *logflagsp, /* inode logging flags */
int whichfork) /* data or attr fork */
{
- /* REFERENCED */
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_btree_block *rblock = ifp->if_broot;
struct xfs_btree_block *cblock;/* child btree block */
xfs_fsblock_t cbno; /* child block number */
xfs_buf_t *cbp; /* child block's buffer */
int error; /* error return value */
- struct xfs_ifork *ifp; /* inode fork data */
- xfs_mount_t *mp; /* mount point structure */
__be64 *pp; /* ptr to block address */
- struct xfs_btree_block *rblock;/* root btree block */
struct xfs_owner_info oinfo;
- mp = ip->i_mount;
- ifp = XFS_IFORK_PTR(ip, whichfork);
+ /* check if we actually need the extent format first: */
+ if (!xfs_bmap_wants_extents(ip, whichfork))
+ return 0;
+
+ ASSERT(cur);
ASSERT(whichfork != XFS_COW_FORK);
ASSERT(ifp->if_flags & XFS_IFEXTENTS);
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
- rblock = ifp->if_broot;
ASSERT(be16_to_cpu(rblock->bb_level) == 1);
ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1);
ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1);
+
pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes);
cbno = be64_to_cpu(*pp);
- *logflagsp = 0;
#ifdef DEBUG
XFS_WANT_CORRUPTED_RETURN(cur->bc_mp,
xfs_btree_check_lptr(cur, cbno, 1));
@@ -635,7 +637,7 @@ xfs_bmap_btree_to_extents(
ASSERT(ifp->if_broot == NULL);
ASSERT((ifp->if_flags & XFS_IFBROOT) == 0);
XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
- *logflagsp = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
+ *logflagsp |= XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
return 0;
}
@@ -2029,7 +2031,7 @@ done:
/*
* Convert an unwritten allocation to a real allocation or vice versa.
*/
-STATIC int /* error */
+int /* error */
xfs_bmap_add_extent_unwritten_real(
struct xfs_trans *tp,
xfs_inode_t *ip, /* incore inode pointer */
@@ -3685,17 +3687,6 @@ xfs_trim_extent(
}
}
-/* trim extent to within eof */
-void
-xfs_trim_extent_eof(
- struct xfs_bmbt_irec *irec,
- struct xfs_inode *ip)
-
-{
- xfs_trim_extent(irec, 0, XFS_B_TO_FSB(ip->i_mount,
- i_size_read(VFS_I(ip))));
-}
-
/*
* Trim the returned map to the required bounds
*/
@@ -4203,6 +4194,44 @@ xfs_bmapi_convert_unwritten(
return 0;
}
+static inline xfs_extlen_t
+xfs_bmapi_minleft(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ int fork)
+{
+ if (tp && tp->t_firstblock != NULLFSBLOCK)
+ return 0;
+ if (XFS_IFORK_FORMAT(ip, fork) != XFS_DINODE_FMT_BTREE)
+ return 1;
+ return be16_to_cpu(XFS_IFORK_PTR(ip, fork)->if_broot->bb_level) + 1;
+}
+
+/*
+ * Log whatever the flags say, even if error. Otherwise we might miss detecting
+ * a case where the data is changed, there's an error, and it's not logged so we
+ * don't shutdown when we should. Don't bother logging extents/btree changes if
+ * we converted to the other format.
+ */
+static void
+xfs_bmapi_finish(
+ struct xfs_bmalloca *bma,
+ int whichfork,
+ int error)
+{
+ if ((bma->logflags & xfs_ilog_fext(whichfork)) &&
+ XFS_IFORK_FORMAT(bma->ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
+ bma->logflags &= ~xfs_ilog_fext(whichfork);
+ else if ((bma->logflags & xfs_ilog_fbroot(whichfork)) &&
+ XFS_IFORK_FORMAT(bma->ip, whichfork) != XFS_DINODE_FMT_BTREE)
+ bma->logflags &= ~xfs_ilog_fbroot(whichfork);
+
+ if (bma->logflags)
+ xfs_trans_log_inode(bma->tp, bma->ip, bma->logflags);
+ if (bma->cur)
+ xfs_btree_del_cursor(bma->cur, error);
+}
+
/*
* Map file blocks to filesystem blocks, and allocate blocks or convert the
* extent state if necessary. Details behaviour is controlled by the flags
@@ -4247,9 +4276,7 @@ xfs_bmapi_write(
ASSERT(*nmap >= 1);
ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
- ASSERT(tp != NULL ||
- (flags & (XFS_BMAPI_CONVERT | XFS_BMAPI_COWFORK)) ==
- (XFS_BMAPI_CONVERT | XFS_BMAPI_COWFORK));
+ ASSERT(tp != NULL);
ASSERT(len > 0);
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
@@ -4282,25 +4309,12 @@ xfs_bmapi_write(
XFS_STATS_INC(mp, xs_blk_mapw);
- if (!tp || tp->t_firstblock == NULLFSBLOCK) {
- if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE)
- bma.minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1;
- else
- bma.minleft = 1;
- } else {
- bma.minleft = 0;
- }
-
if (!(ifp->if_flags & XFS_IFEXTENTS)) {
error = xfs_iread_extents(tp, ip, whichfork);
if (error)
goto error0;
}
- n = 0;
- end = bno + len;
- obno = bno;
-
if (!xfs_iext_lookup_extent(ip, ifp, bno, &bma.icur, &bma.got))
eof = true;
if (!xfs_iext_peek_prev_extent(ifp, &bma.icur, &bma.prev))
@@ -4309,7 +4323,11 @@ xfs_bmapi_write(
bma.ip = ip;
bma.total = total;
bma.datatype = 0;
+ bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork);
+ n = 0;
+ end = bno + len;
+ obno = bno;
while (bno < end && n < *nmap) {
bool need_alloc = false, wasdelay = false;
@@ -4323,26 +4341,7 @@ xfs_bmapi_write(
ASSERT(!((flags & XFS_BMAPI_CONVERT) &&
(flags & XFS_BMAPI_COWFORK)));
- if (flags & XFS_BMAPI_DELALLOC) {
- /*
- * For the COW fork we can reasonably get a
- * request for converting an extent that races
- * with other threads already having converted
- * part of it, as there converting COW to
- * regular blocks is not protected using the
- * IOLOCK.
- */
- ASSERT(flags & XFS_BMAPI_COWFORK);
- if (!(flags & XFS_BMAPI_COWFORK)) {
- error = -EIO;
- goto error0;
- }
-
- if (eof || bno >= end)
- break;
- } else {
- need_alloc = true;
- }
+ need_alloc = true;
} else if (isnullstartblock(bma.got.br_startblock)) {
wasdelay = true;
}
@@ -4351,8 +4350,7 @@ xfs_bmapi_write(
* First, deal with the hole before the allocated space
* that we found, if any.
*/
- if ((need_alloc || wasdelay) &&
- !(flags & XFS_BMAPI_CONVERT_ONLY)) {
+ if (need_alloc || wasdelay) {
bma.eof = eof;
bma.conv = !!(flags & XFS_BMAPI_CONVERT);
bma.wasdel = wasdelay;
@@ -4420,49 +4418,130 @@ xfs_bmapi_write(
}
*nmap = n;
- /*
- * Transform from btree to extents, give it cur.
- */
- if (xfs_bmap_wants_extents(ip, whichfork)) {
- int tmp_logflags = 0;
-
- ASSERT(bma.cur);
- error = xfs_bmap_btree_to_extents(tp, ip, bma.cur,
- &tmp_logflags, whichfork);
- bma.logflags |= tmp_logflags;
- if (error)
- goto error0;
- }
+ error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags,
+ whichfork);
+ if (error)
+ goto error0;
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE ||
XFS_IFORK_NEXTENTS(ip, whichfork) >
XFS_IFORK_MAXEXT(ip, whichfork));
- error = 0;
+ xfs_bmapi_finish(&bma, whichfork, 0);
+ xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval,
+ orig_nmap, *nmap);
+ return 0;
error0:
+ xfs_bmapi_finish(&bma, whichfork, error);
+ return error;
+}
+
+/*
+ * Convert an existing delalloc extent to real blocks based on file offset. This
+ * attempts to allocate the entire delalloc extent and may require multiple
+ * invocations to allocate the target offset if a large enough physical extent
+ * is not available.
+ */
+int
+xfs_bmapi_convert_delalloc(
+ struct xfs_inode *ip,
+ int whichfork,
+ xfs_fileoff_t offset_fsb,
+ struct xfs_bmbt_irec *imap,
+ unsigned int *seq)
+{
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_bmalloca bma = { NULL };
+ struct xfs_trans *tp;
+ int error;
+
/*
- * Log everything. Do this after conversion, there's no point in
- * logging the extent records if we've converted to btree format.
+ * Space for the extent and indirect blocks was reserved when the
+ * delalloc extent was created so there's no need to do so here.
*/
- if ((bma.logflags & xfs_ilog_fext(whichfork)) &&
- XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
- bma.logflags &= ~xfs_ilog_fext(whichfork);
- else if ((bma.logflags & xfs_ilog_fbroot(whichfork)) &&
- XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)
- bma.logflags &= ~xfs_ilog_fbroot(whichfork);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0,
+ XFS_TRANS_RESERVE, &tp);
+ if (error)
+ return error;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
+
+ if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &bma.icur, &bma.got) ||
+ bma.got.br_startoff > offset_fsb) {
+ /*
+ * No extent found in the range we are trying to convert. This
+ * should only happen for the COW fork, where another thread
+ * might have moved the extent to the data fork in the meantime.
+ */
+ WARN_ON_ONCE(whichfork != XFS_COW_FORK);
+ error = -EAGAIN;
+ goto out_trans_cancel;
+ }
+
/*
- * Log whatever the flags say, even if error. Otherwise we might miss
- * detecting a case where the data is changed, there's an error,
- * and it's not logged so we don't shutdown when we should.
+ * If we find a real extent here we raced with another thread converting
+ * the extent. Just return the real extent at this offset.
*/
- if (bma.logflags)
- xfs_trans_log_inode(tp, ip, bma.logflags);
+ if (!isnullstartblock(bma.got.br_startblock)) {
+ *imap = bma.got;
+ *seq = READ_ONCE(ifp->if_seq);
+ goto out_trans_cancel;
+ }
+
+ bma.tp = tp;
+ bma.ip = ip;
+ bma.wasdel = true;
+ bma.offset = bma.got.br_startoff;
+ bma.length = max_t(xfs_filblks_t, bma.got.br_blockcount, MAXEXTLEN);
+ bma.total = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK);
+ bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork);
+ if (whichfork == XFS_COW_FORK)
+ bma.flags = XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC;
- if (bma.cur) {
- xfs_btree_del_cursor(bma.cur, error);
+ if (!xfs_iext_peek_prev_extent(ifp, &bma.icur, &bma.prev))
+ bma.prev.br_startoff = NULLFILEOFF;
+
+ error = xfs_bmapi_allocate(&bma);
+ if (error)
+ goto out_finish;
+
+ error = -ENOSPC;
+ if (WARN_ON_ONCE(bma.blkno == NULLFSBLOCK))
+ goto out_finish;
+ error = -EFSCORRUPTED;
+ if (WARN_ON_ONCE(!bma.got.br_startblock && !XFS_IS_REALTIME_INODE(ip)))
+ goto out_finish;
+
+ XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, bma.length));
+ XFS_STATS_INC(mp, xs_xstrat_quick);
+
+ ASSERT(!isnullstartblock(bma.got.br_startblock));
+ *imap = bma.got;
+ *seq = READ_ONCE(ifp->if_seq);
+
+ if (whichfork == XFS_COW_FORK) {
+ error = xfs_refcount_alloc_cow_extent(tp, bma.blkno,
+ bma.length);
+ if (error)
+ goto out_finish;
}
- if (!error)
- xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval,
- orig_nmap, *nmap);
+
+ error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags,
+ whichfork);
+ if (error)
+ goto out_finish;
+
+ xfs_bmapi_finish(&bma, whichfork, 0);
+ error = xfs_trans_commit(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+
+out_finish:
+ xfs_bmapi_finish(&bma, whichfork, error);
+out_trans_cancel:
+ xfs_trans_cancel(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
@@ -4536,13 +4615,7 @@ xfs_bmapi_remap(
if (error)
goto error0;
- if (xfs_bmap_wants_extents(ip, whichfork)) {
- int tmp_logflags = 0;
-
- error = xfs_bmap_btree_to_extents(tp, ip, cur,
- &tmp_logflags, whichfork);
- logflags |= tmp_logflags;
- }
+ error = xfs_bmap_btree_to_extents(tp, ip, cur, &logflags, whichfork);
error0:
if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS)
@@ -5406,24 +5479,11 @@ nodelete:
error = xfs_bmap_extents_to_btree(tp, ip, &cur, 0,
&tmp_logflags, whichfork);
logflags |= tmp_logflags;
- if (error)
- goto error0;
- }
- /*
- * transform from btree to extents, give it cur
- */
- else if (xfs_bmap_wants_extents(ip, whichfork)) {
- ASSERT(cur != NULL);
- error = xfs_bmap_btree_to_extents(tp, ip, cur, &tmp_logflags,
+ } else {
+ error = xfs_bmap_btree_to_extents(tp, ip, cur, &logflags,
whichfork);
- logflags |= tmp_logflags;
- if (error)
- goto error0;
}
- /*
- * transform from extents to local?
- */
- error = 0;
+
error0:
/*
* Log everything. Do this after conversion, there's no point in
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 09d3ea97cc15..8f597f9abdbe 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -95,12 +95,6 @@ struct xfs_extent_free_item
/* Map something in the CoW fork. */
#define XFS_BMAPI_COWFORK 0x200
-/* Only convert delalloc space, don't allocate entirely new extents */
-#define XFS_BMAPI_DELALLOC 0x400
-
-/* Only convert unwritten extents, don't allocate new blocks */
-#define XFS_BMAPI_CONVERT_ONLY 0x800
-
/* Skip online discard of freed extents */
#define XFS_BMAPI_NODISCARD 0x1000
@@ -117,8 +111,6 @@ struct xfs_extent_free_item
{ XFS_BMAPI_ZERO, "ZERO" }, \
{ XFS_BMAPI_REMAP, "REMAP" }, \
{ XFS_BMAPI_COWFORK, "COWFORK" }, \
- { XFS_BMAPI_DELALLOC, "DELALLOC" }, \
- { XFS_BMAPI_CONVERT_ONLY, "CONVERT_ONLY" }, \
{ XFS_BMAPI_NODISCARD, "NODISCARD" }, \
{ XFS_BMAPI_NORMAP, "NORMAP" }
@@ -181,7 +173,6 @@ static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec)
void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno,
xfs_filblks_t len);
-void xfs_trim_extent_eof(struct xfs_bmbt_irec *, struct xfs_inode *);
int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
int xfs_bmap_set_attrforkoff(struct xfs_inode *ip, int size, int *version);
void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork);
@@ -228,6 +219,13 @@ int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork,
xfs_fileoff_t off, xfs_filblks_t len, xfs_filblks_t prealloc,
struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur,
int eof);
+int xfs_bmapi_convert_delalloc(struct xfs_inode *ip, int whichfork,
+ xfs_fileoff_t offset_fsb, struct xfs_bmbt_irec *imap,
+ unsigned int *seq);
+int xfs_bmap_add_extent_unwritten_real(struct xfs_trans *tp,
+ struct xfs_inode *ip, int whichfork,
+ struct xfs_iext_cursor *icur, struct xfs_btree_cur **curp,
+ struct xfs_bmbt_irec *new, int *logflagsp);
static inline void
xfs_bmap_add_free(
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index cdb74d2e2a43..aff82ed112c9 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -416,8 +416,10 @@ xfs_bmbt_verify(
xfs_failaddr_t fa;
unsigned int level;
- switch (block->bb_magic) {
- case cpu_to_be32(XFS_BMAP_CRC_MAGIC):
+ if (!xfs_verify_magic(bp, block->bb_magic))
+ return __this_address;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
/*
* XXX: need a better way of verifying the owner here. Right now
* just make sure there has been one set.
@@ -425,11 +427,6 @@ xfs_bmbt_verify(
fa = xfs_btree_lblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN);
if (fa)
return fa;
- /* fall through */
- case cpu_to_be32(XFS_BMAP_MAGIC):
- break;
- default:
- return __this_address;
}
/*
@@ -481,6 +478,8 @@ xfs_bmbt_write_verify(
const struct xfs_buf_ops xfs_bmbt_buf_ops = {
.name = "xfs_bmbt",
+ .magic = { cpu_to_be32(XFS_BMAP_MAGIC),
+ cpu_to_be32(XFS_BMAP_CRC_MAGIC) },
.verify_read = xfs_bmbt_read_verify,
.verify_write = xfs_bmbt_write_verify,
.verify_struct = xfs_bmbt_verify,
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index 376bee94b5dd..e2737e2ac2ae 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -116,6 +116,34 @@ xfs_da_state_free(xfs_da_state_t *state)
kmem_zone_free(xfs_da_state_zone, state);
}
+/*
+ * Verify an xfs_da3_blkinfo structure. Note that the da3 fields are only
+ * accessible on v5 filesystems. This header format is common across da node,
+ * attr leaf and dir leaf blocks.
+ */
+xfs_failaddr_t
+xfs_da3_blkinfo_verify(
+ struct xfs_buf *bp,
+ struct xfs_da3_blkinfo *hdr3)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_da_blkinfo *hdr = &hdr3->hdr;
+
+ if (!xfs_verify_magic16(bp, hdr->magic))
+ return __this_address;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
+ return __this_address;
+ if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
+ return __this_address;
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
+ return __this_address;
+ }
+
+ return NULL;
+}
+
static xfs_failaddr_t
xfs_da3_node_verify(
struct xfs_buf *bp)
@@ -124,27 +152,16 @@ xfs_da3_node_verify(
struct xfs_da_intnode *hdr = bp->b_addr;
struct xfs_da3_icnode_hdr ichdr;
const struct xfs_dir_ops *ops;
+ xfs_failaddr_t fa;
ops = xfs_dir_get_ops(mp, NULL);
ops->node_hdr_from_disk(&ichdr, hdr);
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
- struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
-
- if (ichdr.magic != XFS_DA3_NODE_MAGIC)
- return __this_address;
+ fa = xfs_da3_blkinfo_verify(bp, bp->b_addr);
+ if (fa)
+ return fa;
- if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid))
- return __this_address;
- if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn)
- return __this_address;
- if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->info.lsn)))
- return __this_address;
- } else {
- if (ichdr.magic != XFS_DA_NODE_MAGIC)
- return __this_address;
- }
if (ichdr.level == 0)
return __this_address;
if (ichdr.level > XFS_DA_NODE_MAXDEPTH)
@@ -257,6 +274,8 @@ xfs_da3_node_verify_struct(
const struct xfs_buf_ops xfs_da3_node_buf_ops = {
.name = "xfs_da3_node",
+ .magic16 = { cpu_to_be16(XFS_DA_NODE_MAGIC),
+ cpu_to_be16(XFS_DA3_NODE_MAGIC) },
.verify_read = xfs_da3_node_read_verify,
.verify_write = xfs_da3_node_write_verify,
.verify_struct = xfs_da3_node_verify_struct,
diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
index 5d5bf3bffc78..ae654e06b2fb 100644
--- a/fs/xfs/libxfs/xfs_da_format.h
+++ b/fs/xfs/libxfs/xfs_da_format.h
@@ -869,4 +869,7 @@ static inline unsigned int xfs_dir2_dirblock_bytes(struct xfs_sb *sbp)
return 1 << (sbp->sb_blocklog + sbp->sb_dirblklog);
}
+xfs_failaddr_t xfs_da3_blkinfo_verify(struct xfs_buf *bp,
+ struct xfs_da3_blkinfo *hdr3);
+
#endif /* __XFS_DA_FORMAT_H__ */
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index 229152cd1a24..156ce95c9c45 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -703,3 +703,20 @@ xfs_dir2_shrink_inode(
xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
return 0;
}
+
+/* Returns true if the directory entry name is valid. */
+bool
+xfs_dir2_namecheck(
+ const void *name,
+ size_t length)
+{
+ /*
+ * MAXNAMELEN includes the trailing null, but (name/length) leave it
+ * out, so use >= for the length check.
+ */
+ if (length >= MAXNAMELEN)
+ return false;
+
+ /* There shouldn't be any slashes or nulls here */
+ return !memchr(name, '/', length) && !memchr(name, 0, length);
+}
diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h
index c3e3f6b813d8..f54244779492 100644
--- a/fs/xfs/libxfs/xfs_dir2.h
+++ b/fs/xfs/libxfs/xfs_dir2.h
@@ -326,5 +326,6 @@ xfs_dir2_leaf_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_leaf *lp)
unsigned char xfs_dir3_get_dtype(struct xfs_mount *mp, uint8_t filetype);
void *xfs_dir3_data_endp(struct xfs_da_geometry *geo,
struct xfs_dir2_data_hdr *hdr);
+bool xfs_dir2_namecheck(const void *name, size_t length);
#endif /* __XFS_DIR2_H__ */
diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c
index 30ed5919da72..b7d6d78f4ce2 100644
--- a/fs/xfs/libxfs/xfs_dir2_block.c
+++ b/fs/xfs/libxfs/xfs_dir2_block.c
@@ -53,18 +53,16 @@ xfs_dir3_block_verify(
struct xfs_mount *mp = bp->b_target->bt_mount;
struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+ if (!xfs_verify_magic(bp, hdr3->magic))
+ return __this_address;
+
if (xfs_sb_version_hascrc(&mp->m_sb)) {
- if (hdr3->magic != cpu_to_be32(XFS_DIR3_BLOCK_MAGIC))
- return __this_address;
if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
return __this_address;
if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
return __this_address;
- } else {
- if (hdr3->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC))
- return __this_address;
}
return __xfs_dir3_data_check(NULL, bp);
}
@@ -112,6 +110,8 @@ xfs_dir3_block_write_verify(
const struct xfs_buf_ops xfs_dir3_block_buf_ops = {
.name = "xfs_dir3_block",
+ .magic = { cpu_to_be32(XFS_DIR2_BLOCK_MAGIC),
+ cpu_to_be32(XFS_DIR3_BLOCK_MAGIC) },
.verify_read = xfs_dir3_block_read_verify,
.verify_write = xfs_dir3_block_write_verify,
.verify_struct = xfs_dir3_block_verify,
diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c
index 01162c62ec8f..b7b9ce002cb9 100644
--- a/fs/xfs/libxfs/xfs_dir2_data.c
+++ b/fs/xfs/libxfs/xfs_dir2_data.c
@@ -252,18 +252,16 @@ xfs_dir3_data_verify(
struct xfs_mount *mp = bp->b_target->bt_mount;
struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+ if (!xfs_verify_magic(bp, hdr3->magic))
+ return __this_address;
+
if (xfs_sb_version_hascrc(&mp->m_sb)) {
- if (hdr3->magic != cpu_to_be32(XFS_DIR3_DATA_MAGIC))
- return __this_address;
if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
return __this_address;
if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
return __this_address;
- } else {
- if (hdr3->magic != cpu_to_be32(XFS_DIR2_DATA_MAGIC))
- return __this_address;
}
return __xfs_dir3_data_check(NULL, bp);
}
@@ -339,6 +337,8 @@ xfs_dir3_data_write_verify(
const struct xfs_buf_ops xfs_dir3_data_buf_ops = {
.name = "xfs_dir3_data",
+ .magic = { cpu_to_be32(XFS_DIR2_DATA_MAGIC),
+ cpu_to_be32(XFS_DIR3_DATA_MAGIC) },
.verify_read = xfs_dir3_data_read_verify,
.verify_write = xfs_dir3_data_write_verify,
.verify_struct = xfs_dir3_data_verify,
@@ -346,6 +346,8 @@ const struct xfs_buf_ops xfs_dir3_data_buf_ops = {
static const struct xfs_buf_ops xfs_dir3_data_reada_buf_ops = {
.name = "xfs_dir3_data_reada",
+ .magic = { cpu_to_be32(XFS_DIR2_DATA_MAGIC),
+ cpu_to_be32(XFS_DIR3_DATA_MAGIC) },
.verify_read = xfs_dir3_data_reada_verify,
.verify_write = xfs_dir3_data_write_verify,
};
diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c
index 1728a3e6f5cf..9a3767818c50 100644
--- a/fs/xfs/libxfs/xfs_dir2_leaf.c
+++ b/fs/xfs/libxfs/xfs_dir2_leaf.c
@@ -142,41 +142,22 @@ xfs_dir3_leaf_check_int(
*/
static xfs_failaddr_t
xfs_dir3_leaf_verify(
- struct xfs_buf *bp,
- uint16_t magic)
+ struct xfs_buf *bp)
{
struct xfs_mount *mp = bp->b_target->bt_mount;
struct xfs_dir2_leaf *leaf = bp->b_addr;
+ xfs_failaddr_t fa;
- ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC);
-
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
- struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
- uint16_t magic3;
-
- magic3 = (magic == XFS_DIR2_LEAF1_MAGIC) ? XFS_DIR3_LEAF1_MAGIC
- : XFS_DIR3_LEAFN_MAGIC;
-
- if (leaf3->info.hdr.magic != cpu_to_be16(magic3))
- return __this_address;
- if (!uuid_equal(&leaf3->info.uuid, &mp->m_sb.sb_meta_uuid))
- return __this_address;
- if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn)
- return __this_address;
- if (!xfs_log_check_lsn(mp, be64_to_cpu(leaf3->info.lsn)))
- return __this_address;
- } else {
- if (leaf->hdr.info.magic != cpu_to_be16(magic))
- return __this_address;
- }
+ fa = xfs_da3_blkinfo_verify(bp, bp->b_addr);
+ if (fa)
+ return fa;
return xfs_dir3_leaf_check_int(mp, NULL, NULL, leaf);
}
static void
-__read_verify(
- struct xfs_buf *bp,
- uint16_t magic)
+xfs_dir3_leaf_read_verify(
+ struct xfs_buf *bp)
{
struct xfs_mount *mp = bp->b_target->bt_mount;
xfs_failaddr_t fa;
@@ -185,23 +166,22 @@ __read_verify(
!xfs_buf_verify_cksum(bp, XFS_DIR3_LEAF_CRC_OFF))
xfs_verifier_error(bp, -EFSBADCRC, __this_address);
else {
- fa = xfs_dir3_leaf_verify(bp, magic);
+ fa = xfs_dir3_leaf_verify(bp);
if (fa)
xfs_verifier_error(bp, -EFSCORRUPTED, fa);
}
}
static void
-__write_verify(
- struct xfs_buf *bp,
- uint16_t magic)
+xfs_dir3_leaf_write_verify(
+ struct xfs_buf *bp)
{
struct xfs_mount *mp = bp->b_target->bt_mount;
struct xfs_buf_log_item *bip = bp->b_log_item;
struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr;
xfs_failaddr_t fa;
- fa = xfs_dir3_leaf_verify(bp, magic);
+ fa = xfs_dir3_leaf_verify(bp);
if (fa) {
xfs_verifier_error(bp, -EFSCORRUPTED, fa);
return;
@@ -216,60 +196,22 @@ __write_verify(
xfs_buf_update_cksum(bp, XFS_DIR3_LEAF_CRC_OFF);
}
-static xfs_failaddr_t
-xfs_dir3_leaf1_verify(
- struct xfs_buf *bp)
-{
- return xfs_dir3_leaf_verify(bp, XFS_DIR2_LEAF1_MAGIC);
-}
-
-static void
-xfs_dir3_leaf1_read_verify(
- struct xfs_buf *bp)
-{
- __read_verify(bp, XFS_DIR2_LEAF1_MAGIC);
-}
-
-static void
-xfs_dir3_leaf1_write_verify(
- struct xfs_buf *bp)
-{
- __write_verify(bp, XFS_DIR2_LEAF1_MAGIC);
-}
-
-static xfs_failaddr_t
-xfs_dir3_leafn_verify(
- struct xfs_buf *bp)
-{
- return xfs_dir3_leaf_verify(bp, XFS_DIR2_LEAFN_MAGIC);
-}
-
-static void
-xfs_dir3_leafn_read_verify(
- struct xfs_buf *bp)
-{
- __read_verify(bp, XFS_DIR2_LEAFN_MAGIC);
-}
-
-static void
-xfs_dir3_leafn_write_verify(
- struct xfs_buf *bp)
-{
- __write_verify(bp, XFS_DIR2_LEAFN_MAGIC);
-}
-
const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops = {
.name = "xfs_dir3_leaf1",
- .verify_read = xfs_dir3_leaf1_read_verify,
- .verify_write = xfs_dir3_leaf1_write_verify,
- .verify_struct = xfs_dir3_leaf1_verify,
+ .magic16 = { cpu_to_be16(XFS_DIR2_LEAF1_MAGIC),
+ cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) },
+ .verify_read = xfs_dir3_leaf_read_verify,
+ .verify_write = xfs_dir3_leaf_write_verify,
+ .verify_struct = xfs_dir3_leaf_verify,
};
const struct xfs_buf_ops xfs_dir3_leafn_buf_ops = {
.name = "xfs_dir3_leafn",
- .verify_read = xfs_dir3_leafn_read_verify,
- .verify_write = xfs_dir3_leafn_write_verify,
- .verify_struct = xfs_dir3_leafn_verify,
+ .magic16 = { cpu_to_be16(XFS_DIR2_LEAFN_MAGIC),
+ cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) },
+ .verify_read = xfs_dir3_leaf_read_verify,
+ .verify_write = xfs_dir3_leaf_write_verify,
+ .verify_struct = xfs_dir3_leaf_verify,
};
int
diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c
index f1bb3434f51c..3b03703c5c3d 100644
--- a/fs/xfs/libxfs/xfs_dir2_node.c
+++ b/fs/xfs/libxfs/xfs_dir2_node.c
@@ -87,20 +87,18 @@ xfs_dir3_free_verify(
struct xfs_mount *mp = bp->b_target->bt_mount;
struct xfs_dir2_free_hdr *hdr = bp->b_addr;
+ if (!xfs_verify_magic(bp, hdr->magic))
+ return __this_address;
+
if (xfs_sb_version_hascrc(&mp->m_sb)) {
struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
- if (hdr3->magic != cpu_to_be32(XFS_DIR3_FREE_MAGIC))
- return __this_address;
if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
return __this_address;
if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
return __this_address;
- } else {
- if (hdr->magic != cpu_to_be32(XFS_DIR2_FREE_MAGIC))
- return __this_address;
}
/* XXX: should bounds check the xfs_dir3_icfree_hdr here */
@@ -151,6 +149,8 @@ xfs_dir3_free_write_verify(
const struct xfs_buf_ops xfs_dir3_free_buf_ops = {
.name = "xfs_dir3_free",
+ .magic = { cpu_to_be32(XFS_DIR2_FREE_MAGIC),
+ cpu_to_be32(XFS_DIR3_FREE_MAGIC) },
.verify_read = xfs_dir3_free_read_verify,
.verify_write = xfs_dir3_free_write_verify,
.verify_struct = xfs_dir3_free_verify,
diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c
index d293f371dd54..fb5bd9a804f6 100644
--- a/fs/xfs/libxfs/xfs_dquot_buf.c
+++ b/fs/xfs/libxfs/xfs_dquot_buf.c
@@ -277,6 +277,8 @@ xfs_dquot_buf_write_verify(
const struct xfs_buf_ops xfs_dquot_buf_ops = {
.name = "xfs_dquot",
+ .magic16 = { cpu_to_be16(XFS_DQUOT_MAGIC),
+ cpu_to_be16(XFS_DQUOT_MAGIC) },
.verify_read = xfs_dquot_buf_read_verify,
.verify_write = xfs_dquot_buf_write_verify,
.verify_struct = xfs_dquot_buf_verify_struct,
@@ -284,6 +286,8 @@ const struct xfs_buf_ops xfs_dquot_buf_ops = {
const struct xfs_buf_ops xfs_dquot_buf_ra_ops = {
.name = "xfs_dquot_ra",
+ .magic16 = { cpu_to_be16(XFS_DQUOT_MAGIC),
+ cpu_to_be16(XFS_DQUOT_MAGIC) },
.verify_read = xfs_dquot_buf_readahead_verify,
.verify_write = xfs_dquot_buf_write_verify,
};
diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h
index 66077a105cbb..79e6c4fb1d8a 100644
--- a/fs/xfs/libxfs/xfs_errortag.h
+++ b/fs/xfs/libxfs/xfs_errortag.h
@@ -54,7 +54,8 @@
#define XFS_ERRTAG_BUF_LRU_REF 31
#define XFS_ERRTAG_FORCE_SCRUB_REPAIR 32
#define XFS_ERRTAG_FORCE_SUMMARY_RECALC 33
-#define XFS_ERRTAG_MAX 34
+#define XFS_ERRTAG_IUNLINK_FALLBACK 34
+#define XFS_ERRTAG_MAX 35
/*
* Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
@@ -93,5 +94,6 @@
#define XFS_RANDOM_BUF_LRU_REF 2
#define XFS_RANDOM_FORCE_SCRUB_REPAIR 1
#define XFS_RANDOM_FORCE_SUMMARY_RECALC 1
+#define XFS_RANDOM_IUNLINK_FALLBACK (XFS_RANDOM_DEFAULT/10)
#endif /* __XFS_ERRORTAG_H_ */
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index d32152fc8a6c..fe9898875097 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -2508,7 +2508,7 @@ xfs_agi_verify(
/*
* Validate the magic number of the agi block.
*/
- if (agi->agi_magicnum != cpu_to_be32(XFS_AGI_MAGIC))
+ if (!xfs_verify_magic(bp, agi->agi_magicnum))
return __this_address;
if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)))
return __this_address;
@@ -2582,6 +2582,7 @@ xfs_agi_write_verify(
const struct xfs_buf_ops xfs_agi_buf_ops = {
.name = "xfs_agi",
+ .magic = { cpu_to_be32(XFS_AGI_MAGIC), cpu_to_be32(XFS_AGI_MAGIC) },
.verify_read = xfs_agi_read_verify,
.verify_write = xfs_agi_write_verify,
.verify_struct = xfs_agi_verify,
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index 9b25e7a0df47..1080381ff243 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -124,7 +124,7 @@ xfs_finobt_alloc_block(
union xfs_btree_ptr *new,
int *stat)
{
- if (cur->bc_mp->m_inotbt_nores)
+ if (cur->bc_mp->m_finobt_nores)
return xfs_inobt_alloc_block(cur, start, new, stat);
return __xfs_inobt_alloc_block(cur, start, new, stat,
XFS_AG_RESV_METADATA);
@@ -154,7 +154,7 @@ xfs_finobt_free_block(
struct xfs_btree_cur *cur,
struct xfs_buf *bp)
{
- if (cur->bc_mp->m_inotbt_nores)
+ if (cur->bc_mp->m_finobt_nores)
return xfs_inobt_free_block(cur, bp);
return __xfs_inobt_free_block(cur, bp, XFS_AG_RESV_METADATA);
}
@@ -260,6 +260,9 @@ xfs_inobt_verify(
xfs_failaddr_t fa;
unsigned int level;
+ if (!xfs_verify_magic(bp, block->bb_magic))
+ return __this_address;
+
/*
* During growfs operations, we can't verify the exact owner as the
* perag is not fully initialised and hence not attached to the buffer.
@@ -270,18 +273,10 @@ xfs_inobt_verify(
* but beware of the landmine (i.e. need to check pag->pagi_init) if we
* ever do.
*/
- switch (block->bb_magic) {
- case cpu_to_be32(XFS_IBT_CRC_MAGIC):
- case cpu_to_be32(XFS_FIBT_CRC_MAGIC):
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
fa = xfs_btree_sblock_v5hdr_verify(bp);
if (fa)
return fa;
- /* fall through */
- case cpu_to_be32(XFS_IBT_MAGIC):
- case cpu_to_be32(XFS_FIBT_MAGIC):
- break;
- default:
- return __this_address;
}
/* level verification */
@@ -328,6 +323,16 @@ xfs_inobt_write_verify(
const struct xfs_buf_ops xfs_inobt_buf_ops = {
.name = "xfs_inobt",
+ .magic = { cpu_to_be32(XFS_IBT_MAGIC), cpu_to_be32(XFS_IBT_CRC_MAGIC) },
+ .verify_read = xfs_inobt_read_verify,
+ .verify_write = xfs_inobt_write_verify,
+ .verify_struct = xfs_inobt_verify,
+};
+
+const struct xfs_buf_ops xfs_finobt_buf_ops = {
+ .name = "xfs_finobt",
+ .magic = { cpu_to_be32(XFS_FIBT_MAGIC),
+ cpu_to_be32(XFS_FIBT_CRC_MAGIC) },
.verify_read = xfs_inobt_read_verify,
.verify_write = xfs_inobt_write_verify,
.verify_struct = xfs_inobt_verify,
@@ -389,7 +394,7 @@ static const struct xfs_btree_ops xfs_finobt_ops = {
.init_rec_from_cur = xfs_inobt_init_rec_from_cur,
.init_ptr_from_cur = xfs_finobt_init_ptr_from_cur,
.key_diff = xfs_inobt_key_diff,
- .buf_ops = &xfs_inobt_buf_ops,
+ .buf_ops = &xfs_finobt_buf_ops,
.diff_two_keys = xfs_inobt_diff_two_keys,
.keys_inorder = xfs_inobt_keys_inorder,
.recs_inorder = xfs_inobt_recs_inorder,
diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c
index 771dd072015d..bc690f2409fa 100644
--- a/fs/xfs/libxfs/xfs_iext_tree.c
+++ b/fs/xfs/libxfs/xfs_iext_tree.c
@@ -614,16 +614,15 @@ xfs_iext_realloc_root(
}
/*
- * Increment the sequence counter if we are on a COW fork. This allows
- * the writeback code to skip looking for a COW extent if the COW fork
- * hasn't changed. We use WRITE_ONCE here to ensure the update to the
- * sequence counter is seen before the modifications to the extent
- * tree itself take effect.
+ * Increment the sequence counter on extent tree changes. If we are on a COW
+ * fork, this allows the writeback code to skip looking for a COW extent if the
+ * COW fork hasn't changed. We use WRITE_ONCE here to ensure the update to the
+ * sequence counter is seen before the modifications to the extent tree itself
+ * take effect.
*/
static inline void xfs_iext_inc_seq(struct xfs_ifork *ifp, int state)
{
- if (state & BMAP_COWFORK)
- WRITE_ONCE(ifp->if_seq, READ_ONCE(ifp->if_seq) + 1);
+ WRITE_ONCE(ifp->if_seq, READ_ONCE(ifp->if_seq) + 1);
}
void
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 09d9c8cfa4a0..e021d5133ccb 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -97,10 +97,9 @@ xfs_inode_buf_verify(
dip = xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog));
unlinked_ino = be32_to_cpu(dip->di_next_unlinked);
- di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
+ di_ok = xfs_verify_magic16(bp, dip->di_magic) &&
xfs_dinode_good_version(mp, dip->di_version) &&
- (unlinked_ino == NULLAGINO ||
- xfs_verify_agino(mp, agno, unlinked_ino));
+ xfs_verify_agino_or_null(mp, agno, unlinked_ino);
if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
XFS_ERRTAG_ITOBP_INOTOBP))) {
if (readahead) {
@@ -147,12 +146,16 @@ xfs_inode_buf_write_verify(
const struct xfs_buf_ops xfs_inode_buf_ops = {
.name = "xfs_inode",
+ .magic16 = { cpu_to_be16(XFS_DINODE_MAGIC),
+ cpu_to_be16(XFS_DINODE_MAGIC) },
.verify_read = xfs_inode_buf_read_verify,
.verify_write = xfs_inode_buf_write_verify,
};
const struct xfs_buf_ops xfs_inode_buf_ra_ops = {
- .name = "xxfs_inode_ra",
+ .name = "xfs_inode_ra",
+ .magic16 = { cpu_to_be16(XFS_DINODE_MAGIC),
+ cpu_to_be16(XFS_DINODE_MAGIC) },
.verify_read = xfs_inode_buf_readahead_verify,
.verify_write = xfs_inode_buf_write_verify,
};
diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
index 60361d2d74a1..00c62ce170d0 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.h
+++ b/fs/xfs/libxfs/xfs_inode_fork.h
@@ -14,7 +14,7 @@ struct xfs_dinode;
*/
struct xfs_ifork {
int if_bytes; /* bytes in if_u1 */
- unsigned int if_seq; /* cow fork mod counter */
+ unsigned int if_seq; /* fork mod counter */
struct xfs_btree_block *if_broot; /* file's incore btree root */
short if_broot_bytes; /* bytes allocated for root */
unsigned char if_flags; /* per-fork flags */
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c
index d9eab657b63e..6f47ab876d90 100644
--- a/fs/xfs/libxfs/xfs_refcount_btree.c
+++ b/fs/xfs/libxfs/xfs_refcount_btree.c
@@ -209,7 +209,7 @@ xfs_refcountbt_verify(
xfs_failaddr_t fa;
unsigned int level;
- if (block->bb_magic != cpu_to_be32(XFS_REFC_CRC_MAGIC))
+ if (!xfs_verify_magic(bp, block->bb_magic))
return __this_address;
if (!xfs_sb_version_hasreflink(&mp->m_sb))
@@ -264,6 +264,7 @@ xfs_refcountbt_write_verify(
const struct xfs_buf_ops xfs_refcountbt_buf_ops = {
.name = "xfs_refcountbt",
+ .magic = { 0, cpu_to_be32(XFS_REFC_CRC_MAGIC) },
.verify_read = xfs_refcountbt_read_verify,
.verify_write = xfs_refcountbt_write_verify,
.verify_struct = xfs_refcountbt_verify,
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c
index f79cf040d745..5738e11055e6 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rmap_btree.c
@@ -310,7 +310,7 @@ xfs_rmapbt_verify(
* from the on disk AGF. Again, we can only check against maximum limits
* in this case.
*/
- if (block->bb_magic != cpu_to_be32(XFS_RMAP_CRC_MAGIC))
+ if (!xfs_verify_magic(bp, block->bb_magic))
return __this_address;
if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
@@ -365,6 +365,7 @@ xfs_rmapbt_write_verify(
const struct xfs_buf_ops xfs_rmapbt_buf_ops = {
.name = "xfs_rmapbt",
+ .magic = { 0, cpu_to_be32(XFS_RMAP_CRC_MAGIC) },
.verify_read = xfs_rmapbt_read_verify,
.verify_write = xfs_rmapbt_write_verify,
.verify_struct = xfs_rmapbt_verify,
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index b5a82acd7dfe..77a3a4085de3 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -225,10 +225,11 @@ xfs_validate_sb_common(
struct xfs_buf *bp,
struct xfs_sb *sbp)
{
+ struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp);
uint32_t agcount = 0;
uint32_t rem;
- if (sbp->sb_magicnum != XFS_SB_MAGIC) {
+ if (!xfs_verify_magic(bp, dsb->sb_magicnum)) {
xfs_warn(mp, "bad magic number");
return -EWRONGFS;
}
@@ -781,12 +782,14 @@ out_error:
const struct xfs_buf_ops xfs_sb_buf_ops = {
.name = "xfs_sb",
+ .magic = { cpu_to_be32(XFS_SB_MAGIC), cpu_to_be32(XFS_SB_MAGIC) },
.verify_read = xfs_sb_read_verify,
.verify_write = xfs_sb_write_verify,
};
const struct xfs_buf_ops xfs_sb_quiet_buf_ops = {
.name = "xfs_sb_quiet",
+ .magic = { cpu_to_be32(XFS_SB_MAGIC), cpu_to_be32(XFS_SB_MAGIC) },
.verify_read = xfs_sb_quiet_read_verify,
.verify_write = xfs_sb_write_verify,
};
@@ -874,7 +877,7 @@ xfs_initialize_perag_data(
uint64_t bfreelst = 0;
uint64_t btree = 0;
uint64_t fdblocks;
- int error;
+ int error = 0;
for (index = 0; index < agcount; index++) {
/*
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index 1c5debe748f0..4e909791aeac 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -25,7 +25,8 @@ extern const struct xfs_buf_ops xfs_agf_buf_ops;
extern const struct xfs_buf_ops xfs_agi_buf_ops;
extern const struct xfs_buf_ops xfs_agf_buf_ops;
extern const struct xfs_buf_ops xfs_agfl_buf_ops;
-extern const struct xfs_buf_ops xfs_allocbt_buf_ops;
+extern const struct xfs_buf_ops xfs_bnobt_buf_ops;
+extern const struct xfs_buf_ops xfs_cntbt_buf_ops;
extern const struct xfs_buf_ops xfs_rmapbt_buf_ops;
extern const struct xfs_buf_ops xfs_refcountbt_buf_ops;
extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops;
@@ -36,6 +37,7 @@ extern const struct xfs_buf_ops xfs_dquot_buf_ops;
extern const struct xfs_buf_ops xfs_symlink_buf_ops;
extern const struct xfs_buf_ops xfs_agi_buf_ops;
extern const struct xfs_buf_ops xfs_inobt_buf_ops;
+extern const struct xfs_buf_ops xfs_finobt_buf_ops;
extern const struct xfs_buf_ops xfs_inode_buf_ops;
extern const struct xfs_buf_ops xfs_inode_buf_ra_ops;
extern const struct xfs_buf_ops xfs_dquot_buf_ops;
diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c
index 77d80106f989..a0ccc253c43d 100644
--- a/fs/xfs/libxfs/xfs_symlink_remote.c
+++ b/fs/xfs/libxfs/xfs_symlink_remote.c
@@ -95,7 +95,7 @@ xfs_symlink_verify(
if (!xfs_sb_version_hascrc(&mp->m_sb))
return __this_address;
- if (dsl->sl_magic != cpu_to_be32(XFS_SYMLINK_MAGIC))
+ if (!xfs_verify_magic(bp, dsl->sl_magic))
return __this_address;
if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
@@ -159,6 +159,7 @@ xfs_symlink_write_verify(
const struct xfs_buf_ops xfs_symlink_buf_ops = {
.name = "xfs_symlink",
+ .magic = { 0, cpu_to_be32(XFS_SYMLINK_MAGIC) },
.verify_read = xfs_symlink_read_verify,
.verify_write = xfs_symlink_write_verify,
.verify_struct = xfs_symlink_verify,
diff --git a/fs/xfs/libxfs/xfs_types.c b/fs/xfs/libxfs/xfs_types.c
index 3306fc42cfad..de310712dd6d 100644
--- a/fs/xfs/libxfs/xfs_types.c
+++ b/fs/xfs/libxfs/xfs_types.c
@@ -116,6 +116,19 @@ xfs_verify_agino(
}
/*
+ * Verify that an AG inode number pointer neither points outside the AG
+ * nor points at static metadata, or is NULLAGINO.
+ */
+bool
+xfs_verify_agino_or_null(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ xfs_agino_t agino)
+{
+ return agino == NULLAGINO || xfs_verify_agino(mp, agno, agino);
+}
+
+/*
* Verify that an FS inode number pointer neither points outside the
* filesystem nor points at static AG metadata.
*/
@@ -204,3 +217,14 @@ xfs_verify_icount(
xfs_icount_range(mp, &min, &max);
return icount >= min && icount <= max;
}
+
+/* Sanity-checking of dir/attr block offsets. */
+bool
+xfs_verify_dablk(
+ struct xfs_mount *mp,
+ xfs_fileoff_t dabno)
+{
+ xfs_dablk_t max_dablk = -1U;
+
+ return dabno <= max_dablk;
+}
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index 8f02855a019a..c5a25403b4db 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -183,10 +183,13 @@ void xfs_agino_range(struct xfs_mount *mp, xfs_agnumber_t agno,
xfs_agino_t *first, xfs_agino_t *last);
bool xfs_verify_agino(struct xfs_mount *mp, xfs_agnumber_t agno,
xfs_agino_t agino);
+bool xfs_verify_agino_or_null(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agino_t agino);
bool xfs_verify_ino(struct xfs_mount *mp, xfs_ino_t ino);
bool xfs_internal_inum(struct xfs_mount *mp, xfs_ino_t ino);
bool xfs_verify_dir_ino(struct xfs_mount *mp, xfs_ino_t ino);
bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno);
bool xfs_verify_icount(struct xfs_mount *mp, unsigned long long icount);
+bool xfs_verify_dablk(struct xfs_mount *mp, xfs_fileoff_t off);
#endif /* __XFS_TYPES_H__ */
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
index 90955ab1e895..ddf06bfaa29d 100644
--- a/fs/xfs/scrub/agheader.c
+++ b/fs/xfs/scrub/agheader.c
@@ -399,7 +399,7 @@ xchk_agf_xref_cntbt(
if (!xchk_should_check_xref(sc, &error, &sc->sa.cnt_cur))
return;
if (!have) {
- if (agf->agf_freeblks != be32_to_cpu(0))
+ if (agf->agf_freeblks != cpu_to_be32(0))
xchk_block_xref_set_corrupt(sc, sc->sa.agf_bp);
return;
}
@@ -864,19 +864,17 @@ xchk_agi(
/* Check inode pointers */
agino = be32_to_cpu(agi->agi_newino);
- if (agino != NULLAGINO && !xfs_verify_agino(mp, agno, agino))
+ if (!xfs_verify_agino_or_null(mp, agno, agino))
xchk_block_set_corrupt(sc, sc->sa.agi_bp);
agino = be32_to_cpu(agi->agi_dirino);
- if (agino != NULLAGINO && !xfs_verify_agino(mp, agno, agino))
+ if (!xfs_verify_agino_or_null(mp, agno, agino))
xchk_block_set_corrupt(sc, sc->sa.agi_bp);
/* Check unlinked inode buckets */
for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) {
agino = be32_to_cpu(agi->agi_unlinked[i]);
- if (agino == NULLAGINO)
- continue;
- if (!xfs_verify_agino(mp, agno, agino))
+ if (!xfs_verify_agino_or_null(mp, agno, agino))
xchk_block_set_corrupt(sc, sc->sa.agi_bp);
}
diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c
index 03d1e15cceba..64e31f87d490 100644
--- a/fs/xfs/scrub/agheader_repair.c
+++ b/fs/xfs/scrub/agheader_repair.c
@@ -341,23 +341,19 @@ xrep_agf(
struct xrep_find_ag_btree fab[XREP_AGF_MAX] = {
[XREP_AGF_BNOBT] = {
.rmap_owner = XFS_RMAP_OWN_AG,
- .buf_ops = &xfs_allocbt_buf_ops,
- .magic = XFS_ABTB_CRC_MAGIC,
+ .buf_ops = &xfs_bnobt_buf_ops,
},
[XREP_AGF_CNTBT] = {
.rmap_owner = XFS_RMAP_OWN_AG,
- .buf_ops = &xfs_allocbt_buf_ops,
- .magic = XFS_ABTC_CRC_MAGIC,
+ .buf_ops = &xfs_cntbt_buf_ops,
},
[XREP_AGF_RMAPBT] = {
.rmap_owner = XFS_RMAP_OWN_AG,
.buf_ops = &xfs_rmapbt_buf_ops,
- .magic = XFS_RMAP_CRC_MAGIC,
},
[XREP_AGF_REFCOUNTBT] = {
.rmap_owner = XFS_RMAP_OWN_REFC,
.buf_ops = &xfs_refcountbt_buf_ops,
- .magic = XFS_REFC_CRC_MAGIC,
},
[XREP_AGF_END] = {
.buf_ops = NULL,
@@ -875,12 +871,10 @@ xrep_agi(
[XREP_AGI_INOBT] = {
.rmap_owner = XFS_RMAP_OWN_INOBT,
.buf_ops = &xfs_inobt_buf_ops,
- .magic = XFS_IBT_CRC_MAGIC,
},
[XREP_AGI_FINOBT] = {
.rmap_owner = XFS_RMAP_OWN_INOBT,
- .buf_ops = &xfs_inobt_buf_ops,
- .magic = XFS_FIBT_CRC_MAGIC,
+ .buf_ops = &xfs_finobt_buf_ops,
},
[XREP_AGI_END] = {
.buf_ops = NULL
diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c
index 81d5e90547a1..dce74ec57038 100644
--- a/fs/xfs/scrub/attr.c
+++ b/fs/xfs/scrub/attr.c
@@ -82,12 +82,23 @@ xchk_xattr_listent(
sx = container_of(context, struct xchk_xattr, context);
+ if (xchk_should_terminate(sx->sc, &error)) {
+ context->seen_enough = 1;
+ return;
+ }
+
if (flags & XFS_ATTR_INCOMPLETE) {
/* Incomplete attr key, just mark the inode for preening. */
xchk_ino_set_preen(sx->sc, context->dp->i_ino);
return;
}
+ /* Does this name make sense? */
+ if (!xfs_attr_namecheck(name, namelen)) {
+ xchk_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK, args.blkno);
+ return;
+ }
+
args.flags = ATTR_KERNOTIME;
if (flags & XFS_ATTR_ROOT)
args.flags |= ATTR_ROOT;
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index e1d11f3223e3..a703cd58a90e 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -281,6 +281,31 @@ xchk_bmap_extent_xref(
xchk_ag_free(info->sc, &info->sc->sa);
}
+/*
+ * Directories and attr forks should never have blocks that can't be addressed
+ * by a xfs_dablk_t.
+ */
+STATIC void
+xchk_bmap_dirattr_extent(
+ struct xfs_inode *ip,
+ struct xchk_bmap_info *info,
+ struct xfs_bmbt_irec *irec)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t off;
+
+ if (!S_ISDIR(VFS_I(ip)->i_mode) && info->whichfork != XFS_ATTR_FORK)
+ return;
+
+ if (!xfs_verify_dablk(mp, irec->br_startoff))
+ xchk_fblock_set_corrupt(info->sc, info->whichfork,
+ irec->br_startoff);
+
+ off = irec->br_startoff + irec->br_blockcount - 1;
+ if (!xfs_verify_dablk(mp, off))
+ xchk_fblock_set_corrupt(info->sc, info->whichfork, off);
+}
+
/* Scrub a single extent record. */
STATIC int
xchk_bmap_extent(
@@ -305,6 +330,8 @@ xchk_bmap_extent(
xchk_fblock_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
+ xchk_bmap_dirattr_extent(ip, info, irec);
+
/* There should never be a "hole" extent in either extent list. */
if (irec->br_startblock == HOLESTARTBLOCK)
xchk_fblock_set_corrupt(info->sc, info->whichfork,
diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c
index cd3e4d768a18..a38a22785a1a 100644
--- a/fs/xfs/scrub/dir.c
+++ b/fs/xfs/scrub/dir.c
@@ -129,6 +129,12 @@ xchk_dir_actor(
goto out;
}
+ /* Does this name make sense? */
+ if (!xfs_dir2_namecheck(name, namelen)) {
+ xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset);
+ goto out;
+ }
+
if (!strncmp(".", name, namelen)) {
/* If this is "." then check that the inum matches the dir. */
if (xfs_sb_version_hasftype(&mp->m_sb) && type != DT_DIR)
diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c
index 882dc56c5c21..700114f79a7d 100644
--- a/fs/xfs/scrub/ialloc.c
+++ b/fs/xfs/scrub/ialloc.c
@@ -47,6 +47,12 @@ xchk_setup_ag_iallocbt(
struct xchk_iallocbt {
/* Number of inodes we see while scanning inobt. */
unsigned long long inodes;
+
+ /* Expected next startino, for big block filesystems. */
+ xfs_agino_t next_startino;
+
+ /* Expected end of the current inode cluster. */
+ xfs_agino_t next_cluster_ino;
};
/*
@@ -128,41 +134,57 @@ xchk_iallocbt_freecount(
return hweight64(freemask);
}
-/* Check a particular inode with ir_free. */
+/*
+ * Check that an inode's allocation status matches ir_free in the inobt
+ * record. First we try querying the in-core inode state, and if the inode
+ * isn't loaded we examine the on-disk inode directly.
+ *
+ * Since there can be 1:M and M:1 mappings between inobt records and inode
+ * clusters, we pass in the inode location information as an inobt record;
+ * the index of an inode cluster within the inobt record (as well as the
+ * cluster buffer itself); and the index of the inode within the cluster.
+ *
+ * @irec is the inobt record.
+ * @irec_ino is the inode offset from the start of the record.
+ * @dip is the on-disk inode.
+ */
STATIC int
-xchk_iallocbt_check_cluster_freemask(
+xchk_iallocbt_check_cluster_ifree(
struct xchk_btree *bs,
- xfs_ino_t fsino,
- xfs_agino_t chunkino,
- xfs_agino_t clusterino,
struct xfs_inobt_rec_incore *irec,
- struct xfs_buf *bp)
+ unsigned int irec_ino,
+ struct xfs_dinode *dip)
{
- struct xfs_dinode *dip;
struct xfs_mount *mp = bs->cur->bc_mp;
- bool inode_is_free = false;
+ xfs_ino_t fsino;
+ xfs_agino_t agino;
+ bool irec_free;
+ bool ino_inuse;
bool freemask_ok;
- bool inuse;
int error = 0;
if (xchk_should_terminate(bs->sc, &error))
return error;
- dip = xfs_buf_offset(bp, clusterino * mp->m_sb.sb_inodesize);
+ /*
+ * Given an inobt record and the offset of an inode from the start of
+ * the record, compute which fs inode we're talking about.
+ */
+ agino = irec->ir_startino + irec_ino;
+ fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_private.a.agno, agino);
+ irec_free = (irec->ir_free & XFS_INOBT_MASK(irec_ino));
+
if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC ||
- (dip->di_version >= 3 &&
- be64_to_cpu(dip->di_ino) != fsino + clusterino)) {
+ (dip->di_version >= 3 && be64_to_cpu(dip->di_ino) != fsino)) {
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
goto out;
}
- if (irec->ir_free & XFS_INOBT_MASK(chunkino + clusterino))
- inode_is_free = true;
- error = xfs_icache_inode_is_allocated(mp, bs->cur->bc_tp,
- fsino + clusterino, &inuse);
+ error = xfs_icache_inode_is_allocated(mp, bs->cur->bc_tp, fsino,
+ &ino_inuse);
if (error == -ENODATA) {
/* Not cached, just read the disk buffer */
- freemask_ok = inode_is_free ^ !!(dip->di_mode);
+ freemask_ok = irec_free ^ !!(dip->di_mode);
if (!bs->sc->try_harder && !freemask_ok)
return -EDEADLOCK;
} else if (error < 0) {
@@ -174,7 +196,7 @@ xchk_iallocbt_check_cluster_freemask(
goto out;
} else {
/* Inode is all there. */
- freemask_ok = inode_is_free ^ inuse;
+ freemask_ok = irec_free ^ ino_inuse;
}
if (!freemask_ok)
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
@@ -182,86 +204,221 @@ out:
return 0;
}
-/* Make sure the free mask is consistent with what the inodes think. */
+/*
+ * Check that the holemask and freemask of a hypothetical inode cluster match
+ * what's actually on disk. If sparse inodes are enabled, the cluster does
+ * not actually have to map to inodes if the corresponding holemask bit is set.
+ *
+ * @cluster_base is the first inode in the cluster within the @irec.
+ */
STATIC int
-xchk_iallocbt_check_freemask(
+xchk_iallocbt_check_cluster(
struct xchk_btree *bs,
- struct xfs_inobt_rec_incore *irec)
+ struct xfs_inobt_rec_incore *irec,
+ unsigned int cluster_base)
{
struct xfs_imap imap;
struct xfs_mount *mp = bs->cur->bc_mp;
struct xfs_dinode *dip;
- struct xfs_buf *bp;
- xfs_ino_t fsino;
- xfs_agino_t nr_inodes;
- xfs_agino_t agino;
- xfs_agino_t chunkino;
- xfs_agino_t clusterino;
+ struct xfs_buf *cluster_bp;
+ unsigned int nr_inodes;
+ xfs_agnumber_t agno = bs->cur->bc_private.a.agno;
xfs_agblock_t agbno;
- uint16_t holemask;
+ unsigned int cluster_index;
+ uint16_t cluster_mask = 0;
uint16_t ir_holemask;
int error = 0;
- /* Make sure the freemask matches the inode records. */
- nr_inodes = mp->m_inodes_per_cluster;
-
- for (agino = irec->ir_startino;
- agino < irec->ir_startino + XFS_INODES_PER_CHUNK;
- agino += mp->m_inodes_per_cluster) {
- fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_private.a.agno, agino);
- chunkino = agino - irec->ir_startino;
- agbno = XFS_AGINO_TO_AGBNO(mp, agino);
-
- /* Compute the holemask mask for this cluster. */
- for (clusterino = 0, holemask = 0; clusterino < nr_inodes;
- clusterino += XFS_INODES_PER_HOLEMASK_BIT)
- holemask |= XFS_INOBT_MASK((chunkino + clusterino) /
- XFS_INODES_PER_HOLEMASK_BIT);
-
- /* The whole cluster must be a hole or not a hole. */
- ir_holemask = (irec->ir_holemask & holemask);
- if (ir_holemask != holemask && ir_holemask != 0) {
+ nr_inodes = min_t(unsigned int, XFS_INODES_PER_CHUNK,
+ mp->m_inodes_per_cluster);
+
+ /* Map this inode cluster */
+ agbno = XFS_AGINO_TO_AGBNO(mp, irec->ir_startino + cluster_base);
+
+ /* Compute a bitmask for this cluster that can be used for holemask. */
+ for (cluster_index = 0;
+ cluster_index < nr_inodes;
+ cluster_index += XFS_INODES_PER_HOLEMASK_BIT)
+ cluster_mask |= XFS_INOBT_MASK((cluster_base + cluster_index) /
+ XFS_INODES_PER_HOLEMASK_BIT);
+
+ /*
+ * Map the first inode of this cluster to a buffer and offset.
+ * Be careful about inobt records that don't align with the start of
+ * the inode buffer when block sizes are large enough to hold multiple
+ * inode chunks. When this happens, cluster_base will be zero but
+ * ir_startino can be large enough to make im_boffset nonzero.
+ */
+ ir_holemask = (irec->ir_holemask & cluster_mask);
+ imap.im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno);
+ imap.im_len = XFS_FSB_TO_BB(mp, mp->m_blocks_per_cluster);
+ imap.im_boffset = XFS_INO_TO_OFFSET(mp, irec->ir_startino);
+
+ if (imap.im_boffset != 0 && cluster_base != 0) {
+ ASSERT(imap.im_boffset == 0 || cluster_base == 0);
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ return 0;
+ }
+
+ trace_xchk_iallocbt_check_cluster(mp, agno, irec->ir_startino,
+ imap.im_blkno, imap.im_len, cluster_base, nr_inodes,
+ cluster_mask, ir_holemask,
+ XFS_INO_TO_OFFSET(mp, irec->ir_startino +
+ cluster_base));
+
+ /* The whole cluster must be a hole or not a hole. */
+ if (ir_holemask != cluster_mask && ir_holemask != 0) {
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ return 0;
+ }
+
+ /* If any part of this is a hole, skip it. */
+ if (ir_holemask) {
+ xchk_xref_is_not_owned_by(bs->sc, agbno,
+ mp->m_blocks_per_cluster,
+ &XFS_RMAP_OINFO_INODES);
+ return 0;
+ }
+
+ xchk_xref_is_owned_by(bs->sc, agbno, mp->m_blocks_per_cluster,
+ &XFS_RMAP_OINFO_INODES);
+
+ /* Grab the inode cluster buffer. */
+ error = xfs_imap_to_bp(mp, bs->cur->bc_tp, &imap, &dip, &cluster_bp,
+ 0, 0);
+ if (!xchk_btree_xref_process_error(bs->sc, bs->cur, 0, &error))
+ return error;
+
+ /* Check free status of each inode within this cluster. */
+ for (cluster_index = 0; cluster_index < nr_inodes; cluster_index++) {
+ struct xfs_dinode *dip;
+
+ if (imap.im_boffset >= BBTOB(cluster_bp->b_length)) {
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
- continue;
+ break;
}
- /* If any part of this is a hole, skip it. */
- if (ir_holemask) {
- xchk_xref_is_not_owned_by(bs->sc, agbno,
- mp->m_blocks_per_cluster,
- &XFS_RMAP_OINFO_INODES);
- continue;
+ dip = xfs_buf_offset(cluster_bp, imap.im_boffset);
+ error = xchk_iallocbt_check_cluster_ifree(bs, irec,
+ cluster_base + cluster_index, dip);
+ if (error)
+ break;
+ imap.im_boffset += mp->m_sb.sb_inodesize;
+ }
+
+ xfs_trans_brelse(bs->cur->bc_tp, cluster_bp);
+ return error;
+}
+
+/*
+ * For all the inode clusters that could map to this inobt record, make sure
+ * that the holemask makes sense and that the allocation status of each inode
+ * matches the freemask.
+ */
+STATIC int
+xchk_iallocbt_check_clusters(
+ struct xchk_btree *bs,
+ struct xfs_inobt_rec_incore *irec)
+{
+ unsigned int cluster_base;
+ int error = 0;
+
+ /*
+ * For the common case where this inobt record maps to multiple inode
+ * clusters this will call _check_cluster for each cluster.
+ *
+ * For the case that multiple inobt records map to a single cluster,
+ * this will call _check_cluster once.
+ */
+ for (cluster_base = 0;
+ cluster_base < XFS_INODES_PER_CHUNK;
+ cluster_base += bs->sc->mp->m_inodes_per_cluster) {
+ error = xchk_iallocbt_check_cluster(bs, irec, cluster_base);
+ if (error)
+ break;
+ }
+
+ return error;
+}
+
+/*
+ * Make sure this inode btree record is aligned properly. Because a fs block
+ * contains multiple inodes, we check that the inobt record is aligned to the
+ * correct inode, not just the correct block on disk. This results in a finer
+ * grained corruption check.
+ */
+STATIC void
+xchk_iallocbt_rec_alignment(
+ struct xchk_btree *bs,
+ struct xfs_inobt_rec_incore *irec)
+{
+ struct xfs_mount *mp = bs->sc->mp;
+ struct xchk_iallocbt *iabt = bs->private;
+
+ /*
+ * finobt records have different positioning requirements than inobt
+ * records: each finobt record must have a corresponding inobt record.
+ * That is checked in the xref function, so for now we only catch the
+ * obvious case where the record isn't at all aligned properly.
+ *
+ * Note that if a fs block contains more than a single chunk of inodes,
+ * we will have finobt records only for those chunks containing free
+ * inodes, and therefore expect chunk alignment of finobt records.
+ * Otherwise, we expect that the finobt record is aligned to the
+ * cluster alignment as told by the superblock.
+ */
+ if (bs->cur->bc_btnum == XFS_BTNUM_FINO) {
+ unsigned int imask;
+
+ imask = min_t(unsigned int, XFS_INODES_PER_CHUNK,
+ mp->m_cluster_align_inodes) - 1;
+ if (irec->ir_startino & imask)
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ return;
+ }
+
+ if (iabt->next_startino != NULLAGINO) {
+ /*
+ * We're midway through a cluster of inodes that is mapped by
+ * multiple inobt records. Did we get the record for the next
+ * irec in the sequence?
+ */
+ if (irec->ir_startino != iabt->next_startino) {
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ return;
}
- xchk_xref_is_owned_by(bs->sc, agbno, mp->m_blocks_per_cluster,
- &XFS_RMAP_OINFO_INODES);
+ iabt->next_startino += XFS_INODES_PER_CHUNK;
- /* Grab the inode cluster buffer. */
- imap.im_blkno = XFS_AGB_TO_DADDR(mp, bs->cur->bc_private.a.agno,
- agbno);
- imap.im_len = XFS_FSB_TO_BB(mp, mp->m_blocks_per_cluster);
- imap.im_boffset = 0;
-
- error = xfs_imap_to_bp(mp, bs->cur->bc_tp, &imap,
- &dip, &bp, 0, 0);
- if (!xchk_btree_xref_process_error(bs->sc, bs->cur, 0,
- &error))
- continue;
-
- /* Which inodes are free? */
- for (clusterino = 0; clusterino < nr_inodes; clusterino++) {
- error = xchk_iallocbt_check_cluster_freemask(bs,
- fsino, chunkino, clusterino, irec, bp);
- if (error) {
- xfs_trans_brelse(bs->cur->bc_tp, bp);
- return error;
- }
+ /* Are we done with the cluster? */
+ if (iabt->next_startino >= iabt->next_cluster_ino) {
+ iabt->next_startino = NULLAGINO;
+ iabt->next_cluster_ino = NULLAGINO;
}
+ return;
+ }
+
+ /* inobt records must be aligned to cluster and inoalignmnt size. */
+ if (irec->ir_startino & (mp->m_cluster_align_inodes - 1)) {
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ return;
+ }
- xfs_trans_brelse(bs->cur->bc_tp, bp);
+ if (irec->ir_startino & (mp->m_inodes_per_cluster - 1)) {
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ return;
}
- return error;
+ if (mp->m_inodes_per_cluster <= XFS_INODES_PER_CHUNK)
+ return;
+
+ /*
+ * If this is the start of an inode cluster that can be mapped by
+ * multiple inobt records, the next inobt record must follow exactly
+ * after this one.
+ */
+ iabt->next_startino = irec->ir_startino + XFS_INODES_PER_CHUNK;
+ iabt->next_cluster_ino = irec->ir_startino + mp->m_inodes_per_cluster;
}
/* Scrub an inobt/finobt record. */
@@ -276,7 +433,6 @@ xchk_iallocbt_rec(
uint64_t holes;
xfs_agnumber_t agno = bs->cur->bc_private.a.agno;
xfs_agino_t agino;
- xfs_agblock_t agbno;
xfs_extlen_t len;
int holecount;
int i;
@@ -303,11 +459,9 @@ xchk_iallocbt_rec(
goto out;
}
- /* Make sure this record is aligned to cluster and inoalignmnt size. */
- agbno = XFS_AGINO_TO_AGBNO(mp, irec.ir_startino);
- if ((agbno & (mp->m_cluster_align - 1)) ||
- (agbno & (mp->m_blocks_per_cluster - 1)))
- xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ xchk_iallocbt_rec_alignment(bs, &irec);
+ if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out;
iabt->inodes += irec.ir_count;
@@ -320,7 +474,7 @@ xchk_iallocbt_rec(
if (!xchk_iallocbt_chunk(bs, &irec, agino, len))
goto out;
- goto check_freemask;
+ goto check_clusters;
}
/* Check each chunk of a sparse inode cluster. */
@@ -346,8 +500,8 @@ xchk_iallocbt_rec(
holecount + irec.ir_count != XFS_INODES_PER_CHUNK)
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
-check_freemask:
- error = xchk_iallocbt_check_freemask(bs, &irec);
+check_clusters:
+ error = xchk_iallocbt_check_clusters(bs, &irec);
if (error)
goto out;
@@ -429,6 +583,8 @@ xchk_iallocbt(
struct xfs_btree_cur *cur;
struct xchk_iallocbt iabt = {
.inodes = 0,
+ .next_startino = NULLAGINO,
+ .next_cluster_ino = NULLAGINO,
};
int error;
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 6acf1bfa0bfe..f28f4bad317b 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -743,7 +743,8 @@ xrep_findroot_block(
/* Ensure the block magic matches the btree type we're looking for. */
btblock = XFS_BUF_TO_BLOCK(bp);
- if (be32_to_cpu(btblock->bb_magic) != fab->magic)
+ ASSERT(fab->buf_ops->magic[1] != 0);
+ if (btblock->bb_magic != fab->buf_ops->magic[1])
goto out;
/*
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index f2fc18bb7605..d990314eb08b 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -42,9 +42,6 @@ struct xrep_find_ag_btree {
/* in: buffer ops */
const struct xfs_buf_ops *buf_ops;
- /* in: magic number of the btree */
- uint32_t magic;
-
/* out: the highest btree block found and the tree height */
xfs_agblock_t root;
unsigned int height;
diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c
index 665d4bbb17cc..dbe115b075f7 100644
--- a/fs/xfs/scrub/rtbitmap.c
+++ b/fs/xfs/scrub/rtbitmap.c
@@ -141,9 +141,8 @@ xchk_xref_is_used_rt_space(
startext = fsbno;
endext = fsbno + len - 1;
do_div(startext, sc->mp->m_sb.sb_rextsize);
- if (do_div(endext, sc->mp->m_sb.sb_rextsize))
- endext++;
- extcount = endext - startext;
+ do_div(endext, sc->mp->m_sb.sb_rextsize);
+ extcount = endext - startext + 1;
xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
error = xfs_rtalloc_extent_is_free(sc->mp, sc->tp, startext, extcount,
&is_free);
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index 8344b14031ef..3c83e8b3b39c 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -545,6 +545,51 @@ TRACE_EVENT(xchk_xref_error,
__entry->ret_ip)
);
+TRACE_EVENT(xchk_iallocbt_check_cluster,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agino_t startino, xfs_daddr_t map_daddr,
+ unsigned short map_len, unsigned int chunk_ino,
+ unsigned int nr_inodes, uint16_t cluster_mask,
+ uint16_t holemask, unsigned int cluster_ino),
+ TP_ARGS(mp, agno, startino, map_daddr, map_len, chunk_ino, nr_inodes,
+ cluster_mask, holemask, cluster_ino),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, startino)
+ __field(xfs_daddr_t, map_daddr)
+ __field(unsigned short, map_len)
+ __field(unsigned int, chunk_ino)
+ __field(unsigned int, nr_inodes)
+ __field(unsigned int, cluster_ino)
+ __field(uint16_t, cluster_mask)
+ __field(uint16_t, holemask)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->startino = startino;
+ __entry->map_daddr = map_daddr;
+ __entry->map_len = map_len;
+ __entry->chunk_ino = chunk_ino;
+ __entry->nr_inodes = nr_inodes;
+ __entry->cluster_mask = cluster_mask;
+ __entry->holemask = holemask;
+ __entry->cluster_ino = cluster_ino;
+ ),
+ TP_printk("dev %d:%d agno %d startino %u daddr 0x%llx len %d chunkino %u nr_inodes %u cluster_mask 0x%x holemask 0x%x cluster_ino %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->startino,
+ __entry->map_daddr,
+ __entry->map_len,
+ __entry->chunk_ino,
+ __entry->nr_inodes,
+ __entry->cluster_mask,
+ __entry->holemask,
+ __entry->cluster_ino)
+)
+
/* repair tracepoints */
#if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR)
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 55f3e194a852..3619e9e8d359 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -28,7 +28,8 @@
*/
struct xfs_writepage_ctx {
struct xfs_bmbt_irec imap;
- unsigned int io_type;
+ int fork;
+ unsigned int data_seq;
unsigned int cow_seq;
struct xfs_ioend *ioend;
};
@@ -256,30 +257,20 @@ xfs_end_io(
*/
error = blk_status_to_errno(ioend->io_bio->bi_status);
if (unlikely(error)) {
- switch (ioend->io_type) {
- case XFS_IO_COW:
+ if (ioend->io_fork == XFS_COW_FORK)
xfs_reflink_cancel_cow_range(ip, offset, size, true);
- break;
- }
-
goto done;
}
/*
- * Success: commit the COW or unwritten blocks if needed.
+ * Success: commit the COW or unwritten blocks if needed.
*/
- switch (ioend->io_type) {
- case XFS_IO_COW:
+ if (ioend->io_fork == XFS_COW_FORK)
error = xfs_reflink_end_cow(ip, offset, size);
- break;
- case XFS_IO_UNWRITTEN:
- /* writeback should never update isize */
+ else if (ioend->io_state == XFS_EXT_UNWRITTEN)
error = xfs_iomap_write_unwritten(ip, offset, size, false);
- break;
- default:
+ else
ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_append_trans);
- break;
- }
done:
if (ioend->io_append_trans)
@@ -294,7 +285,8 @@ xfs_end_bio(
struct xfs_ioend *ioend = bio->bi_private;
struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount;
- if (ioend->io_type == XFS_IO_UNWRITTEN || ioend->io_type == XFS_IO_COW)
+ if (ioend->io_fork == XFS_COW_FORK ||
+ ioend->io_state == XFS_EXT_UNWRITTEN)
queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
else if (ioend->io_append_trans)
queue_work(mp->m_data_workqueue, &ioend->io_work);
@@ -302,6 +294,75 @@ xfs_end_bio(
xfs_destroy_ioend(ioend, blk_status_to_errno(bio->bi_status));
}
+/*
+ * Fast revalidation of the cached writeback mapping. Return true if the current
+ * mapping is valid, false otherwise.
+ */
+static bool
+xfs_imap_valid(
+ struct xfs_writepage_ctx *wpc,
+ struct xfs_inode *ip,
+ xfs_fileoff_t offset_fsb)
+{
+ if (offset_fsb < wpc->imap.br_startoff ||
+ offset_fsb >= wpc->imap.br_startoff + wpc->imap.br_blockcount)
+ return false;
+ /*
+ * If this is a COW mapping, it is sufficient to check that the mapping
+ * covers the offset. Be careful to check this first because the caller
+ * can revalidate a COW mapping without updating the data seqno.
+ */
+ if (wpc->fork == XFS_COW_FORK)
+ return true;
+
+ /*
+ * This is not a COW mapping. Check the sequence number of the data fork
+ * because concurrent changes could have invalidated the extent. Check
+ * the COW fork because concurrent changes since the last time we
+ * checked (and found nothing at this offset) could have added
+ * overlapping blocks.
+ */
+ if (wpc->data_seq != READ_ONCE(ip->i_df.if_seq))
+ return false;
+ if (xfs_inode_has_cow_data(ip) &&
+ wpc->cow_seq != READ_ONCE(ip->i_cowfp->if_seq))
+ return false;
+ return true;
+}
+
+/*
+ * Pass in a dellalloc extent and convert it to real extents, return the real
+ * extent that maps offset_fsb in wpc->imap.
+ *
+ * The current page is held locked so nothing could have removed the block
+ * backing offset_fsb, although it could have moved from the COW to the data
+ * fork by another thread.
+ */
+static int
+xfs_convert_blocks(
+ struct xfs_writepage_ctx *wpc,
+ struct xfs_inode *ip,
+ xfs_fileoff_t offset_fsb)
+{
+ int error;
+
+ /*
+ * Attempt to allocate whatever delalloc extent currently backs
+ * offset_fsb and put the result into wpc->imap. Allocate in a loop
+ * because it may take several attempts to allocate real blocks for a
+ * contiguous delalloc extent if free space is sufficiently fragmented.
+ */
+ do {
+ error = xfs_bmapi_convert_delalloc(ip, wpc->fork, offset_fsb,
+ &wpc->imap, wpc->fork == XFS_COW_FORK ?
+ &wpc->cow_seq : &wpc->data_seq);
+ if (error)
+ return error;
+ } while (wpc->imap.br_startoff + wpc->imap.br_blockcount <= offset_fsb);
+
+ return 0;
+}
+
STATIC int
xfs_map_blocks(
struct xfs_writepage_ctx *wpc,
@@ -311,26 +372,16 @@ xfs_map_blocks(
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
ssize_t count = i_blocksize(inode);
- xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset), end_fsb;
+ xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count);
xfs_fileoff_t cow_fsb = NULLFILEOFF;
struct xfs_bmbt_irec imap;
- int whichfork = XFS_DATA_FORK;
struct xfs_iext_cursor icur;
- bool imap_valid;
+ int retries = 0;
int error = 0;
- /*
- * We have to make sure the cached mapping is within EOF to protect
- * against eofblocks trimming on file release leaving us with a stale
- * mapping. Otherwise, a page for a subsequent file extending buffered
- * write could get picked up by this writeback cycle and written to the
- * wrong blocks.
- *
- * Note that what we really want here is a generic mapping invalidation
- * mechanism to protect us from arbitrary extent modifying contexts, not
- * just eofblocks.
- */
- xfs_trim_extent_eof(&wpc->imap, ip);
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -EIO;
/*
* COW fork blocks can overlap data fork blocks even if the blocks
@@ -347,31 +398,19 @@ xfs_map_blocks(
* against concurrent updates and provides a memory barrier on the way
* out that ensures that we always see the current value.
*/
- imap_valid = offset_fsb >= wpc->imap.br_startoff &&
- offset_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount;
- if (imap_valid &&
- (!xfs_inode_has_cow_data(ip) ||
- wpc->io_type == XFS_IO_COW ||
- wpc->cow_seq == READ_ONCE(ip->i_cowfp->if_seq)))
+ if (xfs_imap_valid(wpc, ip, offset_fsb))
return 0;
- if (XFS_FORCED_SHUTDOWN(mp))
- return -EIO;
-
/*
* If we don't have a valid map, now it's time to get a new one for this
* offset. This will convert delayed allocations (including COW ones)
* into real extents. If we return without a valid map, it means we
* landed in a hole and we skip the block.
*/
+retry:
xfs_ilock(ip, XFS_ILOCK_SHARED);
ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
(ip->i_df.if_flags & XFS_IFEXTENTS));
- ASSERT(offset <= mp->m_super->s_maxbytes);
-
- if (offset > mp->m_super->s_maxbytes - count)
- count = mp->m_super->s_maxbytes - offset;
- end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
/*
* Check if this is offset is covered by a COW extents, and if yes use
@@ -383,30 +422,16 @@ xfs_map_blocks(
if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
wpc->cow_seq = READ_ONCE(ip->i_cowfp->if_seq);
xfs_iunlock(ip, XFS_ILOCK_SHARED);
- /*
- * Truncate can race with writeback since writeback doesn't
- * take the iolock and truncate decreases the file size before
- * it starts truncating the pages between new_size and old_size.
- * Therefore, we can end up in the situation where writeback
- * gets a CoW fork mapping but the truncate makes the mapping
- * invalid and we end up in here trying to get a new mapping.
- * bail out here so that we simply never get a valid mapping
- * and so we drop the write altogether. The page truncation
- * will kill the contents anyway.
- */
- if (offset > i_size_read(inode)) {
- wpc->io_type = XFS_IO_HOLE;
- return 0;
- }
- whichfork = XFS_COW_FORK;
- wpc->io_type = XFS_IO_COW;
+
+ wpc->fork = XFS_COW_FORK;
goto allocate_blocks;
}
/*
- * Map valid and no COW extent in the way? We're done.
+ * No COW extent overlap. Revalidate now that we may have updated
+ * ->cow_seq. If the data mapping is still valid, we're done.
*/
- if (imap_valid) {
+ if (xfs_imap_valid(wpc, ip, offset_fsb)) {
xfs_iunlock(ip, XFS_ILOCK_SHARED);
return 0;
}
@@ -418,51 +443,65 @@ xfs_map_blocks(
*/
if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap))
imap.br_startoff = end_fsb; /* fake a hole past EOF */
+ wpc->data_seq = READ_ONCE(ip->i_df.if_seq);
xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ wpc->fork = XFS_DATA_FORK;
+
+ /* landed in a hole or beyond EOF? */
if (imap.br_startoff > offset_fsb) {
- /* landed in a hole or beyond EOF */
imap.br_blockcount = imap.br_startoff - offset_fsb;
imap.br_startoff = offset_fsb;
imap.br_startblock = HOLESTARTBLOCK;
- wpc->io_type = XFS_IO_HOLE;
- } else {
- /*
- * Truncate to the next COW extent if there is one. This is the
- * only opportunity to do this because we can skip COW fork
- * lookups for the subsequent blocks in the mapping; however,
- * the requirement to treat the COW range separately remains.
- */
- if (cow_fsb != NULLFILEOFF &&
- cow_fsb < imap.br_startoff + imap.br_blockcount)
- imap.br_blockcount = cow_fsb - imap.br_startoff;
-
- if (isnullstartblock(imap.br_startblock)) {
- /* got a delalloc extent */
- wpc->io_type = XFS_IO_DELALLOC;
- goto allocate_blocks;
- }
-
- if (imap.br_state == XFS_EXT_UNWRITTEN)
- wpc->io_type = XFS_IO_UNWRITTEN;
- else
- wpc->io_type = XFS_IO_OVERWRITE;
+ imap.br_state = XFS_EXT_NORM;
}
+ /*
+ * Truncate to the next COW extent if there is one. This is the only
+ * opportunity to do this because we can skip COW fork lookups for the
+ * subsequent blocks in the mapping; however, the requirement to treat
+ * the COW range separately remains.
+ */
+ if (cow_fsb != NULLFILEOFF &&
+ cow_fsb < imap.br_startoff + imap.br_blockcount)
+ imap.br_blockcount = cow_fsb - imap.br_startoff;
+
+ /* got a delalloc extent? */
+ if (imap.br_startblock != HOLESTARTBLOCK &&
+ isnullstartblock(imap.br_startblock))
+ goto allocate_blocks;
+
wpc->imap = imap;
- xfs_trim_extent_eof(&wpc->imap, ip);
- trace_xfs_map_blocks_found(ip, offset, count, wpc->io_type, &imap);
+ trace_xfs_map_blocks_found(ip, offset, count, wpc->fork, &imap);
return 0;
allocate_blocks:
- error = xfs_iomap_write_allocate(ip, whichfork, offset, &imap,
- &wpc->cow_seq);
- if (error)
+ error = xfs_convert_blocks(wpc, ip, offset_fsb);
+ if (error) {
+ /*
+ * If we failed to find the extent in the COW fork we might have
+ * raced with a COW to data fork conversion or truncate.
+ * Restart the lookup to catch the extent in the data fork for
+ * the former case, but prevent additional retries to avoid
+ * looping forever for the latter case.
+ */
+ if (error == -EAGAIN && wpc->fork == XFS_COW_FORK && !retries++)
+ goto retry;
+ ASSERT(error != -EAGAIN);
return error;
- ASSERT(whichfork == XFS_COW_FORK || cow_fsb == NULLFILEOFF ||
- imap.br_startoff + imap.br_blockcount <= cow_fsb);
- wpc->imap = imap;
- xfs_trim_extent_eof(&wpc->imap, ip);
- trace_xfs_map_blocks_alloc(ip, offset, count, wpc->io_type, &imap);
+ }
+
+ /*
+ * Due to merging the return real extent might be larger than the
+ * original delalloc one. Trim the return extent to the next COW
+ * boundary again to force a re-lookup.
+ */
+ if (wpc->fork != XFS_COW_FORK && cow_fsb != NULLFILEOFF &&
+ cow_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount)
+ wpc->imap.br_blockcount = cow_fsb - wpc->imap.br_startoff;
+
+ ASSERT(wpc->imap.br_startoff <= offset_fsb);
+ ASSERT(wpc->imap.br_startoff + wpc->imap.br_blockcount > offset_fsb);
+ trace_xfs_map_blocks_alloc(ip, offset, count, wpc->fork, &imap);
return 0;
}
@@ -487,7 +526,7 @@ xfs_submit_ioend(
int status)
{
/* Convert CoW extents to regular */
- if (!status && ioend->io_type == XFS_IO_COW) {
+ if (!status && ioend->io_fork == XFS_COW_FORK) {
/*
* Yuk. This can do memory allocation, but is not a
* transactional operation so everything is done in GFP_KERNEL
@@ -505,7 +544,8 @@ xfs_submit_ioend(
/* Reserve log space if we might write beyond the on-disk inode size. */
if (!status &&
- ioend->io_type != XFS_IO_UNWRITTEN &&
+ (ioend->io_fork == XFS_COW_FORK ||
+ ioend->io_state != XFS_EXT_UNWRITTEN) &&
xfs_ioend_is_append(ioend) &&
!ioend->io_append_trans)
status = xfs_setfilesize_trans_alloc(ioend);
@@ -534,7 +574,8 @@ xfs_submit_ioend(
static struct xfs_ioend *
xfs_alloc_ioend(
struct inode *inode,
- unsigned int type,
+ int fork,
+ xfs_exntst_t state,
xfs_off_t offset,
struct block_device *bdev,
sector_t sector)
@@ -548,7 +589,8 @@ xfs_alloc_ioend(
ioend = container_of(bio, struct xfs_ioend, io_inline_bio);
INIT_LIST_HEAD(&ioend->io_list);
- ioend->io_type = type;
+ ioend->io_fork = fork;
+ ioend->io_state = state;
ioend->io_inode = inode;
ioend->io_size = 0;
ioend->io_offset = offset;
@@ -609,13 +651,15 @@ xfs_add_to_ioend(
sector = xfs_fsb_to_db(ip, wpc->imap.br_startblock) +
((offset - XFS_FSB_TO_B(mp, wpc->imap.br_startoff)) >> 9);
- if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type ||
+ if (!wpc->ioend ||
+ wpc->fork != wpc->ioend->io_fork ||
+ wpc->imap.br_state != wpc->ioend->io_state ||
sector != bio_end_sector(wpc->ioend->io_bio) ||
offset != wpc->ioend->io_offset + wpc->ioend->io_size) {
if (wpc->ioend)
list_add(&wpc->ioend->io_list, iolist);
- wpc->ioend = xfs_alloc_ioend(inode, wpc->io_type, offset,
- bdev, sector);
+ wpc->ioend = xfs_alloc_ioend(inode, wpc->fork,
+ wpc->imap.br_state, offset, bdev, sector);
}
if (!__bio_try_merge_page(wpc->ioend->io_bio, page, len, poff, true)) {
@@ -724,7 +768,7 @@ xfs_writepage_map(
error = xfs_map_blocks(wpc, inode, file_offset);
if (error)
break;
- if (wpc->io_type == XFS_IO_HOLE)
+ if (wpc->imap.br_startblock == HOLESTARTBLOCK)
continue;
xfs_add_to_ioend(inode, file_offset, page, iop, wpc, wbc,
&submit_list);
@@ -919,9 +963,7 @@ xfs_vm_writepage(
struct page *page,
struct writeback_control *wbc)
{
- struct xfs_writepage_ctx wpc = {
- .io_type = XFS_IO_HOLE,
- };
+ struct xfs_writepage_ctx wpc = { };
int ret;
ret = xfs_do_writepage(page, wbc, &wpc);
@@ -935,9 +977,7 @@ xfs_vm_writepages(
struct address_space *mapping,
struct writeback_control *wbc)
{
- struct xfs_writepage_ctx wpc = {
- .io_type = XFS_IO_HOLE,
- };
+ struct xfs_writepage_ctx wpc = { };
int ret;
xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
@@ -984,7 +1024,7 @@ xfs_vm_bmap(
* Since we don't pass back blockdev info, we can't return bmap
* information for rt files either.
*/
- if (xfs_is_reflink_inode(ip) || XFS_IS_REALTIME_INODE(ip))
+ if (xfs_is_cow_inode(ip) || XFS_IS_REALTIME_INODE(ip))
return 0;
return iomap_bmap(mapping, block, &xfs_iomap_ops);
}
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index e5c23948a8ab..6c2615b83c5d 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -9,32 +9,12 @@
extern struct bio_set xfs_ioend_bioset;
/*
- * Types of I/O for bmap clustering and I/O completion tracking.
- *
- * This enum is used in string mapping in xfs_trace.h; please keep the
- * TRACE_DEFINE_ENUMs for it up to date.
- */
-enum {
- XFS_IO_HOLE, /* covers region without any block allocation */
- XFS_IO_DELALLOC, /* covers delalloc region */
- XFS_IO_UNWRITTEN, /* covers allocated but uninitialized data */
- XFS_IO_OVERWRITE, /* covers already allocated extent */
- XFS_IO_COW, /* covers copy-on-write extent */
-};
-
-#define XFS_IO_TYPES \
- { XFS_IO_HOLE, "hole" }, \
- { XFS_IO_DELALLOC, "delalloc" }, \
- { XFS_IO_UNWRITTEN, "unwritten" }, \
- { XFS_IO_OVERWRITE, "overwrite" }, \
- { XFS_IO_COW, "CoW" }
-
-/*
* Structure for buffered I/O completions.
*/
struct xfs_ioend {
struct list_head io_list; /* next ioend in chain */
- unsigned int io_type; /* delalloc / unwritten */
+ int io_fork; /* inode fork written back */
+ xfs_exntst_t io_state; /* extent state */
struct inode *io_inode; /* file being written to */
size_t io_size; /* size of the extent */
xfs_off_t io_offset; /* offset in the file */
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index a58034049995..3d213a7394c5 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -555,6 +555,7 @@ xfs_attr_put_listent(
attrlist_ent_t *aep;
int arraytop;
+ ASSERT(!context->seen_enough);
ASSERT(!(context->flags & ATTR_KERNOVAL));
ASSERT(context->count >= 0);
ASSERT(context->count < (ATTR_MAX_VALUELEN/8));
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 1ee8c5539fa4..2db43ff4f8b5 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1162,16 +1162,13 @@ xfs_zero_file_space(
* by virtue of the hole punch.
*/
error = xfs_free_file_space(ip, offset, len);
- if (error)
- goto out;
+ if (error || xfs_is_always_cow_inode(ip))
+ return error;
- error = xfs_alloc_file_space(ip, round_down(offset, blksize),
+ return xfs_alloc_file_space(ip, round_down(offset, blksize),
round_up(offset + len, blksize) -
round_down(offset, blksize),
XFS_BMAPI_PREALLOC);
-out:
- return error;
-
}
static int
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 4f5f2ff3f70f..548344e25128 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -776,29 +776,24 @@ _xfs_buf_read(
}
/*
- * Set buffer ops on an unchecked buffer and validate it, if possible.
+ * Reverify a buffer found in cache without an attached ->b_ops.
*
- * If the caller passed in an ops structure and the buffer doesn't have ops
- * assigned, set the ops and use them to verify the contents. If the contents
- * cannot be verified, we'll clear XBF_DONE. We assume the buffer has no
- * recorded errors and is already in XBF_DONE state.
+ * If the caller passed an ops structure and the buffer doesn't have ops
+ * assigned, set the ops and use it to verify the contents. If verification
+ * fails, clear XBF_DONE. We assume the buffer has no recorded errors and is
+ * already in XBF_DONE state on entry.
*
- * Under normal operations, every in-core buffer must have buffer ops assigned
- * to them when the buffer is read in from disk so that we can validate the
- * metadata.
- *
- * However, there are two scenarios where one can encounter in-core buffers
- * that don't have buffer ops. The first is during log recovery of buffers on
- * a V4 filesystem, though these buffers are purged at the end of recovery.
- *
- * The other is online repair, which tries to match arbitrary metadata blocks
- * with btree types in order to find the root. If online repair doesn't match
- * the buffer with /any/ btree type, the buffer remains in memory in DONE state
- * with no ops, and a subsequent read_buf call from elsewhere will not set the
- * ops. This function helps us fix this situation.
+ * Under normal operations, every in-core buffer is verified on read I/O
+ * completion. There are two scenarios that can lead to in-core buffers without
+ * an assigned ->b_ops. The first is during log recovery of buffers on a V4
+ * filesystem, though these buffers are purged at the end of recovery. The
+ * other is online repair, which intentionally reads with a NULL buffer ops to
+ * run several verifiers across an in-core buffer in order to establish buffer
+ * type. If repair can't establish that, the buffer will be left in memory
+ * with NULL buffer ops.
*/
int
-xfs_buf_ensure_ops(
+xfs_buf_reverify(
struct xfs_buf *bp,
const struct xfs_buf_ops *ops)
{
@@ -840,7 +835,7 @@ xfs_buf_read_map(
return bp;
}
- xfs_buf_ensure_ops(bp, ops);
+ xfs_buf_reverify(bp, ops);
if (flags & XBF_ASYNC) {
/*
@@ -2209,3 +2204,40 @@ void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref)
atomic_set(&bp->b_lru_ref, lru_ref);
}
+
+/*
+ * Verify an on-disk magic value against the magic value specified in the
+ * verifier structure. The verifier magic is in disk byte order so the caller is
+ * expected to pass the value directly from disk.
+ */
+bool
+xfs_verify_magic(
+ struct xfs_buf *bp,
+ __be32 dmagic)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ int idx;
+
+ idx = xfs_sb_version_hascrc(&mp->m_sb);
+ if (unlikely(WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx])))
+ return false;
+ return dmagic == bp->b_ops->magic[idx];
+}
+/*
+ * Verify an on-disk magic value against the magic value specified in the
+ * verifier structure. The verifier magic is in disk byte order so the caller is
+ * expected to pass the value directly from disk.
+ */
+bool
+xfs_verify_magic16(
+ struct xfs_buf *bp,
+ __be16 dmagic)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ int idx;
+
+ idx = xfs_sb_version_hascrc(&mp->m_sb);
+ if (unlikely(WARN_ON(!bp->b_ops || !bp->b_ops->magic16[idx])))
+ return false;
+ return dmagic == bp->b_ops->magic16[idx];
+}
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index b9f5511ea998..d0b96e071cec 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -125,6 +125,10 @@ struct xfs_buf_map {
struct xfs_buf_ops {
char *name;
+ union {
+ __be32 magic[2]; /* v4 and v5 on disk magic values */
+ __be16 magic16[2]; /* v4 and v5 on disk magic values */
+ };
void (*verify_read)(struct xfs_buf *);
void (*verify_write)(struct xfs_buf *);
xfs_failaddr_t (*verify_struct)(struct xfs_buf *bp);
@@ -385,6 +389,8 @@ extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int);
#define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev)
#define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev)
-int xfs_buf_ensure_ops(struct xfs_buf *bp, const struct xfs_buf_ops *ops);
+int xfs_buf_reverify(struct xfs_buf *bp, const struct xfs_buf_ops *ops);
+bool xfs_verify_magic(struct xfs_buf *bp, __be32 dmagic);
+bool xfs_verify_magic16(struct xfs_buf *bp, __be16 dmagic);
#endif /* __XFS_BUF_H__ */
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 9866f542e77b..a1e177f66404 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -51,6 +51,7 @@ static unsigned int xfs_errortag_random_default[] = {
XFS_RANDOM_BUF_LRU_REF,
XFS_RANDOM_FORCE_SCRUB_REPAIR,
XFS_RANDOM_FORCE_SUMMARY_RECALC,
+ XFS_RANDOM_IUNLINK_FALLBACK,
};
struct xfs_errortag_attr {
@@ -159,6 +160,7 @@ XFS_ERRORTAG_ATTR_RW(log_item_pin, XFS_ERRTAG_LOG_ITEM_PIN);
XFS_ERRORTAG_ATTR_RW(buf_lru_ref, XFS_ERRTAG_BUF_LRU_REF);
XFS_ERRORTAG_ATTR_RW(force_repair, XFS_ERRTAG_FORCE_SCRUB_REPAIR);
XFS_ERRORTAG_ATTR_RW(bad_summary, XFS_ERRTAG_FORCE_SUMMARY_RECALC);
+XFS_ERRORTAG_ATTR_RW(iunlink_fallback, XFS_ERRTAG_IUNLINK_FALLBACK);
static struct attribute *xfs_errortag_attrs[] = {
XFS_ERRORTAG_ATTR_LIST(noerror),
@@ -195,6 +197,7 @@ static struct attribute *xfs_errortag_attrs[] = {
XFS_ERRORTAG_ATTR_LIST(buf_lru_ref),
XFS_ERRORTAG_ATTR_LIST(force_repair),
XFS_ERRORTAG_ATTR_LIST(bad_summary),
+ XFS_ERRORTAG_ATTR_LIST(iunlink_fallback),
NULL,
};
@@ -357,7 +360,8 @@ xfs_buf_verifier_error(
fa = failaddr ? failaddr : __return_address;
__xfs_buf_ioerror(bp, error, fa);
- xfs_alert(mp, "Metadata %s detected at %pS, %s block 0x%llx %s",
+ xfs_alert_tag(mp, XFS_PTAG_VERIFIER_ERROR,
+ "Metadata %s detected at %pS, %s block 0x%llx %s",
bp->b_error == -EFSBADCRC ? "CRC error" : "corruption",
fa, bp->b_ops->name, bp->b_bn, name);
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 246d3e989c6c..602aa7d62b66 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -98,5 +98,6 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp);
#define XFS_PTAG_SHUTDOWN_IOERROR 0x00000020
#define XFS_PTAG_SHUTDOWN_LOGERROR 0x00000040
#define XFS_PTAG_FSBLOCK_ZERO 0x00000080
+#define XFS_PTAG_VERIFIER_ERROR 0x00000100
#endif /* __XFS_ERROR_H__ */
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 60c2da41f0fc..1f2e2845eb76 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -507,7 +507,7 @@ xfs_file_dio_aio_write(
* We can't properly handle unaligned direct I/O to reflink
* files yet, as we can't unshare a partial block.
*/
- if (xfs_is_reflink_inode(ip)) {
+ if (xfs_is_cow_inode(ip)) {
trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count);
return -EREMCHG;
}
@@ -872,14 +872,27 @@ xfs_file_fallocate(
goto out_unlock;
}
- if (mode & FALLOC_FL_ZERO_RANGE)
+ if (mode & FALLOC_FL_ZERO_RANGE) {
error = xfs_zero_file_space(ip, offset, len);
- else {
- if (mode & FALLOC_FL_UNSHARE_RANGE) {
- error = xfs_reflink_unshare(ip, offset, len);
- if (error)
- goto out_unlock;
+ } else if (mode & FALLOC_FL_UNSHARE_RANGE) {
+ error = xfs_reflink_unshare(ip, offset, len);
+ if (error)
+ goto out_unlock;
+
+ if (!xfs_is_always_cow_inode(ip)) {
+ error = xfs_alloc_file_space(ip, offset, len,
+ XFS_BMAPI_PREALLOC);
}
+ } else {
+ /*
+ * If always_cow mode we can't use preallocations and
+ * thus should not create them.
+ */
+ if (xfs_is_always_cow_inode(ip)) {
+ error = -EOPNOTSUPP;
+ goto out_unlock;
+ }
+
error = xfs_alloc_file_space(ip, offset, len,
XFS_BMAPI_PREALLOC);
}
@@ -1068,10 +1081,10 @@ xfs_file_llseek(
default:
return generic_file_llseek(file, offset, whence);
case SEEK_HOLE:
- offset = iomap_seek_hole(inode, offset, &xfs_iomap_ops);
+ offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops);
break;
case SEEK_DATA:
- offset = iomap_seek_data(inode, offset, &xfs_iomap_ops);
+ offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops);
break;
}
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index f3ef70c542e1..584648582ba7 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -533,6 +533,7 @@ xfs_fs_reserve_ag_blocks(
int error = 0;
int err2;
+ mp->m_finobt_nores = false;
for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
pag = xfs_perag_get(mp, agno);
err2 = xfs_ag_resv_init(pag, NULL);
diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c
index 5169e84ae382..d0d377384120 100644
--- a/fs/xfs/xfs_globals.c
+++ b/fs/xfs/xfs_globals.c
@@ -16,7 +16,7 @@ xfs_param_t xfs_params = {
/* MIN DFLT MAX */
.sgid_inherit = { 0, 0, 1 },
.symlink_mode = { 0, 0, 1 },
- .panic_mask = { 0, 0, 255 },
+ .panic_mask = { 0, 0, 256 },
.error_level = { 0, 3, 11 },
.syncd_timer = { 1*100, 30*100, 7200*100},
.stats_clear = { 0, 0, 1 },
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index ae667ba74a1c..f643a9295179 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1332,7 +1332,7 @@ xfs_create_tmpfile(
if (error)
goto out_trans_cancel;
- error = xfs_dir_ialloc(&tp, dp, mode, 1, 0, prid, &ip);
+ error = xfs_dir_ialloc(&tp, dp, mode, 0, 0, prid, &ip);
if (error)
goto out_trans_cancel;
@@ -1754,7 +1754,7 @@ xfs_inactive_ifree(
* now remains allocated and sits on the unlinked list until the fs is
* repaired.
*/
- if (unlikely(mp->m_inotbt_nores)) {
+ if (unlikely(mp->m_finobt_nores)) {
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree,
XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE,
&tp);
@@ -1907,86 +1907,510 @@ xfs_inactive(
}
/*
- * This is called when the inode's link count goes to 0 or we are creating a
- * tmpfile via O_TMPFILE. In the case of a tmpfile, @ignore_linkcount will be
- * set to true as the link count is dropped to zero by the VFS after we've
- * created the file successfully, so we have to add it to the unlinked list
- * while the link count is non-zero.
+ * In-Core Unlinked List Lookups
+ * =============================
+ *
+ * Every inode is supposed to be reachable from some other piece of metadata
+ * with the exception of the root directory. Inodes with a connection to a
+ * file descriptor but not linked from anywhere in the on-disk directory tree
+ * are collectively known as unlinked inodes, though the filesystem itself
+ * maintains links to these inodes so that on-disk metadata are consistent.
+ *
+ * XFS implements a per-AG on-disk hash table of unlinked inodes. The AGI
+ * header contains a number of buckets that point to an inode, and each inode
+ * record has a pointer to the next inode in the hash chain. This
+ * singly-linked list causes scaling problems in the iunlink remove function
+ * because we must walk that list to find the inode that points to the inode
+ * being removed from the unlinked hash bucket list.
+ *
+ * What if we modelled the unlinked list as a collection of records capturing
+ * "X.next_unlinked = Y" relations? If we indexed those records on Y, we'd
+ * have a fast way to look up unlinked list predecessors, which avoids the
+ * slow list walk. That's exactly what we do here (in-core) with a per-AG
+ * rhashtable.
+ *
+ * Because this is a backref cache, we ignore operational failures since the
+ * iunlink code can fall back to the slow bucket walk. The only errors that
+ * should bubble out are for obviously incorrect situations.
+ *
+ * All users of the backref cache MUST hold the AGI buffer lock to serialize
+ * access or have otherwise provided for concurrency control.
+ */
+
+/* Capture a "X.next_unlinked = Y" relationship. */
+struct xfs_iunlink {
+ struct rhash_head iu_rhash_head;
+ xfs_agino_t iu_agino; /* X */
+ xfs_agino_t iu_next_unlinked; /* Y */
+};
+
+/* Unlinked list predecessor lookup hashtable construction */
+static int
+xfs_iunlink_obj_cmpfn(
+ struct rhashtable_compare_arg *arg,
+ const void *obj)
+{
+ const xfs_agino_t *key = arg->key;
+ const struct xfs_iunlink *iu = obj;
+
+ if (iu->iu_next_unlinked != *key)
+ return 1;
+ return 0;
+}
+
+static const struct rhashtable_params xfs_iunlink_hash_params = {
+ .min_size = XFS_AGI_UNLINKED_BUCKETS,
+ .key_len = sizeof(xfs_agino_t),
+ .key_offset = offsetof(struct xfs_iunlink,
+ iu_next_unlinked),
+ .head_offset = offsetof(struct xfs_iunlink, iu_rhash_head),
+ .automatic_shrinking = true,
+ .obj_cmpfn = xfs_iunlink_obj_cmpfn,
+};
+
+/*
+ * Return X, where X.next_unlinked == @agino. Returns NULLAGINO if no such
+ * relation is found.
+ */
+static xfs_agino_t
+xfs_iunlink_lookup_backref(
+ struct xfs_perag *pag,
+ xfs_agino_t agino)
+{
+ struct xfs_iunlink *iu;
+
+ iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino,
+ xfs_iunlink_hash_params);
+ return iu ? iu->iu_agino : NULLAGINO;
+}
+
+/*
+ * Take ownership of an iunlink cache entry and insert it into the hash table.
+ * If successful, the entry will be owned by the cache; if not, it is freed.
+ * Either way, the caller does not own @iu after this call.
+ */
+static int
+xfs_iunlink_insert_backref(
+ struct xfs_perag *pag,
+ struct xfs_iunlink *iu)
+{
+ int error;
+
+ error = rhashtable_insert_fast(&pag->pagi_unlinked_hash,
+ &iu->iu_rhash_head, xfs_iunlink_hash_params);
+ /*
+ * Fail loudly if there already was an entry because that's a sign of
+ * corruption of in-memory data. Also fail loudly if we see an error
+ * code we didn't anticipate from the rhashtable code. Currently we
+ * only anticipate ENOMEM.
+ */
+ if (error) {
+ WARN(error != -ENOMEM, "iunlink cache insert error %d", error);
+ kmem_free(iu);
+ }
+ /*
+ * Absorb any runtime errors that aren't a result of corruption because
+ * this is a cache and we can always fall back to bucket list scanning.
+ */
+ if (error != 0 && error != -EEXIST)
+ error = 0;
+ return error;
+}
+
+/* Remember that @prev_agino.next_unlinked = @this_agino. */
+static int
+xfs_iunlink_add_backref(
+ struct xfs_perag *pag,
+ xfs_agino_t prev_agino,
+ xfs_agino_t this_agino)
+{
+ struct xfs_iunlink *iu;
+
+ if (XFS_TEST_ERROR(false, pag->pag_mount, XFS_ERRTAG_IUNLINK_FALLBACK))
+ return 0;
+
+ iu = kmem_zalloc(sizeof(*iu), KM_SLEEP | KM_NOFS);
+ iu->iu_agino = prev_agino;
+ iu->iu_next_unlinked = this_agino;
+
+ return xfs_iunlink_insert_backref(pag, iu);
+}
+
+/*
+ * Replace X.next_unlinked = @agino with X.next_unlinked = @next_unlinked.
+ * If @next_unlinked is NULLAGINO, we drop the backref and exit. If there
+ * wasn't any such entry then we don't bother.
+ */
+static int
+xfs_iunlink_change_backref(
+ struct xfs_perag *pag,
+ xfs_agino_t agino,
+ xfs_agino_t next_unlinked)
+{
+ struct xfs_iunlink *iu;
+ int error;
+
+ /* Look up the old entry; if there wasn't one then exit. */
+ iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino,
+ xfs_iunlink_hash_params);
+ if (!iu)
+ return 0;
+
+ /*
+ * Remove the entry. This shouldn't ever return an error, but if we
+ * couldn't remove the old entry we don't want to add it again to the
+ * hash table, and if the entry disappeared on us then someone's
+ * violated the locking rules and we need to fail loudly. Either way
+ * we cannot remove the inode because internal state is or would have
+ * been corrupt.
+ */
+ error = rhashtable_remove_fast(&pag->pagi_unlinked_hash,
+ &iu->iu_rhash_head, xfs_iunlink_hash_params);
+ if (error)
+ return error;
+
+ /* If there is no new next entry just free our item and return. */
+ if (next_unlinked == NULLAGINO) {
+ kmem_free(iu);
+ return 0;
+ }
+
+ /* Update the entry and re-add it to the hash table. */
+ iu->iu_next_unlinked = next_unlinked;
+ return xfs_iunlink_insert_backref(pag, iu);
+}
+
+/* Set up the in-core predecessor structures. */
+int
+xfs_iunlink_init(
+ struct xfs_perag *pag)
+{
+ return rhashtable_init(&pag->pagi_unlinked_hash,
+ &xfs_iunlink_hash_params);
+}
+
+/* Free the in-core predecessor structures. */
+static void
+xfs_iunlink_free_item(
+ void *ptr,
+ void *arg)
+{
+ struct xfs_iunlink *iu = ptr;
+ bool *freed_anything = arg;
+
+ *freed_anything = true;
+ kmem_free(iu);
+}
+
+void
+xfs_iunlink_destroy(
+ struct xfs_perag *pag)
+{
+ bool freed_anything = false;
+
+ rhashtable_free_and_destroy(&pag->pagi_unlinked_hash,
+ xfs_iunlink_free_item, &freed_anything);
+
+ ASSERT(freed_anything == false || XFS_FORCED_SHUTDOWN(pag->pag_mount));
+}
+
+/*
+ * Point the AGI unlinked bucket at an inode and log the results. The caller
+ * is responsible for validating the old value.
+ */
+STATIC int
+xfs_iunlink_update_bucket(
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ struct xfs_buf *agibp,
+ unsigned int bucket_index,
+ xfs_agino_t new_agino)
+{
+ struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp);
+ xfs_agino_t old_value;
+ int offset;
+
+ ASSERT(xfs_verify_agino_or_null(tp->t_mountp, agno, new_agino));
+
+ old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]);
+ trace_xfs_iunlink_update_bucket(tp->t_mountp, agno, bucket_index,
+ old_value, new_agino);
+
+ /*
+ * We should never find the head of the list already set to the value
+ * passed in because either we're adding or removing ourselves from the
+ * head of the list.
+ */
+ if (old_value == new_agino)
+ return -EFSCORRUPTED;
+
+ agi->agi_unlinked[bucket_index] = cpu_to_be32(new_agino);
+ offset = offsetof(struct xfs_agi, agi_unlinked) +
+ (sizeof(xfs_agino_t) * bucket_index);
+ xfs_trans_log_buf(tp, agibp, offset, offset + sizeof(xfs_agino_t) - 1);
+ return 0;
+}
+
+/* Set an on-disk inode's next_unlinked pointer. */
+STATIC void
+xfs_iunlink_update_dinode(
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ xfs_agino_t agino,
+ struct xfs_buf *ibp,
+ struct xfs_dinode *dip,
+ struct xfs_imap *imap,
+ xfs_agino_t next_agino)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ int offset;
+
+ ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino));
+
+ trace_xfs_iunlink_update_dinode(mp, agno, agino,
+ be32_to_cpu(dip->di_next_unlinked), next_agino);
+
+ dip->di_next_unlinked = cpu_to_be32(next_agino);
+ offset = imap->im_boffset +
+ offsetof(struct xfs_dinode, di_next_unlinked);
+
+ /* need to recalc the inode CRC if appropriate */
+ xfs_dinode_calc_crc(mp, dip);
+ xfs_trans_inode_buf(tp, ibp);
+ xfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1);
+ xfs_inobp_check(mp, ibp);
+}
+
+/* Set an in-core inode's unlinked pointer and return the old value. */
+STATIC int
+xfs_iunlink_update_inode(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ xfs_agnumber_t agno,
+ xfs_agino_t next_agino,
+ xfs_agino_t *old_next_agino)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_dinode *dip;
+ struct xfs_buf *ibp;
+ xfs_agino_t old_value;
+ int error;
+
+ ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino));
+
+ error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, 0, 0);
+ if (error)
+ return error;
+
+ /* Make sure the old pointer isn't garbage. */
+ old_value = be32_to_cpu(dip->di_next_unlinked);
+ if (!xfs_verify_agino_or_null(mp, agno, old_value)) {
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+
+ /*
+ * Since we're updating a linked list, we should never find that the
+ * current pointer is the same as the new value, unless we're
+ * terminating the list.
+ */
+ *old_next_agino = old_value;
+ if (old_value == next_agino) {
+ if (next_agino != NULLAGINO)
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+
+ /* Ok, update the new pointer. */
+ xfs_iunlink_update_dinode(tp, agno, XFS_INO_TO_AGINO(mp, ip->i_ino),
+ ibp, dip, &ip->i_imap, next_agino);
+ return 0;
+out:
+ xfs_trans_brelse(tp, ibp);
+ return error;
+}
+
+/*
+ * This is called when the inode's link count has gone to 0 or we are creating
+ * a tmpfile via O_TMPFILE. The inode @ip must have nlink == 0.
*
* We place the on-disk inode on a list in the AGI. It will be pulled from this
* list when the inode is freed.
*/
STATIC int
xfs_iunlink(
- struct xfs_trans *tp,
- struct xfs_inode *ip)
+ struct xfs_trans *tp,
+ struct xfs_inode *ip)
{
- xfs_mount_t *mp = tp->t_mountp;
- xfs_agi_t *agi;
- xfs_dinode_t *dip;
- xfs_buf_t *agibp;
- xfs_buf_t *ibp;
- xfs_agino_t agino;
- short bucket_index;
- int offset;
- int error;
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_agi *agi;
+ struct xfs_buf *agibp;
+ xfs_agino_t next_agino;
+ xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
+ xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
+ short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
+ int error;
+ ASSERT(VFS_I(ip)->i_nlink == 0);
ASSERT(VFS_I(ip)->i_mode != 0);
+ trace_xfs_iunlink(ip);
- /*
- * Get the agi buffer first. It ensures lock ordering
- * on the list.
- */
- error = xfs_read_agi(mp, tp, XFS_INO_TO_AGNO(mp, ip->i_ino), &agibp);
+ /* Get the agi buffer first. It ensures lock ordering on the list. */
+ error = xfs_read_agi(mp, tp, agno, &agibp);
if (error)
return error;
agi = XFS_BUF_TO_AGI(agibp);
/*
- * Get the index into the agi hash table for the
- * list this inode will go on.
+ * Get the index into the agi hash table for the list this inode will
+ * go on. Make sure the pointer isn't garbage and that this inode
+ * isn't already on the list.
*/
- agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
- ASSERT(agino != 0);
- bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
- ASSERT(agi->agi_unlinked[bucket_index]);
- ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino);
+ next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
+ if (next_agino == agino ||
+ !xfs_verify_agino_or_null(mp, agno, next_agino))
+ return -EFSCORRUPTED;
+
+ if (next_agino != NULLAGINO) {
+ struct xfs_perag *pag;
+ xfs_agino_t old_agino;
+
+ /*
+ * There is already another inode in the bucket, so point this
+ * inode to the current head of the list.
+ */
+ error = xfs_iunlink_update_inode(tp, ip, agno, next_agino,
+ &old_agino);
+ if (error)
+ return error;
+ ASSERT(old_agino == NULLAGINO);
- if (agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO)) {
/*
- * There is already another inode in the bucket we need
- * to add ourselves to. Add us at the front of the list.
- * Here we put the head pointer into our next pointer,
- * and then we fall through to point the head at us.
+ * agino has been unlinked, add a backref from the next inode
+ * back to agino.
*/
- error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
- 0, 0);
+ pag = xfs_perag_get(mp, agno);
+ error = xfs_iunlink_add_backref(pag, agino, next_agino);
+ xfs_perag_put(pag);
if (error)
return error;
+ }
+
+ /* Point the head of the list to point to this inode. */
+ return xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index, agino);
+}
- ASSERT(dip->di_next_unlinked == cpu_to_be32(NULLAGINO));
- dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
- offset = ip->i_imap.im_boffset +
- offsetof(xfs_dinode_t, di_next_unlinked);
+/* Return the imap, dinode pointer, and buffer for an inode. */
+STATIC int
+xfs_iunlink_map_ino(
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ xfs_agino_t agino,
+ struct xfs_imap *imap,
+ struct xfs_dinode **dipp,
+ struct xfs_buf **bpp)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ int error;
- /* need to recalc the inode CRC if appropriate */
- xfs_dinode_calc_crc(mp, dip);
+ imap->im_blkno = 0;
+ error = xfs_imap(mp, tp, XFS_AGINO_TO_INO(mp, agno, agino), imap, 0);
+ if (error) {
+ xfs_warn(mp, "%s: xfs_imap returned error %d.",
+ __func__, error);
+ return error;
+ }
- xfs_trans_inode_buf(tp, ibp);
- xfs_trans_log_buf(tp, ibp, offset,
- (offset + sizeof(xfs_agino_t) - 1));
- xfs_inobp_check(mp, ibp);
+ error = xfs_imap_to_bp(mp, tp, imap, dipp, bpp, 0, 0);
+ if (error) {
+ xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.",
+ __func__, error);
+ return error;
+ }
+
+ return 0;
+}
+
+/*
+ * Walk the unlinked chain from @head_agino until we find the inode that
+ * points to @target_agino. Return the inode number, map, dinode pointer,
+ * and inode cluster buffer of that inode as @agino, @imap, @dipp, and @bpp.
+ *
+ * @tp, @pag, @head_agino, and @target_agino are input parameters.
+ * @agino, @imap, @dipp, and @bpp are all output parameters.
+ *
+ * Do not call this function if @target_agino is the head of the list.
+ */
+STATIC int
+xfs_iunlink_map_prev(
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ xfs_agino_t head_agino,
+ xfs_agino_t target_agino,
+ xfs_agino_t *agino,
+ struct xfs_imap *imap,
+ struct xfs_dinode **dipp,
+ struct xfs_buf **bpp,
+ struct xfs_perag *pag)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ xfs_agino_t next_agino;
+ int error;
+
+ ASSERT(head_agino != target_agino);
+ *bpp = NULL;
+
+ /* See if our backref cache can find it faster. */
+ *agino = xfs_iunlink_lookup_backref(pag, target_agino);
+ if (*agino != NULLAGINO) {
+ error = xfs_iunlink_map_ino(tp, agno, *agino, imap, dipp, bpp);
+ if (error)
+ return error;
+
+ if (be32_to_cpu((*dipp)->di_next_unlinked) == target_agino)
+ return 0;
+
+ /*
+ * If we get here the cache contents were corrupt, so drop the
+ * buffer and fall back to walking the bucket list.
+ */
+ xfs_trans_brelse(tp, *bpp);
+ *bpp = NULL;
+ WARN_ON_ONCE(1);
+ }
+
+ trace_xfs_iunlink_map_prev_fallback(mp, agno);
+
+ /* Otherwise, walk the entire bucket until we find it. */
+ next_agino = head_agino;
+ while (next_agino != target_agino) {
+ xfs_agino_t unlinked_agino;
+
+ if (*bpp)
+ xfs_trans_brelse(tp, *bpp);
+
+ *agino = next_agino;
+ error = xfs_iunlink_map_ino(tp, agno, next_agino, imap, dipp,
+ bpp);
+ if (error)
+ return error;
+
+ unlinked_agino = be32_to_cpu((*dipp)->di_next_unlinked);
+ /*
+ * Make sure this pointer is valid and isn't an obvious
+ * infinite loop.
+ */
+ if (!xfs_verify_agino(mp, agno, unlinked_agino) ||
+ next_agino == unlinked_agino) {
+ XFS_CORRUPTION_ERROR(__func__,
+ XFS_ERRLEVEL_LOW, mp,
+ *dipp, sizeof(**dipp));
+ error = -EFSCORRUPTED;
+ return error;
+ }
+ next_agino = unlinked_agino;
}
- /*
- * Point the bucket head pointer at the inode being inserted.
- */
- ASSERT(agino != 0);
- agi->agi_unlinked[bucket_index] = cpu_to_be32(agino);
- offset = offsetof(xfs_agi_t, agi_unlinked) +
- (sizeof(xfs_agino_t) * bucket_index);
- xfs_trans_log_buf(tp, agibp, offset,
- (offset + sizeof(xfs_agino_t) - 1));
return 0;
}
@@ -1995,181 +2419,106 @@ xfs_iunlink(
*/
STATIC int
xfs_iunlink_remove(
- xfs_trans_t *tp,
- xfs_inode_t *ip)
+ struct xfs_trans *tp,
+ struct xfs_inode *ip)
{
- xfs_ino_t next_ino;
- xfs_mount_t *mp;
- xfs_agi_t *agi;
- xfs_dinode_t *dip;
- xfs_buf_t *agibp;
- xfs_buf_t *ibp;
- xfs_agnumber_t agno;
- xfs_agino_t agino;
- xfs_agino_t next_agino;
- xfs_buf_t *last_ibp;
- xfs_dinode_t *last_dip = NULL;
- short bucket_index;
- int offset, last_offset = 0;
- int error;
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_agi *agi;
+ struct xfs_buf *agibp;
+ struct xfs_buf *last_ibp;
+ struct xfs_dinode *last_dip = NULL;
+ struct xfs_perag *pag = NULL;
+ xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
+ xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
+ xfs_agino_t next_agino;
+ xfs_agino_t head_agino;
+ short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
+ int error;
- mp = tp->t_mountp;
- agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
+ trace_xfs_iunlink_remove(ip);
- /*
- * Get the agi buffer first. It ensures lock ordering
- * on the list.
- */
+ /* Get the agi buffer first. It ensures lock ordering on the list. */
error = xfs_read_agi(mp, tp, agno, &agibp);
if (error)
return error;
-
agi = XFS_BUF_TO_AGI(agibp);
/*
- * Get the index into the agi hash table for the
- * list this inode will go on.
+ * Get the index into the agi hash table for the list this inode will
+ * go on. Make sure the head pointer isn't garbage.
*/
- agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
- if (!xfs_verify_agino(mp, agno, agino))
- return -EFSCORRUPTED;
- bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
- if (!xfs_verify_agino(mp, agno,
- be32_to_cpu(agi->agi_unlinked[bucket_index]))) {
+ head_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
+ if (!xfs_verify_agino(mp, agno, head_agino)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
agi, sizeof(*agi));
return -EFSCORRUPTED;
}
- if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) {
- /*
- * We're at the head of the list. Get the inode's on-disk
- * buffer to see if there is anyone after us on the list.
- * Only modify our next pointer if it is not already NULLAGINO.
- * This saves us the overhead of dealing with the buffer when
- * there is no need to change it.
- */
- error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
- 0, 0);
- if (error) {
- xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.",
- __func__, error);
- return error;
- }
- next_agino = be32_to_cpu(dip->di_next_unlinked);
- ASSERT(next_agino != 0);
- if (next_agino != NULLAGINO) {
- dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
- offset = ip->i_imap.im_boffset +
- offsetof(xfs_dinode_t, di_next_unlinked);
-
- /* need to recalc the inode CRC if appropriate */
- xfs_dinode_calc_crc(mp, dip);
-
- xfs_trans_inode_buf(tp, ibp);
- xfs_trans_log_buf(tp, ibp, offset,
- (offset + sizeof(xfs_agino_t) - 1));
- xfs_inobp_check(mp, ibp);
- } else {
- xfs_trans_brelse(tp, ibp);
- }
- /*
- * Point the bucket head pointer at the next inode.
- */
- ASSERT(next_agino != 0);
- ASSERT(next_agino != agino);
- agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino);
- offset = offsetof(xfs_agi_t, agi_unlinked) +
- (sizeof(xfs_agino_t) * bucket_index);
- xfs_trans_log_buf(tp, agibp, offset,
- (offset + sizeof(xfs_agino_t) - 1));
- } else {
- /*
- * We need to search the list for the inode being freed.
- */
- next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
- last_ibp = NULL;
- while (next_agino != agino) {
- struct xfs_imap imap;
+ /*
+ * Set our inode's next_unlinked pointer to NULL and then return
+ * the old pointer value so that we can update whatever was previous
+ * to us in the list to point to whatever was next in the list.
+ */
+ error = xfs_iunlink_update_inode(tp, ip, agno, NULLAGINO, &next_agino);
+ if (error)
+ return error;
- if (last_ibp)
- xfs_trans_brelse(tp, last_ibp);
+ /*
+ * If there was a backref pointing from the next inode back to this
+ * one, remove it because we've removed this inode from the list.
+ *
+ * Later, if this inode was in the middle of the list we'll update
+ * this inode's backref to point from the next inode.
+ */
+ if (next_agino != NULLAGINO) {
+ pag = xfs_perag_get(mp, agno);
+ error = xfs_iunlink_change_backref(pag, next_agino,
+ NULLAGINO);
+ if (error)
+ goto out;
+ }
- imap.im_blkno = 0;
- next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino);
+ if (head_agino == agino) {
+ /* Point the head of the list to the next unlinked inode. */
+ error = xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index,
+ next_agino);
+ if (error)
+ goto out;
+ } else {
+ struct xfs_imap imap;
+ xfs_agino_t prev_agino;
- error = xfs_imap(mp, tp, next_ino, &imap, 0);
- if (error) {
- xfs_warn(mp,
- "%s: xfs_imap returned error %d.",
- __func__, error);
- return error;
- }
+ if (!pag)
+ pag = xfs_perag_get(mp, agno);
- error = xfs_imap_to_bp(mp, tp, &imap, &last_dip,
- &last_ibp, 0, 0);
- if (error) {
- xfs_warn(mp,
- "%s: xfs_imap_to_bp returned error %d.",
- __func__, error);
- return error;
- }
+ /* We need to search the list for the inode being freed. */
+ error = xfs_iunlink_map_prev(tp, agno, head_agino, agino,
+ &prev_agino, &imap, &last_dip, &last_ibp,
+ pag);
+ if (error)
+ goto out;
- last_offset = imap.im_boffset;
- next_agino = be32_to_cpu(last_dip->di_next_unlinked);
- if (!xfs_verify_agino(mp, agno, next_agino)) {
- XFS_CORRUPTION_ERROR(__func__,
- XFS_ERRLEVEL_LOW, mp,
- last_dip, sizeof(*last_dip));
- return -EFSCORRUPTED;
- }
- }
+ /* Point the previous inode on the list to the next inode. */
+ xfs_iunlink_update_dinode(tp, agno, prev_agino, last_ibp,
+ last_dip, &imap, next_agino);
/*
- * Now last_ibp points to the buffer previous to us on the
- * unlinked list. Pull us from the list.
+ * Now we deal with the backref for this inode. If this inode
+ * pointed at a real inode, change the backref that pointed to
+ * us to point to our old next. If this inode was the end of
+ * the list, delete the backref that pointed to us. Note that
+ * change_backref takes care of deleting the backref if
+ * next_agino is NULLAGINO.
*/
- error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
- 0, 0);
- if (error) {
- xfs_warn(mp, "%s: xfs_imap_to_bp(2) returned error %d.",
- __func__, error);
- return error;
- }
- next_agino = be32_to_cpu(dip->di_next_unlinked);
- ASSERT(next_agino != 0);
- ASSERT(next_agino != agino);
- if (next_agino != NULLAGINO) {
- dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
- offset = ip->i_imap.im_boffset +
- offsetof(xfs_dinode_t, di_next_unlinked);
-
- /* need to recalc the inode CRC if appropriate */
- xfs_dinode_calc_crc(mp, dip);
-
- xfs_trans_inode_buf(tp, ibp);
- xfs_trans_log_buf(tp, ibp, offset,
- (offset + sizeof(xfs_agino_t) - 1));
- xfs_inobp_check(mp, ibp);
- } else {
- xfs_trans_brelse(tp, ibp);
- }
- /*
- * Point the previous inode on the list to the next inode.
- */
- last_dip->di_next_unlinked = cpu_to_be32(next_agino);
- ASSERT(next_agino != 0);
- offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked);
-
- /* need to recalc the inode CRC if appropriate */
- xfs_dinode_calc_crc(mp, last_dip);
-
- xfs_trans_inode_buf(tp, last_ibp);
- xfs_trans_log_buf(tp, last_ibp, offset,
- (offset + sizeof(xfs_agino_t) - 1));
- xfs_inobp_check(mp, last_ibp);
+ error = xfs_iunlink_change_backref(pag, agino, next_agino);
+ if (error)
+ goto out;
}
- return 0;
+
+out:
+ if (pag)
+ xfs_perag_put(pag);
+ return error;
}
/*
@@ -2833,11 +3182,9 @@ xfs_rename_alloc_whiteout(
/*
* Prepare the tmpfile inode as if it were created through the VFS.
- * Otherwise, the link increment paths will complain about nlink 0->1.
- * Drop the link count as done by d_tmpfile(), complete the inode setup
- * and flag it as linkable.
+ * Complete the inode setup and flag it as linkable. nlink is already
+ * zero, so we can skip the drop_nlink.
*/
- drop_nlink(VFS_I(tmpfile));
xfs_setup_iops(tmpfile);
xfs_finish_inode_setup(tmpfile);
VFS_I(tmpfile)->i_state |= I_LINKABLE;
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index be2014520155..e62074a5257c 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -500,4 +500,7 @@ extern struct kmem_zone *xfs_inode_zone;
bool xfs_inode_verify_forks(struct xfs_inode *ip);
+int xfs_iunlink_init(struct xfs_perag *pag);
+void xfs_iunlink_destroy(struct xfs_perag *pag);
+
#endif /* __XFS_INODE_H__ */
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 27c93b5f029d..63d323916bba 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -35,18 +35,40 @@
#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \
<< mp->m_writeio_log)
-void
+static int
+xfs_alert_fsblock_zero(
+ xfs_inode_t *ip,
+ xfs_bmbt_irec_t *imap)
+{
+ xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
+ "Access to block zero in inode %llu "
+ "start_block: %llx start_off: %llx "
+ "blkcnt: %llx extent-state: %x",
+ (unsigned long long)ip->i_ino,
+ (unsigned long long)imap->br_startblock,
+ (unsigned long long)imap->br_startoff,
+ (unsigned long long)imap->br_blockcount,
+ imap->br_state);
+ return -EFSCORRUPTED;
+}
+
+int
xfs_bmbt_to_iomap(
struct xfs_inode *ip,
struct iomap *iomap,
- struct xfs_bmbt_irec *imap)
+ struct xfs_bmbt_irec *imap,
+ bool shared)
{
struct xfs_mount *mp = ip->i_mount;
+ if (unlikely(!imap->br_startblock && !XFS_IS_REALTIME_INODE(ip)))
+ return xfs_alert_fsblock_zero(ip, imap);
+
if (imap->br_startblock == HOLESTARTBLOCK) {
iomap->addr = IOMAP_NULL_ADDR;
iomap->type = IOMAP_HOLE;
- } else if (imap->br_startblock == DELAYSTARTBLOCK) {
+ } else if (imap->br_startblock == DELAYSTARTBLOCK ||
+ isnullstartblock(imap->br_startblock)) {
iomap->addr = IOMAP_NULL_ADDR;
iomap->type = IOMAP_DELALLOC;
} else {
@@ -60,6 +82,13 @@ xfs_bmbt_to_iomap(
iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip));
iomap->dax_dev = xfs_find_daxdev_for_inode(VFS_I(ip));
+
+ if (xfs_ipincount(ip) &&
+ (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
+ iomap->flags |= IOMAP_F_DIRTY;
+ if (shared)
+ iomap->flags |= IOMAP_F_SHARED;
+ return 0;
}
static void
@@ -138,23 +167,6 @@ xfs_iomap_eof_align_last_fsb(
return 0;
}
-STATIC int
-xfs_alert_fsblock_zero(
- xfs_inode_t *ip,
- xfs_bmbt_irec_t *imap)
-{
- xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
- "Access to block zero in inode %llu "
- "start_block: %llx start_off: %llx "
- "blkcnt: %llx extent-state: %x",
- (unsigned long long)ip->i_ino,
- (unsigned long long)imap->br_startblock,
- (unsigned long long)imap->br_startoff,
- (unsigned long long)imap->br_blockcount,
- imap->br_state);
- return -EFSCORRUPTED;
-}
-
int
xfs_iomap_write_direct(
xfs_inode_t *ip,
@@ -383,12 +395,13 @@ xfs_quota_calc_throttle(
STATIC xfs_fsblock_t
xfs_iomap_prealloc_size(
struct xfs_inode *ip,
+ int whichfork,
loff_t offset,
loff_t count,
struct xfs_iext_cursor *icur)
{
struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
struct xfs_bmbt_irec prev;
int shift = 0;
@@ -522,15 +535,16 @@ xfs_file_iomap_begin_delay(
{
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
xfs_fileoff_t maxbytes_fsb =
XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
xfs_fileoff_t end_fsb;
- int error = 0, eof = 0;
- struct xfs_bmbt_irec got;
- struct xfs_iext_cursor icur;
+ struct xfs_bmbt_irec imap, cmap;
+ struct xfs_iext_cursor icur, ccur;
xfs_fsblock_t prealloc_blocks = 0;
+ bool eof = false, cow_eof = false, shared = false;
+ int whichfork = XFS_DATA_FORK;
+ int error = 0;
ASSERT(!XFS_IS_REALTIME_INODE(ip));
ASSERT(!xfs_get_extsz_hint(ip));
@@ -548,7 +562,7 @@ xfs_file_iomap_begin_delay(
XFS_STATS_INC(mp, xs_blk_mapw);
- if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+ if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) {
error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
if (error)
goto out_unlock;
@@ -556,53 +570,101 @@ xfs_file_iomap_begin_delay(
end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
- eof = !xfs_iext_lookup_extent(ip, ifp, offset_fsb, &icur, &got);
+ /*
+ * Search the data fork fork first to look up our source mapping. We
+ * always need the data fork map, as we have to return it to the
+ * iomap code so that the higher level write code can read data in to
+ * perform read-modify-write cycles for unaligned writes.
+ */
+ eof = !xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap);
if (eof)
- got.br_startoff = end_fsb; /* fake hole until the end */
+ imap.br_startoff = end_fsb; /* fake hole until the end */
+
+ /* We never need to allocate blocks for zeroing a hole. */
+ if ((flags & IOMAP_ZERO) && imap.br_startoff > offset_fsb) {
+ xfs_hole_to_iomap(ip, iomap, offset_fsb, imap.br_startoff);
+ goto out_unlock;
+ }
- if (got.br_startoff <= offset_fsb) {
+ /*
+ * Search the COW fork extent list even if we did not find a data fork
+ * extent. This serves two purposes: first this implements the
+ * speculative preallocation using cowextsize, so that we also unshare
+ * block adjacent to shared blocks instead of just the shared blocks
+ * themselves. Second the lookup in the extent list is generally faster
+ * than going out to the shared extent tree.
+ */
+ if (xfs_is_cow_inode(ip)) {
+ if (!ip->i_cowfp) {
+ ASSERT(!xfs_is_reflink_inode(ip));
+ xfs_ifork_init_cow(ip);
+ }
+ cow_eof = !xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb,
+ &ccur, &cmap);
+ if (!cow_eof && cmap.br_startoff <= offset_fsb) {
+ trace_xfs_reflink_cow_found(ip, &cmap);
+ whichfork = XFS_COW_FORK;
+ goto done;
+ }
+ }
+
+ if (imap.br_startoff <= offset_fsb) {
/*
* For reflink files we may need a delalloc reservation when
* overwriting shared extents. This includes zeroing of
* existing extents that contain data.
*/
- if (xfs_is_reflink_inode(ip) &&
- ((flags & IOMAP_WRITE) ||
- got.br_state != XFS_EXT_UNWRITTEN)) {
- xfs_trim_extent(&got, offset_fsb, end_fsb - offset_fsb);
- error = xfs_reflink_reserve_cow(ip, &got);
- if (error)
- goto out_unlock;
+ if (!xfs_is_cow_inode(ip) ||
+ ((flags & IOMAP_ZERO) && imap.br_state != XFS_EXT_NORM)) {
+ trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
+ &imap);
+ goto done;
}
- trace_xfs_iomap_found(ip, offset, count, 0, &got);
- goto done;
- }
+ xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb);
- if (flags & IOMAP_ZERO) {
- xfs_hole_to_iomap(ip, iomap, offset_fsb, got.br_startoff);
- goto out_unlock;
+ /* Trim the mapping to the nearest shared extent boundary. */
+ error = xfs_inode_need_cow(ip, &imap, &shared);
+ if (error)
+ goto out_unlock;
+
+ /* Not shared? Just report the (potentially capped) extent. */
+ if (!shared) {
+ trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
+ &imap);
+ goto done;
+ }
+
+ /*
+ * Fork all the shared blocks from our write offset until the
+ * end of the extent.
+ */
+ whichfork = XFS_COW_FORK;
+ end_fsb = imap.br_startoff + imap.br_blockcount;
+ } else {
+ /*
+ * We cap the maximum length we map here to MAX_WRITEBACK_PAGES
+ * pages to keep the chunks of work done where somewhat
+ * symmetric with the work writeback does. This is a completely
+ * arbitrary number pulled out of thin air.
+ *
+ * Note that the values needs to be less than 32-bits wide until
+ * the lower level functions are updated.
+ */
+ count = min_t(loff_t, count, 1024 * PAGE_SIZE);
+ end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
+
+ if (xfs_is_always_cow_inode(ip))
+ whichfork = XFS_COW_FORK;
}
error = xfs_qm_dqattach_locked(ip, false);
if (error)
goto out_unlock;
- /*
- * We cap the maximum length we map here to MAX_WRITEBACK_PAGES pages
- * to keep the chunks of work done where somewhat symmetric with the
- * work writeback does. This is a completely arbitrary number pulled
- * out of thin air as a best guess for initial testing.
- *
- * Note that the values needs to be less than 32-bits wide until
- * the lower level functions are updated.
- */
- count = min_t(loff_t, count, 1024 * PAGE_SIZE);
- end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
-
if (eof) {
- prealloc_blocks = xfs_iomap_prealloc_size(ip, offset, count,
- &icur);
+ prealloc_blocks = xfs_iomap_prealloc_size(ip, whichfork, offset,
+ count, &icur);
if (prealloc_blocks) {
xfs_extlen_t align;
xfs_off_t end_offset;
@@ -623,9 +685,11 @@ xfs_file_iomap_begin_delay(
}
retry:
- error = xfs_bmapi_reserve_delalloc(ip, XFS_DATA_FORK, offset_fsb,
- end_fsb - offset_fsb, prealloc_blocks, &got, &icur,
- eof);
+ error = xfs_bmapi_reserve_delalloc(ip, whichfork, offset_fsb,
+ end_fsb - offset_fsb, prealloc_blocks,
+ whichfork == XFS_DATA_FORK ? &imap : &cmap,
+ whichfork == XFS_DATA_FORK ? &icur : &ccur,
+ whichfork == XFS_DATA_FORK ? eof : cow_eof);
switch (error) {
case 0:
break;
@@ -647,186 +711,22 @@ retry:
* them out if the write happens to fail.
*/
iomap->flags |= IOMAP_F_NEW;
- trace_xfs_iomap_alloc(ip, offset, count, 0, &got);
+ trace_xfs_iomap_alloc(ip, offset, count, whichfork,
+ whichfork == XFS_DATA_FORK ? &imap : &cmap);
done:
- if (isnullstartblock(got.br_startblock))
- got.br_startblock = DELAYSTARTBLOCK;
-
- if (!got.br_startblock) {
- error = xfs_alert_fsblock_zero(ip, &got);
- if (error)
+ if (whichfork == XFS_COW_FORK) {
+ if (imap.br_startoff > offset_fsb) {
+ xfs_trim_extent(&cmap, offset_fsb,
+ imap.br_startoff - offset_fsb);
+ error = xfs_bmbt_to_iomap(ip, iomap, &cmap, true);
goto out_unlock;
- }
-
- xfs_bmbt_to_iomap(ip, iomap, &got);
-
-out_unlock:
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- return error;
-}
-
-/*
- * Pass in a delayed allocate extent, convert it to real extents;
- * return to the caller the extent we create which maps on top of
- * the originating callers request.
- *
- * Called without a lock on the inode.
- *
- * We no longer bother to look at the incoming map - all we have to
- * guarantee is that whatever we allocate fills the required range.
- */
-int
-xfs_iomap_write_allocate(
- xfs_inode_t *ip,
- int whichfork,
- xfs_off_t offset,
- xfs_bmbt_irec_t *imap,
- unsigned int *cow_seq)
-{
- xfs_mount_t *mp = ip->i_mount;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
- xfs_fileoff_t offset_fsb, last_block;
- xfs_fileoff_t end_fsb, map_start_fsb;
- xfs_filblks_t count_fsb;
- xfs_trans_t *tp;
- int nimaps;
- int error = 0;
- int flags = XFS_BMAPI_DELALLOC;
- int nres;
-
- if (whichfork == XFS_COW_FORK)
- flags |= XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC;
-
- /*
- * Make sure that the dquots are there.
- */
- error = xfs_qm_dqattach(ip);
- if (error)
- return error;
-
- offset_fsb = XFS_B_TO_FSBT(mp, offset);
- count_fsb = imap->br_blockcount;
- map_start_fsb = imap->br_startoff;
-
- XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, count_fsb));
-
- while (count_fsb != 0) {
- /*
- * Set up a transaction with which to allocate the
- * backing store for the file. Do allocations in a
- * loop until we get some space in the range we are
- * interested in. The other space that might be allocated
- * is in the delayed allocation extent on which we sit
- * but before our buffer starts.
- */
- nimaps = 0;
- while (nimaps == 0) {
- nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
- /*
- * We have already reserved space for the extent and any
- * indirect blocks when creating the delalloc extent,
- * there is no need to reserve space in this transaction
- * again.
- */
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0,
- 0, XFS_TRANS_RESERVE, &tp);
- if (error)
- return error;
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, 0);
-
- /*
- * it is possible that the extents have changed since
- * we did the read call as we dropped the ilock for a
- * while. We have to be careful about truncates or hole
- * punchs here - we are not allowed to allocate
- * non-delalloc blocks here.
- *
- * The only protection against truncation is the pages
- * for the range we are being asked to convert are
- * locked and hence a truncate will block on them
- * first.
- *
- * As a result, if we go beyond the range we really
- * need and hit an delalloc extent boundary followed by
- * a hole while we have excess blocks in the map, we
- * will fill the hole incorrectly and overrun the
- * transaction reservation.
- *
- * Using a single map prevents this as we are forced to
- * check each map we look for overlap with the desired
- * range and abort as soon as we find it. Also, given
- * that we only return a single map, having one beyond
- * what we can return is probably a bit silly.
- *
- * We also need to check that we don't go beyond EOF;
- * this is a truncate optimisation as a truncate sets
- * the new file size before block on the pages we
- * currently have locked under writeback. Because they
- * are about to be tossed, we don't need to write them
- * back....
- */
- nimaps = 1;
- end_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));
- error = xfs_bmap_last_offset(ip, &last_block,
- XFS_DATA_FORK);
- if (error)
- goto trans_cancel;
-
- last_block = XFS_FILEOFF_MAX(last_block, end_fsb);
- if ((map_start_fsb + count_fsb) > last_block) {
- count_fsb = last_block - map_start_fsb;
- if (count_fsb == 0) {
- error = -EAGAIN;
- goto trans_cancel;
- }
- }
-
- /*
- * From this point onwards we overwrite the imap
- * pointer that the caller gave to us.
- */
- error = xfs_bmapi_write(tp, ip, map_start_fsb,
- count_fsb, flags, nres, imap,
- &nimaps);
- if (error)
- goto trans_cancel;
-
- error = xfs_trans_commit(tp);
- if (error)
- goto error0;
-
- if (whichfork == XFS_COW_FORK)
- *cow_seq = READ_ONCE(ifp->if_seq);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- }
-
- /*
- * See if we were able to allocate an extent that
- * covers at least part of the callers request
- */
- if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip)))
- return xfs_alert_fsblock_zero(ip, imap);
-
- if ((offset_fsb >= imap->br_startoff) &&
- (offset_fsb < (imap->br_startoff +
- imap->br_blockcount))) {
- XFS_STATS_INC(mp, xs_xstrat_quick);
- return 0;
}
-
- /*
- * So far we have not mapped the requested part of the
- * file, just surrounding data, try again.
- */
- count_fsb -= imap->br_blockcount;
- map_start_fsb = imap->br_startoff + imap->br_blockcount;
+ /* ensure we only report blocks we have a reservation for */
+ xfs_trim_extent(&imap, cmap.br_startoff, cmap.br_blockcount);
+ shared = true;
}
-
-trans_cancel:
- xfs_trans_cancel(tp);
-error0:
+ error = xfs_bmbt_to_iomap(ip, iomap, &imap, shared);
+out_unlock:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
@@ -975,7 +875,7 @@ xfs_ilock_for_iomap(
* COW writes may allocate delalloc space or convert unwritten COW
* extents, so we need to make sure to take the lock exclusively here.
*/
- if (xfs_is_reflink_inode(ip) && is_write) {
+ if (xfs_is_cow_inode(ip) && is_write) {
/*
* FIXME: It could still overwrite on unshared extents and not
* need allocation.
@@ -1009,7 +909,7 @@ relock:
* check, so if we got ILOCK_SHARED for a write and but we're now a
* reflink inode we have to switch to ILOCK_EXCL and relock.
*/
- if (mode == XFS_ILOCK_SHARED && is_write && xfs_is_reflink_inode(ip)) {
+ if (mode == XFS_ILOCK_SHARED && is_write && xfs_is_cow_inode(ip)) {
xfs_iunlock(ip, mode);
mode = XFS_ILOCK_EXCL;
goto relock;
@@ -1081,23 +981,33 @@ xfs_file_iomap_begin(
* Break shared extents if necessary. Checks for non-blocking IO have
* been done up front, so we don't need to do them here.
*/
- if (xfs_is_reflink_inode(ip)) {
+ if (xfs_is_cow_inode(ip)) {
+ struct xfs_bmbt_irec cmap;
+ bool directio = (flags & IOMAP_DIRECT);
+
/* if zeroing doesn't need COW allocation, then we are done. */
if ((flags & IOMAP_ZERO) &&
!needs_cow_for_zeroing(&imap, nimaps))
goto out_found;
- if (flags & IOMAP_DIRECT) {
- /* may drop and re-acquire the ilock */
- error = xfs_reflink_allocate_cow(ip, &imap, &shared,
- &lockmode);
- if (error)
- goto out_unlock;
- } else {
- error = xfs_reflink_reserve_cow(ip, &imap);
- if (error)
- goto out_unlock;
- }
+ /* may drop and re-acquire the ilock */
+ cmap = imap;
+ error = xfs_reflink_allocate_cow(ip, &cmap, &shared, &lockmode,
+ directio);
+ if (error)
+ goto out_unlock;
+
+ /*
+ * For buffered writes we need to report the address of the
+ * previous block (if there was any) so that the higher level
+ * write code can perform read-modify-write operations; we
+ * won't need the CoW fork mapping until writeback. For direct
+ * I/O, which must be block aligned, we need to report the
+ * newly allocated address. If the data fork has a hole, copy
+ * the COW fork mapping to avoid allocating to the data fork.
+ */
+ if (directio || imap.br_startblock == HOLESTARTBLOCK)
+ imap = cmap;
end_fsb = imap.br_startoff + imap.br_blockcount;
length = XFS_FSB_TO_B(mp, end_fsb) - offset;
@@ -1139,23 +1049,15 @@ xfs_file_iomap_begin(
return error;
iomap->flags |= IOMAP_F_NEW;
- trace_xfs_iomap_alloc(ip, offset, length, 0, &imap);
+ trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap);
out_finish:
- if (xfs_ipincount(ip) && (ip->i_itemp->ili_fsync_fields
- & ~XFS_ILOG_TIMESTAMP))
- iomap->flags |= IOMAP_F_DIRTY;
-
- xfs_bmbt_to_iomap(ip, iomap, &imap);
-
- if (shared)
- iomap->flags |= IOMAP_F_SHARED;
- return 0;
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, shared);
out_found:
ASSERT(nimaps);
xfs_iunlock(ip, lockmode);
- trace_xfs_iomap_found(ip, offset, length, 0, &imap);
+ trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap);
goto out_finish;
out_unlock:
@@ -1241,6 +1143,92 @@ const struct iomap_ops xfs_iomap_ops = {
};
static int
+xfs_seek_iomap_begin(
+ struct inode *inode,
+ loff_t offset,
+ loff_t length,
+ unsigned flags,
+ struct iomap *iomap)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + length);
+ xfs_fileoff_t cow_fsb = NULLFILEOFF, data_fsb = NULLFILEOFF;
+ struct xfs_iext_cursor icur;
+ struct xfs_bmbt_irec imap, cmap;
+ int error = 0;
+ unsigned lockmode;
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -EIO;
+
+ lockmode = xfs_ilock_data_map_shared(ip);
+ if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) {
+ error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
+ if (error)
+ goto out_unlock;
+ }
+
+ if (xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap)) {
+ /*
+ * If we found a data extent we are done.
+ */
+ if (imap.br_startoff <= offset_fsb)
+ goto done;
+ data_fsb = imap.br_startoff;
+ } else {
+ /*
+ * Fake a hole until the end of the file.
+ */
+ data_fsb = min(XFS_B_TO_FSB(mp, offset + length),
+ XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes));
+ }
+
+ /*
+ * If a COW fork extent covers the hole, report it - capped to the next
+ * data fork extent:
+ */
+ if (xfs_inode_has_cow_data(ip) &&
+ xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap))
+ cow_fsb = cmap.br_startoff;
+ if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
+ if (data_fsb < cow_fsb + cmap.br_blockcount)
+ end_fsb = min(end_fsb, data_fsb);
+ xfs_trim_extent(&cmap, offset_fsb, end_fsb);
+ error = xfs_bmbt_to_iomap(ip, iomap, &cmap, true);
+ /*
+ * This is a COW extent, so we must probe the page cache
+ * because there could be dirty page cache being backed
+ * by this extent.
+ */
+ iomap->type = IOMAP_UNWRITTEN;
+ goto out_unlock;
+ }
+
+ /*
+ * Else report a hole, capped to the next found data or COW extent.
+ */
+ if (cow_fsb != NULLFILEOFF && cow_fsb < data_fsb)
+ imap.br_blockcount = cow_fsb - offset_fsb;
+ else
+ imap.br_blockcount = data_fsb - offset_fsb;
+ imap.br_startoff = offset_fsb;
+ imap.br_startblock = HOLESTARTBLOCK;
+ imap.br_state = XFS_EXT_NORM;
+done:
+ xfs_trim_extent(&imap, offset_fsb, end_fsb);
+ error = xfs_bmbt_to_iomap(ip, iomap, &imap, false);
+out_unlock:
+ xfs_iunlock(ip, lockmode);
+ return error;
+}
+
+const struct iomap_ops xfs_seek_iomap_ops = {
+ .iomap_begin = xfs_seek_iomap_begin,
+};
+
+static int
xfs_xattr_iomap_begin(
struct inode *inode,
loff_t offset,
@@ -1273,12 +1261,10 @@ xfs_xattr_iomap_begin(
out_unlock:
xfs_iunlock(ip, lockmode);
- if (!error) {
- ASSERT(nimaps);
- xfs_bmbt_to_iomap(ip, iomap, &imap);
- }
-
- return error;
+ if (error)
+ return error;
+ ASSERT(nimaps);
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, false);
}
const struct iomap_ops xfs_xattr_iomap_ops = {
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index c6170548831b..5c2f6aa6d78f 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -13,12 +13,10 @@ struct xfs_bmbt_irec;
int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
struct xfs_bmbt_irec *, int);
-int xfs_iomap_write_allocate(struct xfs_inode *, int, xfs_off_t,
- struct xfs_bmbt_irec *, unsigned int *);
int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool);
-void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
- struct xfs_bmbt_irec *);
+int xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
+ struct xfs_bmbt_irec *, bool shared);
xfs_extlen_t xfs_eof_alignment(struct xfs_inode *ip, xfs_extlen_t extsize);
static inline xfs_filblks_t
@@ -42,6 +40,7 @@ xfs_aligned_fsb_count(
}
extern const struct iomap_ops xfs_iomap_ops;
+extern const struct iomap_ops xfs_seek_iomap_ops;
extern const struct iomap_ops xfs_xattr_iomap_ops;
#endif /* __XFS_IOMAP_H__*/
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index f48ffd7a8d3e..74047bd0c1ae 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -191,9 +191,18 @@ xfs_generic_create(
xfs_setup_iops(ip);
- if (tmpfile)
+ if (tmpfile) {
+ /*
+ * The VFS requires that any inode fed to d_tmpfile must have
+ * nlink == 1 so that it can decrement the nlink in d_tmpfile.
+ * However, we created the temp file with nlink == 0 because
+ * we're not allowed to put an inode with nlink > 0 on the
+ * unlinked list. Therefore we have to set nlink to 1 so that
+ * d_tmpfile can immediately set it back to zero.
+ */
+ set_nlink(inode, 1);
d_tmpfile(dentry, inode);
- else
+ } else
d_instantiate(dentry, inode);
xfs_finish_inode_setup(ip);
@@ -522,6 +531,10 @@ xfs_vn_getattr(
}
}
+ /*
+ * Note: If you add another clause to set an attribute flag, please
+ * update attributes_mask below.
+ */
if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE)
stat->attributes |= STATX_ATTR_IMMUTABLE;
if (ip->i_d.di_flags & XFS_DIFLAG_APPEND)
@@ -529,6 +542,10 @@ xfs_vn_getattr(
if (ip->i_d.di_flags & XFS_DIFLAG_NODUMP)
stat->attributes |= STATX_ATTR_NODUMP;
+ stat->attributes_mask |= (STATX_ATTR_IMMUTABLE |
+ STATX_ATTR_APPEND |
+ STATX_ATTR_NODUMP);
+
switch (inode->i_mode & S_IFMT) {
case S_IFBLK:
case S_IFCHR:
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 9fe88d125f0a..3371d1ff27c4 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -2439,17 +2439,21 @@ xlog_recover_validate_buf_type(
case XFS_BLFT_BTREE_BUF:
switch (magic32) {
case XFS_ABTB_CRC_MAGIC:
- case XFS_ABTC_CRC_MAGIC:
case XFS_ABTB_MAGIC:
+ bp->b_ops = &xfs_bnobt_buf_ops;
+ break;
+ case XFS_ABTC_CRC_MAGIC:
case XFS_ABTC_MAGIC:
- bp->b_ops = &xfs_allocbt_buf_ops;
+ bp->b_ops = &xfs_cntbt_buf_ops;
break;
case XFS_IBT_CRC_MAGIC:
- case XFS_FIBT_CRC_MAGIC:
case XFS_IBT_MAGIC:
- case XFS_FIBT_MAGIC:
bp->b_ops = &xfs_inobt_buf_ops;
break;
+ case XFS_FIBT_CRC_MAGIC:
+ case XFS_FIBT_MAGIC:
+ bp->b_ops = &xfs_finobt_buf_ops;
+ break;
case XFS_BMAP_CRC_MAGIC:
case XFS_BMAP_MAGIC:
bp->b_ops = &xfs_bmbt_buf_ops;
@@ -3045,7 +3049,7 @@ xlog_recover_inode_pass2(
* Make sure the place we're flushing out to really looks
* like an inode!
*/
- if (unlikely(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))) {
+ if (unlikely(!xfs_verify_magic16(bp, dip->di_magic))) {
xfs_alert(mp,
"%s: Bad inode magic number, dip = "PTR_FMT", dino bp = "PTR_FMT", ino = %Ld",
__func__, dip, bp, in_f->ilf_ino);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index b4d8c318be3c..fd63b0b1307c 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -149,6 +149,7 @@ xfs_free_perag(
spin_unlock(&mp->m_perag_lock);
ASSERT(pag);
ASSERT(atomic_read(&pag->pag_ref) == 0);
+ xfs_iunlink_destroy(pag);
xfs_buf_hash_destroy(pag);
mutex_destroy(&pag->pag_ici_reclaim_lock);
call_rcu(&pag->rcu_head, __xfs_free_perag);
@@ -227,6 +228,9 @@ xfs_initialize_perag(
/* first new pag is fully initialized */
if (first_initialised == NULLAGNUMBER)
first_initialised = index;
+ error = xfs_iunlink_init(pag);
+ if (error)
+ goto out_hash_destroy;
}
index = xfs_set_inode_alloc(mp, agcount);
@@ -249,6 +253,7 @@ out_unwind_new_pags:
if (!pag)
break;
xfs_buf_hash_destroy(pag);
+ xfs_iunlink_destroy(pag);
mutex_destroy(&pag->pag_ici_reclaim_lock);
kmem_free(pag);
}
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 7daafe064af8..110f927cf943 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -138,7 +138,7 @@ typedef struct xfs_mount {
struct mutex m_growlock; /* growfs mutex */
int m_fixedfsid[2]; /* unchanged for life of FS */
uint64_t m_flags; /* global mount flags */
- bool m_inotbt_nores; /* no per-AG finobt resv. */
+ bool m_finobt_nores; /* no per-AG finobt resv. */
int m_ialloc_inos; /* inodes in inode allocation */
int m_ialloc_blks; /* blocks in inode allocation */
int m_ialloc_min_blks;/* min blocks in sparse inode
@@ -194,6 +194,7 @@ typedef struct xfs_mount {
*/
uint32_t m_generation;
+ bool m_always_cow;
bool m_fail_unmount;
#ifdef DEBUG
/*
@@ -396,6 +397,13 @@ typedef struct xfs_perag {
/* reference count */
uint8_t pagf_refcount_level;
+
+ /*
+ * Unlinked inode information. This incore information reflects
+ * data stored in the AGI, so callers must hold the AGI buffer lock
+ * or have some other means to control concurrency.
+ */
+ struct rhashtable pagi_unlinked_hash;
} xfs_perag_t;
static inline struct xfs_ag_resv *
diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h
index d3e04d20d8d4..c8ba98fae30a 100644
--- a/fs/xfs/xfs_ondisk.h
+++ b/fs/xfs/xfs_ondisk.h
@@ -125,6 +125,27 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format, 56);
XFS_CHECK_STRUCT_SIZE(struct xfs_qoff_logformat, 20);
XFS_CHECK_STRUCT_SIZE(struct xfs_trans_header, 16);
+
+ /*
+ * The v5 superblock format extended several v4 header structures with
+ * additional data. While new fields are only accessible on v5
+ * superblocks, it's important that the v5 structures place original v4
+ * fields/headers in the correct location on-disk. For example, we must
+ * be able to find magic values at the same location in certain blocks
+ * regardless of superblock version.
+ *
+ * The following checks ensure that various v5 data structures place the
+ * subset of v4 metadata associated with the same type of block at the
+ * start of the on-disk block. If there is no data structure definition
+ * for certain types of v4 blocks, traverse down to the first field of
+ * common metadata (e.g., magic value) and make sure it is at offset
+ * zero.
+ */
+ XFS_CHECK_OFFSET(struct xfs_dir3_leaf, hdr.info.hdr, 0);
+ XFS_CHECK_OFFSET(struct xfs_da3_intnode, hdr.info.hdr, 0);
+ XFS_CHECK_OFFSET(struct xfs_dir3_data_hdr, hdr.magic, 0);
+ XFS_CHECK_OFFSET(struct xfs_dir3_free, hdr.hdr.magic, 0);
+ XFS_CHECK_OFFSET(struct xfs_attr3_leafblock, hdr.info.hdr, 0);
}
#endif /* __XFS_ONDISK_H */
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index f44c3599527d..bde2c9f56a46 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -185,7 +185,7 @@ xfs_fs_map_blocks(
}
xfs_iunlock(ip, XFS_IOLOCK_EXCL);
- xfs_bmbt_to_iomap(ip, iomap, &imap);
+ error = xfs_bmbt_to_iomap(ip, iomap, &imap, false);
*device_generation = mp->m_generation;
return error;
out_unlock:
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index c5b4fa004ca4..680ae7662a78 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -192,7 +192,7 @@ xfs_reflink_trim_around_shared(
int error = 0;
/* Holes, unwritten, and delalloc extents cannot be shared */
- if (!xfs_is_reflink_inode(ip) || !xfs_bmap_is_real_extent(irec)) {
+ if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_real_extent(irec)) {
*shared = false;
return 0;
}
@@ -234,93 +234,59 @@ xfs_reflink_trim_around_shared(
}
}
-/*
- * Trim the passed in imap to the next shared/unshared extent boundary, and
- * if imap->br_startoff points to a shared extent reserve space for it in the
- * COW fork.
- *
- * Note that imap will always contain the block numbers for the existing blocks
- * in the data fork, as the upper layers need them for read-modify-write
- * operations.
- */
-int
-xfs_reflink_reserve_cow(
+bool
+xfs_inode_need_cow(
struct xfs_inode *ip,
- struct xfs_bmbt_irec *imap)
+ struct xfs_bmbt_irec *imap,
+ bool *shared)
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
- struct xfs_bmbt_irec got;
- int error = 0;
- bool eof = false;
- struct xfs_iext_cursor icur;
- bool shared;
-
- /*
- * Search the COW fork extent list first. This serves two purposes:
- * first this implement the speculative preallocation using cowextisze,
- * so that we also unshared block adjacent to shared blocks instead
- * of just the shared blocks themselves. Second the lookup in the
- * extent list is generally faster than going out to the shared extent
- * tree.
- */
-
- if (!xfs_iext_lookup_extent(ip, ifp, imap->br_startoff, &icur, &got))
- eof = true;
- if (!eof && got.br_startoff <= imap->br_startoff) {
- trace_xfs_reflink_cow_found(ip, imap);
- xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
+ /* We can't update any real extents in always COW mode. */
+ if (xfs_is_always_cow_inode(ip) &&
+ !isnullstartblock(imap->br_startblock)) {
+ *shared = true;
return 0;
}
/* Trim the mapping to the nearest shared extent boundary. */
- error = xfs_reflink_trim_around_shared(ip, imap, &shared);
- if (error)
- return error;
-
- /* Not shared? Just report the (potentially capped) extent. */
- if (!shared)
- return 0;
-
- /*
- * Fork all the shared blocks from our write offset until the end of
- * the extent.
- */
- error = xfs_qm_dqattach_locked(ip, false);
- if (error)
- return error;
-
- error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, imap->br_startoff,
- imap->br_blockcount, 0, &got, &icur, eof);
- if (error == -ENOSPC || error == -EDQUOT)
- trace_xfs_reflink_cow_enospc(ip, imap);
- if (error)
- return error;
-
- xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
- trace_xfs_reflink_cow_alloc(ip, &got);
- return 0;
+ return xfs_reflink_trim_around_shared(ip, imap, shared);
}
-/* Convert part of an unwritten CoW extent to a real one. */
-STATIC int
-xfs_reflink_convert_cow_extent(
- struct xfs_inode *ip,
- struct xfs_bmbt_irec *imap,
- xfs_fileoff_t offset_fsb,
- xfs_filblks_t count_fsb)
+static int
+xfs_reflink_convert_cow_locked(
+ struct xfs_inode *ip,
+ xfs_fileoff_t offset_fsb,
+ xfs_filblks_t count_fsb)
{
- int nimaps = 1;
+ struct xfs_iext_cursor icur;
+ struct xfs_bmbt_irec got;
+ struct xfs_btree_cur *dummy_cur = NULL;
+ int dummy_logflags;
+ int error = 0;
- if (imap->br_state == XFS_EXT_NORM)
+ if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got))
return 0;
- xfs_trim_extent(imap, offset_fsb, count_fsb);
- trace_xfs_reflink_convert_cow(ip, imap);
- if (imap->br_blockcount == 0)
- return 0;
- return xfs_bmapi_write(NULL, ip, imap->br_startoff, imap->br_blockcount,
- XFS_BMAPI_COWFORK | XFS_BMAPI_CONVERT, 0, imap,
- &nimaps);
+ do {
+ if (got.br_startoff >= offset_fsb + count_fsb)
+ break;
+ if (got.br_state == XFS_EXT_NORM)
+ continue;
+ if (WARN_ON_ONCE(isnullstartblock(got.br_startblock)))
+ return -EIO;
+
+ xfs_trim_extent(&got, offset_fsb, count_fsb);
+ if (!got.br_blockcount)
+ continue;
+
+ got.br_state = XFS_EXT_NORM;
+ error = xfs_bmap_add_extent_unwritten_real(NULL, ip,
+ XFS_COW_FORK, &icur, &dummy_cur, &got,
+ &dummy_logflags);
+ if (error)
+ return error;
+ } while (xfs_iext_next_extent(ip->i_cowfp, &icur, &got));
+
+ return error;
}
/* Convert all of the unwritten CoW extents in a file's range to real ones. */
@@ -334,15 +300,12 @@ xfs_reflink_convert_cow(
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count);
xfs_filblks_t count_fsb = end_fsb - offset_fsb;
- struct xfs_bmbt_irec imap;
- int nimaps = 1, error = 0;
+ int error;
ASSERT(count != 0);
xfs_ilock(ip, XFS_ILOCK_EXCL);
- error = xfs_bmapi_write(NULL, ip, offset_fsb, count_fsb,
- XFS_BMAPI_COWFORK | XFS_BMAPI_CONVERT |
- XFS_BMAPI_CONVERT_ONLY, 0, &imap, &nimaps);
+ error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
@@ -375,7 +338,7 @@ xfs_find_trim_cow_extent(
if (got.br_startoff > offset_fsb) {
xfs_trim_extent(imap, imap->br_startoff,
got.br_startoff - imap->br_startoff);
- return xfs_reflink_trim_around_shared(ip, imap, shared);
+ return xfs_inode_need_cow(ip, imap, shared);
}
*shared = true;
@@ -397,7 +360,8 @@ xfs_reflink_allocate_cow(
struct xfs_inode *ip,
struct xfs_bmbt_irec *imap,
bool *shared,
- uint *lockmode)
+ uint *lockmode,
+ bool convert_now)
{
struct xfs_mount *mp = ip->i_mount;
xfs_fileoff_t offset_fsb = imap->br_startoff;
@@ -409,7 +373,10 @@ xfs_reflink_allocate_cow(
xfs_extlen_t resblks = 0;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- ASSERT(xfs_is_reflink_inode(ip));
+ if (!ip->i_cowfp) {
+ ASSERT(!xfs_is_reflink_inode(ip));
+ xfs_ifork_init_cow(ip);
+ }
error = xfs_find_trim_cow_extent(ip, imap, shared, &found);
if (error || !*shared)
@@ -471,7 +438,16 @@ xfs_reflink_allocate_cow(
if (nimaps == 0)
return -ENOSPC;
convert:
- return xfs_reflink_convert_cow_extent(ip, imap, offset_fsb, count_fsb);
+ xfs_trim_extent(imap, offset_fsb, count_fsb);
+ /*
+ * COW fork extents are supposed to remain unwritten until we're ready
+ * to initiate a disk write. For direct I/O we are going to write the
+ * data and need the conversion, but for buffered writes we're done.
+ */
+ if (!convert_now || imap->br_state == XFS_EXT_NORM)
+ return 0;
+ trace_xfs_reflink_convert_cow(ip, imap);
+ return xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
out_unreserve:
xfs_trans_unreserve_quota_nblks(tp, ip, (long)resblks, 0,
@@ -586,7 +562,7 @@ xfs_reflink_cancel_cow_range(
int error;
trace_xfs_reflink_cancel_cow_range(ip, offset, count);
- ASSERT(xfs_is_reflink_inode(ip));
+ ASSERT(ip->i_cowfp);
offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
if (count == NULLFILEOFF)
@@ -1192,7 +1168,7 @@ xfs_reflink_remap_blocks(
break;
ASSERT(nimaps == 1);
- trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_IO_OVERWRITE,
+ trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_DATA_FORK,
&imap);
/* Translate imap into the destination file. */
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index 6d73daef1f13..28a43b7f581d 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -6,16 +6,28 @@
#ifndef __XFS_REFLINK_H
#define __XFS_REFLINK_H 1
+static inline bool xfs_is_always_cow_inode(struct xfs_inode *ip)
+{
+ return ip->i_mount->m_always_cow &&
+ xfs_sb_version_hasreflink(&ip->i_mount->m_sb);
+}
+
+static inline bool xfs_is_cow_inode(struct xfs_inode *ip)
+{
+ return xfs_is_reflink_inode(ip) || xfs_is_always_cow_inode(ip);
+}
+
extern int xfs_reflink_find_shared(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_extlen_t aglen,
xfs_agblock_t *fbno, xfs_extlen_t *flen, bool find_maximal);
extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
struct xfs_bmbt_irec *irec, bool *shared);
+bool xfs_inode_need_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap,
+ bool *shared);
-extern int xfs_reflink_reserve_cow(struct xfs_inode *ip,
- struct xfs_bmbt_irec *imap);
extern int xfs_reflink_allocate_cow(struct xfs_inode *ip,
- struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode);
+ struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode,
+ bool convert_now);
extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t count);
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index c9097cb0b955..f093ea244849 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1594,6 +1594,13 @@ xfs_mount_alloc(
INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker);
INIT_DELAYED_WORK(&mp->m_cowblocks_work, xfs_cowblocks_worker);
mp->m_kobj.kobject.kset = xfs_kset;
+ /*
+ * We don't create the finobt per-ag space reservation until after log
+ * recovery, so we must set this to true so that an ifree transaction
+ * started during log recovery will not depend on space reservations
+ * for finobt expansion.
+ */
+ mp->m_finobt_nores = true;
return mp;
}
@@ -1729,11 +1736,18 @@ xfs_fs_fill_super(
}
}
- if (xfs_sb_version_hasreflink(&mp->m_sb) && mp->m_sb.sb_rblocks) {
- xfs_alert(mp,
+ if (xfs_sb_version_hasreflink(&mp->m_sb)) {
+ if (mp->m_sb.sb_rblocks) {
+ xfs_alert(mp,
"reflink not compatible with realtime device!");
- error = -EINVAL;
- goto out_filestream_unmount;
+ error = -EINVAL;
+ goto out_filestream_unmount;
+ }
+
+ if (xfs_globals.always_cow) {
+ xfs_info(mp, "using DEBUG-only always_cow mode.");
+ mp->m_always_cow = true;
+ }
}
if (xfs_sb_version_hasrmapbt(&mp->m_sb) && mp->m_sb.sb_rblocks) {
diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h
index 168488130a19..ad7f9be13087 100644
--- a/fs/xfs/xfs_sysctl.h
+++ b/fs/xfs/xfs_sysctl.h
@@ -85,6 +85,7 @@ struct xfs_globals {
int log_recovery_delay; /* log recovery delay (secs) */
int mount_delay; /* mount setup delay (secs) */
bool bug_on_assert; /* BUG() the kernel on assert failure */
+ bool always_cow; /* use COW fork for all overwrites */
};
extern struct xfs_globals xfs_globals;
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index cd6a994a7250..cabda13f3c64 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -183,10 +183,34 @@ mount_delay_show(
}
XFS_SYSFS_ATTR_RW(mount_delay);
+static ssize_t
+always_cow_store(
+ struct kobject *kobject,
+ const char *buf,
+ size_t count)
+{
+ ssize_t ret;
+
+ ret = kstrtobool(buf, &xfs_globals.always_cow);
+ if (ret < 0)
+ return ret;
+ return count;
+}
+
+static ssize_t
+always_cow_show(
+ struct kobject *kobject,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.always_cow);
+}
+XFS_SYSFS_ATTR_RW(always_cow);
+
static struct attribute *xfs_dbg_attrs[] = {
ATTR_LIST(bug_on_assert),
ATTR_LIST(log_recovery_delay),
ATTR_LIST(mount_delay),
+ ATTR_LIST(always_cow),
NULL,
};
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 6fcc893dfc91..47fb07d86efd 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1218,23 +1218,17 @@ DEFINE_EVENT(xfs_readpage_class, name, \
DEFINE_READPAGE_EVENT(xfs_vm_readpage);
DEFINE_READPAGE_EVENT(xfs_vm_readpages);
-TRACE_DEFINE_ENUM(XFS_IO_HOLE);
-TRACE_DEFINE_ENUM(XFS_IO_DELALLOC);
-TRACE_DEFINE_ENUM(XFS_IO_UNWRITTEN);
-TRACE_DEFINE_ENUM(XFS_IO_OVERWRITE);
-TRACE_DEFINE_ENUM(XFS_IO_COW);
-
DECLARE_EVENT_CLASS(xfs_imap_class,
TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
- int type, struct xfs_bmbt_irec *irec),
- TP_ARGS(ip, offset, count, type, irec),
+ int whichfork, struct xfs_bmbt_irec *irec),
+ TP_ARGS(ip, offset, count, whichfork, irec),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_ino_t, ino)
__field(loff_t, size)
__field(loff_t, offset)
__field(size_t, count)
- __field(int, type)
+ __field(int, whichfork)
__field(xfs_fileoff_t, startoff)
__field(xfs_fsblock_t, startblock)
__field(xfs_filblks_t, blockcount)
@@ -1245,33 +1239,33 @@ DECLARE_EVENT_CLASS(xfs_imap_class,
__entry->size = ip->i_d.di_size;
__entry->offset = offset;
__entry->count = count;
- __entry->type = type;
+ __entry->whichfork = whichfork;
__entry->startoff = irec ? irec->br_startoff : 0;
__entry->startblock = irec ? irec->br_startblock : 0;
__entry->blockcount = irec ? irec->br_blockcount : 0;
),
TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count %zd "
- "type %s startoff 0x%llx startblock %lld blockcount 0x%llx",
+ "fork %s startoff 0x%llx startblock %lld blockcount 0x%llx",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->size,
__entry->offset,
__entry->count,
- __print_symbolic(__entry->type, XFS_IO_TYPES),
+ __entry->whichfork == XFS_COW_FORK ? "cow" : "data",
__entry->startoff,
(int64_t)__entry->startblock,
__entry->blockcount)
)
-#define DEFINE_IOMAP_EVENT(name) \
+#define DEFINE_IMAP_EVENT(name) \
DEFINE_EVENT(xfs_imap_class, name, \
TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \
- int type, struct xfs_bmbt_irec *irec), \
- TP_ARGS(ip, offset, count, type, irec))
-DEFINE_IOMAP_EVENT(xfs_map_blocks_found);
-DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
-DEFINE_IOMAP_EVENT(xfs_iomap_alloc);
-DEFINE_IOMAP_EVENT(xfs_iomap_found);
+ int whichfork, struct xfs_bmbt_irec *irec), \
+ TP_ARGS(ip, offset, count, whichfork, irec))
+DEFINE_IMAP_EVENT(xfs_map_blocks_found);
+DEFINE_IMAP_EVENT(xfs_map_blocks_alloc);
+DEFINE_IMAP_EVENT(xfs_iomap_alloc);
+DEFINE_IMAP_EVENT(xfs_iomap_found);
DECLARE_EVENT_CLASS(xfs_simple_io_class,
TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
@@ -3078,7 +3072,7 @@ DEFINE_EVENT(xfs_inode_irec_class, name, \
DEFINE_INODE_EVENT(xfs_reflink_set_inode_flag);
DEFINE_INODE_EVENT(xfs_reflink_unset_inode_flag);
DEFINE_ITRUNC_EVENT(xfs_reflink_update_inode_size);
-DEFINE_IOMAP_EVENT(xfs_reflink_remap_imap);
+DEFINE_IMAP_EVENT(xfs_reflink_remap_imap);
TRACE_EVENT(xfs_reflink_remap_blocks_loop,
TP_PROTO(struct xfs_inode *src, xfs_fileoff_t soffset,
xfs_filblks_t len, struct xfs_inode *dest,
@@ -3202,13 +3196,10 @@ DEFINE_INODE_ERROR_EVENT(xfs_reflink_unshare_error);
/* copy on write */
DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_around_shared);
-DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_alloc);
DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found);
DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc);
DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow);
-DEFINE_RW_EVENT(xfs_reflink_reserve_cow);
-
DEFINE_SIMPLE_IO_EVENT(xfs_reflink_bounce_dio_write);
DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range);
@@ -3371,6 +3362,84 @@ DEFINE_TRANS_EVENT(xfs_trans_roll);
DEFINE_TRANS_EVENT(xfs_trans_add_item);
DEFINE_TRANS_EVENT(xfs_trans_free_items);
+TRACE_EVENT(xfs_iunlink_update_bucket,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, unsigned int bucket,
+ xfs_agino_t old_ptr, xfs_agino_t new_ptr),
+ TP_ARGS(mp, agno, bucket, old_ptr, new_ptr),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(unsigned int, bucket)
+ __field(xfs_agino_t, old_ptr)
+ __field(xfs_agino_t, new_ptr)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->bucket = bucket;
+ __entry->old_ptr = old_ptr;
+ __entry->new_ptr = new_ptr;
+ ),
+ TP_printk("dev %d:%d agno %u bucket %u old 0x%x new 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->bucket,
+ __entry->old_ptr,
+ __entry->new_ptr)
+);
+
+TRACE_EVENT(xfs_iunlink_update_dinode,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino,
+ xfs_agino_t old_ptr, xfs_agino_t new_ptr),
+ TP_ARGS(mp, agno, agino, old_ptr, new_ptr),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, agino)
+ __field(xfs_agino_t, old_ptr)
+ __field(xfs_agino_t, new_ptr)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->agino = agino;
+ __entry->old_ptr = old_ptr;
+ __entry->new_ptr = new_ptr;
+ ),
+ TP_printk("dev %d:%d agno %u agino 0x%x old 0x%x new 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agino,
+ __entry->old_ptr,
+ __entry->new_ptr)
+);
+
+DECLARE_EVENT_CLASS(xfs_ag_inode_class,
+ TP_PROTO(struct xfs_inode *ip),
+ TP_ARGS(ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, agino)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino);
+ __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino);
+ ),
+ TP_printk("dev %d:%d agno %u agino %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno, __entry->agino)
+)
+
+#define DEFINE_AGINODE_EVENT(name) \
+DEFINE_EVENT(xfs_ag_inode_class, name, \
+ TP_PROTO(struct xfs_inode *ip), \
+ TP_ARGS(ip))
+DEFINE_AGINODE_EVENT(xfs_iunlink);
+DEFINE_AGINODE_EVENT(xfs_iunlink_remove);
+DEFINE_AG_EVENT(xfs_iunlink_map_prev_fallback);
+
#endif /* _TRACE_XFS_H */
#undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/xfs_trans_bmap.c b/fs/xfs/xfs_trans_bmap.c
index 11cff449d055..e1c7d55b32c3 100644
--- a/fs/xfs/xfs_trans_bmap.c
+++ b/fs/xfs/xfs_trans_bmap.c
@@ -17,7 +17,6 @@
#include "xfs_alloc.h"
#include "xfs_bmap.h"
#include "xfs_inode.h"
-#include "xfs_defer.h"
/*
* This routine is called to allocate a "bmap update done"
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 629f1479c9d2..7d65ebf1e847 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -277,7 +277,7 @@ xfs_trans_read_buf_map(
* release this buffer when it kills the tranaction.
*/
ASSERT(bp->b_ops != NULL);
- error = xfs_buf_ensure_ops(bp, ops);
+ error = xfs_buf_reverify(bp, ops);
if (error) {
xfs_buf_ioerror_alert(bp, __func__);
diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c
index 0710434eb240..8ee7a3f8bb20 100644
--- a/fs/xfs/xfs_trans_extfree.c
+++ b/fs/xfs/xfs_trans_extfree.c
@@ -18,7 +18,6 @@
#include "xfs_alloc.h"
#include "xfs_bmap.h"
#include "xfs_trace.h"
-#include "xfs_defer.h"
/*
* This routine is called to allocate an "extent free done"
diff --git a/fs/xfs/xfs_trans_refcount.c b/fs/xfs/xfs_trans_refcount.c
index 6c947ff4faf6..8d734728dd1b 100644
--- a/fs/xfs/xfs_trans_refcount.c
+++ b/fs/xfs/xfs_trans_refcount.c
@@ -16,7 +16,6 @@
#include "xfs_refcount_item.h"
#include "xfs_alloc.h"
#include "xfs_refcount.h"
-#include "xfs_defer.h"
/*
* This routine is called to allocate a "refcount update done"
diff --git a/fs/xfs/xfs_trans_rmap.c b/fs/xfs/xfs_trans_rmap.c
index a42890931ecd..5c7936b1be13 100644
--- a/fs/xfs/xfs_trans_rmap.c
+++ b/fs/xfs/xfs_trans_rmap.c
@@ -16,7 +16,6 @@
#include "xfs_rmap_item.h"
#include "xfs_alloc.h"
#include "xfs_rmap.h"
-#include "xfs_defer.h"
/* Set the map extent flags for this reverse mapping. */
static void
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index 63ee1d5bf1d7..9a63016009a1 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -129,6 +129,9 @@ __xfs_xattr_put_listent(
char *offset;
int arraytop;
+ if (context->count < 0 || context->seen_enough)
+ return;
+
if (!context->alist)
goto compute_size;