aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/acl.c2
-rw-r--r--fs/9p/v9fs.c21
-rw-r--r--fs/9p/v9fs.h1
-rw-r--r--fs/9p/vfs_dir.c19
-rw-r--r--fs/9p/vfs_file.c24
-rw-r--r--fs/aio.c8
-rw-r--r--fs/binfmt_elf.c6
-rw-r--r--fs/btrfs/backref.c39
-rw-r--r--fs/btrfs/btrfs_inode.h8
-rw-r--r--fs/btrfs/check-integrity.c6
-rw-r--r--fs/btrfs/compression.c8
-rw-r--r--fs/btrfs/ctree.c85
-rw-r--r--fs/btrfs/ctree.h56
-rw-r--r--fs/btrfs/delayed-inode.c41
-rw-r--r--fs/btrfs/delayed-inode.h4
-rw-r--r--fs/btrfs/delayed-ref.c107
-rw-r--r--fs/btrfs/delayed-ref.h10
-rw-r--r--fs/btrfs/dev-replace.c64
-rw-r--r--fs/btrfs/dev-replace.h8
-rw-r--r--fs/btrfs/dir-item.c8
-rw-r--r--fs/btrfs/disk-io.c24
-rw-r--r--fs/btrfs/export.c4
-rw-r--r--fs/btrfs/extent-tree.c461
-rw-r--r--fs/btrfs/extent_io.c45
-rw-r--r--fs/btrfs/extent_io.h4
-rw-r--r--fs/btrfs/extent_map.c32
-rw-r--r--fs/btrfs/extent_map.h4
-rw-r--r--fs/btrfs/file.c45
-rw-r--r--fs/btrfs/free-space-cache.c48
-rw-r--r--fs/btrfs/inode.c147
-rw-r--r--fs/btrfs/ioctl.c18
-rw-r--r--fs/btrfs/qgroup.c455
-rw-r--r--fs/btrfs/qgroup.h8
-rw-r--r--fs/btrfs/ref-verify.c8
-rw-r--r--fs/btrfs/relocation.c74
-rw-r--r--fs/btrfs/scrub.c34
-rw-r--r--fs/btrfs/send.c24
-rw-r--r--fs/btrfs/super.c6
-rw-r--r--fs/btrfs/tests/extent-io-tests.c10
-rw-r--r--fs/btrfs/tests/extent-map-tests.c4
-rw-r--r--fs/btrfs/transaction.c40
-rw-r--r--fs/btrfs/tree-checker.c14
-rw-r--r--fs/btrfs/tree-log.c91
-rw-r--r--fs/btrfs/tree-log.h2
-rw-r--r--fs/btrfs/volumes.c117
-rw-r--r--fs/btrfs/volumes.h9
-rw-r--r--fs/buffer.c24
-rw-r--r--fs/ceph/acl.c13
-rw-r--r--fs/ceph/addr.c2
-rw-r--r--fs/ceph/caps.c21
-rw-r--r--fs/ceph/file.c573
-rw-r--r--fs/ceph/inode.c13
-rw-r--r--fs/ceph/mds_client.c9
-rw-r--r--fs/ceph/super.c13
-rw-r--r--fs/ceph/super.h3
-rw-r--r--fs/ceph/xattr.c3
-rw-r--r--fs/cifs/cifs_debug.c17
-rw-r--r--fs/cifs/cifs_debug.h28
-rw-r--r--fs/cifs/cifs_dfs_ref.c7
-rw-r--r--fs/cifs/cifs_fs_sb.h1
-rw-r--r--fs/cifs/cifs_ioctl.h11
-rw-r--r--fs/cifs/cifsfs.c30
-rw-r--r--fs/cifs/cifsfs.h2
-rw-r--r--fs/cifs/cifsglob.h14
-rw-r--r--fs/cifs/cifsproto.h2
-rw-r--r--fs/cifs/cifssmb.c23
-rw-r--r--fs/cifs/connect.c13
-rw-r--r--fs/cifs/file.c56
-rw-r--r--fs/cifs/inode.c73
-rw-r--r--fs/cifs/ioctl.c48
-rw-r--r--fs/cifs/misc.c2
-rw-r--r--fs/cifs/smb2glob.h2
-rw-r--r--fs/cifs/smb2inode.c332
-rw-r--r--fs/cifs/smb2maperror.c2
-rw-r--r--fs/cifs/smb2ops.c212
-rw-r--r--fs/cifs/smb2pdu.c260
-rw-r--r--fs/cifs/smb2pdu.h13
-rw-r--r--fs/cifs/smb2proto.h28
-rw-r--r--fs/cifs/smbdirect.c38
-rw-r--r--fs/cifs/trace.h109
-rw-r--r--fs/cifs/transport.c78
-rw-r--r--fs/compat_binfmt_elf.c2
-rw-r--r--fs/compat_ioctl.c369
-rw-r--r--fs/coredump.c2
-rw-r--r--fs/cramfs/inode.c12
-rw-r--r--fs/crypto/fscrypt_private.h4
-rw-r--r--fs/crypto/keyinfo.c10
-rw-r--r--fs/dax.c917
-rw-r--r--fs/dcache.c40
-rw-r--r--fs/ecryptfs/inode.c11
-rw-r--r--fs/exec.c8
-rw-r--r--fs/ext2/acl.c4
-rw-r--r--fs/ext2/ext2.h4
-rw-r--r--fs/ext2/super.c5
-rw-r--r--fs/ext4/acl.c4
-rw-r--r--fs/ext4/ext4.h26
-rw-r--r--fs/ext4/ext4_extents.h13
-rw-r--r--fs/ext4/extents.c595
-rw-r--r--fs/ext4/extents_status.c654
-rw-r--r--fs/ext4/extents_status.h80
-rw-r--r--fs/ext4/inline.c2
-rw-r--r--fs/ext4/inode.c144
-rw-r--r--fs/ext4/ioctl.c97
-rw-r--r--fs/ext4/mballoc.c14
-rw-r--r--fs/ext4/move_extent.c8
-rw-r--r--fs/ext4/namei.c2
-rw-r--r--fs/ext4/page-io.c2
-rw-r--r--fs/ext4/super.c81
-rw-r--r--fs/f2fs/acl.c12
-rw-r--r--fs/f2fs/acl.h5
-rw-r--r--fs/f2fs/checkpoint.c94
-rw-r--r--fs/f2fs/data.c226
-rw-r--r--fs/f2fs/debug.c35
-rw-r--r--fs/f2fs/dir.c32
-rw-r--r--fs/f2fs/extent_cache.c134
-rw-r--r--fs/f2fs/f2fs.h255
-rw-r--r--fs/f2fs/file.c196
-rw-r--r--fs/f2fs/gc.c115
-rw-r--r--fs/f2fs/gc.h5
-rw-r--r--fs/f2fs/hash.c5
-rw-r--r--fs/f2fs/inline.c10
-rw-r--r--fs/f2fs/inode.c28
-rw-r--r--fs/f2fs/namei.c57
-rw-r--r--fs/f2fs/node.c89
-rw-r--r--fs/f2fs/node.h5
-rw-r--r--fs/f2fs/recovery.c125
-rw-r--r--fs/f2fs/segment.c240
-rw-r--r--fs/f2fs/segment.h20
-rw-r--r--fs/f2fs/shrinker.c5
-rw-r--r--fs/f2fs/super.c415
-rw-r--r--fs/f2fs/sysfs.c17
-rw-r--r--fs/f2fs/trace.c5
-rw-r--r--fs/f2fs/trace.h5
-rw-r--r--fs/f2fs/xattr.c5
-rw-r--r--fs/f2fs/xattr.h5
-rw-r--r--fs/fat/dir.c6
-rw-r--r--fs/fat/fat.h4
-rw-r--r--fs/fat/file.c17
-rw-r--r--fs/fat/inode.c9
-rw-r--r--fs/fat/misc.c91
-rw-r--r--fs/fat/namei_msdos.c17
-rw-r--r--fs/fat/namei_vfat.c15
-rw-r--r--fs/fcntl.c2
-rw-r--r--fs/fs-writeback.c25
-rw-r--r--fs/fuse/Makefile2
-rw-r--r--fs/fuse/control.c34
-rw-r--r--fs/fuse/dev.c221
-rw-r--r--fs/fuse/dir.c381
-rw-r--r--fs/fuse/file.c158
-rw-r--r--fs/fuse/fuse_i.h124
-rw-r--r--fs/fuse/inode.c53
-rw-r--r--fs/fuse/readdir.c569
-rw-r--r--fs/gfs2/aops.c2
-rw-r--r--fs/gfs2/bmap.c4
-rw-r--r--fs/gfs2/dir.c28
-rw-r--r--fs/gfs2/file.c18
-rw-r--r--fs/gfs2/glock.c17
-rw-r--r--fs/gfs2/incore.h9
-rw-r--r--fs/gfs2/lock_dlm.c10
-rw-r--r--fs/gfs2/log.c11
-rw-r--r--fs/gfs2/lops.c2
-rw-r--r--fs/gfs2/main.c2
-rw-r--r--fs/gfs2/ops_fstype.c5
-rw-r--r--fs/gfs2/quota.c2
-rw-r--r--fs/gfs2/rgrp.c201
-rw-r--r--fs/gfs2/rgrp.h11
-rw-r--r--fs/gfs2/super.c4
-rw-r--r--fs/gfs2/trans.c15
-rw-r--r--fs/gfs2/util.c16
-rw-r--r--fs/gfs2/util.h2
-rw-r--r--fs/gfs2/xattr.c18
-rw-r--r--fs/hfs/brec.c5
-rw-r--r--fs/hfs/btree.c41
-rw-r--r--fs/hfs/btree.h1
-rw-r--r--fs/hfs/catalog.c16
-rw-r--r--fs/hfs/extent.c10
-rw-r--r--fs/hfs/inode.c2
-rw-r--r--fs/hfsplus/attributes.c10
-rw-r--r--fs/hfsplus/brec.c5
-rw-r--r--fs/hfsplus/btree.c44
-rw-r--r--fs/hfsplus/catalog.c24
-rw-r--r--fs/hfsplus/extents.c8
-rw-r--r--fs/hfsplus/hfsplus_fs.h2
-rw-r--r--fs/hfsplus/inode.c1
-rw-r--r--fs/inode.c4
-rw-r--r--fs/ioctl.c3
-rw-r--r--fs/iomap.c2
-rw-r--r--fs/isofs/dir.c2
-rw-r--r--fs/jbd2/checkpoint.c4
-rw-r--r--fs/jffs2/background.c2
-rw-r--r--fs/jffs2/super.c4
-rw-r--r--fs/jfs/acl.c4
-rw-r--r--fs/jfs/inode.c1
-rw-r--r--fs/jfs/super.c2
-rw-r--r--fs/kernfs/mount.c3
-rw-r--r--fs/kernfs/symlink.c5
-rw-r--r--fs/lockd/host.c2
-rw-r--r--fs/namespace.c4
-rw-r--r--fs/nfs/blocklayout/blocklayout.c2
-rw-r--r--fs/nfs/dns_resolve.c15
-rw-r--r--fs/nfsd/cache.h20
-rw-r--r--fs/nfsd/export.c14
-rw-r--r--fs/nfsd/export.h2
-rw-r--r--fs/nfsd/netns.h8
-rw-r--r--fs/nfsd/nfs4callback.c98
-rw-r--r--fs/nfsd/nfs4idmap.c11
-rw-r--r--fs/nfsd/nfs4proc.c289
-rw-r--r--fs/nfsd/nfs4state.c41
-rw-r--r--fs/nfsd/nfs4xdr.c50
-rw-r--r--fs/nfsd/nfscache.c142
-rw-r--r--fs/nfsd/nfsctl.c1
-rw-r--r--fs/nfsd/state.h10
-rw-r--r--fs/nfsd/vfs.c5
-rw-r--r--fs/nfsd/xdr4.h28
-rw-r--r--fs/nfsd/xdr4cb.h10
-rw-r--r--fs/nilfs2/btnode.c26
-rw-r--r--fs/nilfs2/page.c29
-rw-r--r--fs/notify/fanotify/fanotify.c17
-rw-r--r--fs/notify/fanotify/fanotify.h4
-rw-r--r--fs/notify/fanotify/fanotify_user.c103
-rw-r--r--fs/notify/fdinfo.c29
-rw-r--r--fs/notify/fsnotify.c42
-rw-r--r--fs/notify/fsnotify.h11
-rw-r--r--fs/notify/inotify/inotify_user.c2
-rw-r--r--fs/notify/mark.c43
-rw-r--r--fs/ocfs2/alloc.c4
-rw-r--r--fs/ocfs2/aops.c3
-rw-r--r--fs/ocfs2/dlm/dlmdebug.c2
-rw-r--r--fs/ocfs2/dlm/dlmthread.c2
-rw-r--r--fs/ocfs2/refcounttree.c2
-rw-r--r--fs/orangefs/acl.c4
-rw-r--r--fs/orangefs/inode.c8
-rw-r--r--fs/orangefs/namei.c8
-rw-r--r--fs/orangefs/orangefs-sysfs.c2
-rw-r--r--fs/overlayfs/copy_up.c213
-rw-r--r--fs/overlayfs/dir.c34
-rw-r--r--fs/overlayfs/inode.c17
-rw-r--r--fs/overlayfs/namei.c4
-rw-r--r--fs/overlayfs/overlayfs.h14
-rw-r--r--fs/overlayfs/super.c68
-rw-r--r--fs/overlayfs/util.c46
-rw-r--r--fs/proc/base.c18
-rw-r--r--fs/proc/inode.c3
-rw-r--r--fs/proc/kcore.c2
-rw-r--r--fs/proc/loadavg.c3
-rw-r--r--fs/proc/meminfo.c16
-rw-r--r--fs/proc/page.c2
-rw-r--r--fs/proc/task_mmu.c6
-rw-r--r--fs/proc/vmcore.c38
-rw-r--r--fs/pstore/Kconfig1
-rw-r--r--fs/pstore/inode.c11
-rw-r--r--fs/pstore/internal.h5
-rw-r--r--fs/pstore/platform.c75
-rw-r--r--fs/pstore/ram.c18
-rw-r--r--fs/pstore/ram_core.c11
-rw-r--r--fs/read_write.c2
-rw-r--r--fs/reiserfs/Makefile9
-rw-r--r--fs/reiserfs/xattr.c7
-rw-r--r--fs/select.c20
-rw-r--r--fs/signalfd.c6
-rw-r--r--fs/stat.c3
-rw-r--r--fs/super.c2
-rw-r--r--fs/timerfd.c12
-rw-r--r--fs/udf/balloc.c30
-rw-r--r--fs/udf/super.c139
-rw-r--r--fs/udf/udf_sb.h10
-rw-r--r--fs/userfaultfd.c8
-rw-r--r--fs/utimes.c73
-rw-r--r--fs/xfs/libxfs/xfs_attr.c236
-rw-r--r--fs/xfs/libxfs/xfs_attr.h (renamed from fs/xfs/xfs_attr.h)2
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c70
-rw-r--r--fs/xfs/libxfs/xfs_bmap.h1
-rw-r--r--fs/xfs/libxfs/xfs_format.h8
-rw-r--r--fs/xfs/libxfs/xfs_sb.c5
-rw-r--r--fs/xfs/scrub/repair.c128
-rw-r--r--fs/xfs/scrub/scrub.c13
-rw-r--r--fs/xfs/xfs_aops.c4
-rw-r--r--fs/xfs/xfs_aops.h14
-rw-r--r--fs/xfs/xfs_bmap_util.c61
-rw-r--r--fs/xfs/xfs_buf.c109
-rw-r--r--fs/xfs/xfs_buf.h2
-rw-r--r--fs/xfs/xfs_fsops.c50
-rw-r--r--fs/xfs/xfs_ioctl.c8
-rw-r--r--fs/xfs/xfs_iomap.c53
-rw-r--r--fs/xfs/xfs_reflink.c33
-rw-r--r--fs/xfs/xfs_reflink.h4
-rw-r--r--fs/xfs/xfs_stats.c52
-rw-r--r--fs/xfs/xfs_stats.h28
-rw-r--r--fs/xfs/xfs_super.c38
-rw-r--r--fs/xfs/xfs_trans.h1
-rw-r--r--fs/xfs/xfs_trans_ail.c28
-rw-r--r--fs/xfs/xfs_trans_buf.c42
292 files changed, 9767 insertions, 5135 deletions
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index 082d227fa56b..6261719f6f2a 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -276,7 +276,7 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler,
switch (handler->flags) {
case ACL_TYPE_ACCESS:
if (acl) {
- struct iattr iattr;
+ struct iattr iattr = { 0 };
struct posix_acl *old_acl = acl;
retval = posix_acl_update_mode(inode, &iattr.ia_mode, &acl);
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 89bac3d2f05b..619128b55837 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -61,6 +61,8 @@ enum {
Opt_cache_loose, Opt_fscache, Opt_mmap,
/* Access options */
Opt_access, Opt_posixacl,
+ /* Lock timeout option */
+ Opt_locktimeout,
/* Error token */
Opt_err
};
@@ -80,6 +82,7 @@ static const match_table_t tokens = {
{Opt_cachetag, "cachetag=%s"},
{Opt_access, "access=%s"},
{Opt_posixacl, "posixacl"},
+ {Opt_locktimeout, "locktimeout=%u"},
{Opt_err, NULL}
};
@@ -187,6 +190,7 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
#ifdef CONFIG_9P_FSCACHE
v9ses->cachetag = NULL;
#endif
+ v9ses->session_lock_timeout = P9_LOCK_TIMEOUT;
if (!opts)
return 0;
@@ -359,6 +363,23 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
#endif
break;
+ case Opt_locktimeout:
+ r = match_int(&args[0], &option);
+ if (r < 0) {
+ p9_debug(P9_DEBUG_ERROR,
+ "integer field, but no integer?\n");
+ ret = r;
+ continue;
+ }
+ if (option < 1) {
+ p9_debug(P9_DEBUG_ERROR,
+ "locktimeout must be a greater than zero integer.\n");
+ ret = -EINVAL;
+ continue;
+ }
+ v9ses->session_lock_timeout = (long)option * HZ;
+ break;
+
default:
continue;
}
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 982e017acadb..129e5243a6bf 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -116,6 +116,7 @@ struct v9fs_session_info {
struct p9_client *clnt; /* 9p client */
struct list_head slist; /* list of sessions registered with v9fs */
struct rw_semaphore rename_sem;
+ long session_lock_timeout; /* retry interval for blocking locks */
};
/* cache_validity flags */
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index d5db3c968a03..00745147329d 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -76,15 +76,6 @@ static inline int dt_type(struct p9_wstat *mistat)
return rettype;
}
-static void p9stat_init(struct p9_wstat *stbuf)
-{
- stbuf->name = NULL;
- stbuf->uid = NULL;
- stbuf->gid = NULL;
- stbuf->muid = NULL;
- stbuf->extension = NULL;
-}
-
/**
* v9fs_alloc_rdir_buf - Allocate buffer used for read and readdir
* @filp: opened file structure
@@ -114,7 +105,6 @@ static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx)
int err = 0;
struct p9_fid *fid;
int buflen;
- int reclen = 0;
struct p9_rdir *rdir;
struct kvec kvec;
@@ -145,15 +135,12 @@ static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx)
rdir->tail = n;
}
while (rdir->head < rdir->tail) {
- p9stat_init(&st);
err = p9stat_read(fid->clnt, rdir->buf + rdir->head,
rdir->tail - rdir->head, &st);
- if (err) {
+ if (err <= 0) {
p9_debug(P9_DEBUG_VFS, "returned %d\n", err);
- p9stat_free(&st);
return -EIO;
}
- reclen = st.size+2;
over = !dir_emit(ctx, st.name, strlen(st.name),
v9fs_qid2ino(&st.qid), dt_type(&st));
@@ -161,8 +148,8 @@ static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx)
if (over)
return 0;
- rdir->head += reclen;
- ctx->pos += reclen;
+ rdir->head += err;
+ ctx->pos += err;
}
}
}
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 5f2e48d41d72..a25efa782fcc 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -154,6 +154,7 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
uint8_t status = P9_LOCK_ERROR;
int res = 0;
unsigned char fl_type;
+ struct v9fs_session_info *v9ses;
fid = filp->private_data;
BUG_ON(fid == NULL);
@@ -189,6 +190,8 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
if (IS_SETLKW(cmd))
flock.flags = P9_LOCK_FLAGS_BLOCK;
+ v9ses = v9fs_inode2v9ses(file_inode(filp));
+
/*
* if its a blocked request and we get P9_LOCK_BLOCKED as the status
* for lock request, keep on trying
@@ -202,8 +205,17 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
break;
if (status == P9_LOCK_BLOCKED && !IS_SETLKW(cmd))
break;
- if (schedule_timeout_interruptible(P9_LOCK_TIMEOUT) != 0)
+ if (schedule_timeout_interruptible(v9ses->session_lock_timeout)
+ != 0)
break;
+ /*
+ * p9_client_lock_dotl overwrites flock.client_id with the
+ * server message, free and reuse the client name
+ */
+ if (flock.client_id != fid->clnt->name) {
+ kfree(flock.client_id);
+ flock.client_id = fid->clnt->name;
+ }
}
/* map 9p status to VFS status */
@@ -216,7 +228,7 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
break;
default:
WARN_ONCE(1, "unknown lock status code: %d\n", status);
- /* fallthough */
+ /* fall through */
case P9_LOCK_ERROR:
case P9_LOCK_GRACE:
res = -ENOLCK;
@@ -235,6 +247,8 @@ out_unlock:
locks_lock_file_wait(filp, fl);
fl->fl_type = fl_type;
}
+ if (flock.client_id != fid->clnt->name)
+ kfree(flock.client_id);
out:
return res;
}
@@ -269,7 +283,7 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl)
res = p9_client_getlock_dotl(fid, &glock);
if (res < 0)
- return res;
+ goto out;
/* map 9p lock type to os lock type */
switch (glock.type) {
case P9_LOCK_TYPE_RDLCK:
@@ -290,7 +304,9 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl)
fl->fl_end = glock.start + glock.length - 1;
fl->fl_pid = -glock.proc_id;
}
- kfree(glock.client_id);
+out:
+ if (glock.client_id != fid->clnt->name)
+ kfree(glock.client_id);
return res;
}
diff --git a/fs/aio.c b/fs/aio.c
index b9350f3360c6..301e6314183b 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -2135,12 +2135,12 @@ COMPAT_SYSCALL_DEFINE5(io_getevents, compat_aio_context_t, ctx_id,
compat_long_t, min_nr,
compat_long_t, nr,
struct io_event __user *, events,
- struct compat_timespec __user *, timeout)
+ struct old_timespec32 __user *, timeout)
{
struct timespec64 t;
int ret;
- if (timeout && compat_get_timespec64(&t, timeout))
+ if (timeout && get_old_timespec32(&t, timeout))
return -EFAULT;
ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);
@@ -2160,7 +2160,7 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents,
compat_long_t, min_nr,
compat_long_t, nr,
struct io_event __user *, events,
- struct compat_timespec __user *, timeout,
+ struct old_timespec32 __user *, timeout,
const struct __compat_aio_sigset __user *, usig)
{
struct __compat_aio_sigset ksig = { NULL, };
@@ -2168,7 +2168,7 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents,
struct timespec64 t;
int ret;
- if (timeout && compat_get_timespec64(&t, timeout))
+ if (timeout && get_old_timespec32(&t, timeout))
return -EFAULT;
if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index efae2fb0930a..54207327f98f 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1580,7 +1580,7 @@ static void fill_auxv_note(struct memelfnote *note, struct mm_struct *mm)
}
static void fill_siginfo_note(struct memelfnote *note, user_siginfo_t *csigdata,
- const siginfo_t *siginfo)
+ const kernel_siginfo_t *siginfo)
{
mm_segment_t old_fs = get_fs();
set_fs(KERNEL_DS);
@@ -1782,7 +1782,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
static int fill_note_info(struct elfhdr *elf, int phdrs,
struct elf_note_info *info,
- const siginfo_t *siginfo, struct pt_regs *regs)
+ const kernel_siginfo_t *siginfo, struct pt_regs *regs)
{
struct task_struct *dump_task = current;
const struct user_regset_view *view = task_user_regset_view(dump_task);
@@ -2031,7 +2031,7 @@ static int elf_note_info_init(struct elf_note_info *info)
static int fill_note_info(struct elfhdr *elf, int phdrs,
struct elf_note_info *info,
- const siginfo_t *siginfo, struct pt_regs *regs)
+ const kernel_siginfo_t *siginfo, struct pt_regs *regs)
{
struct list_head *t;
struct core_thread *ct;
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index ae750b1574a2..68ebe188446a 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -112,11 +112,11 @@ static int find_extent_in_eb(const struct extent_buffer *eb,
}
struct preftree {
- struct rb_root root;
+ struct rb_root_cached root;
unsigned int count;
};
-#define PREFTREE_INIT { .root = RB_ROOT, .count = 0 }
+#define PREFTREE_INIT { .root = RB_ROOT_CACHED, .count = 0 }
struct preftrees {
struct preftree direct; /* BTRFS_SHARED_[DATA|BLOCK]_REF_KEY */
@@ -225,14 +225,15 @@ static void prelim_ref_insert(const struct btrfs_fs_info *fs_info,
struct prelim_ref *newref,
struct share_check *sc)
{
- struct rb_root *root;
+ struct rb_root_cached *root;
struct rb_node **p;
struct rb_node *parent = NULL;
struct prelim_ref *ref;
int result;
+ bool leftmost = true;
root = &preftree->root;
- p = &root->rb_node;
+ p = &root->rb_root.rb_node;
while (*p) {
parent = *p;
@@ -242,6 +243,7 @@ static void prelim_ref_insert(const struct btrfs_fs_info *fs_info,
p = &(*p)->rb_left;
} else if (result > 0) {
p = &(*p)->rb_right;
+ leftmost = false;
} else {
/* Identical refs, merge them and free @newref */
struct extent_inode_elem *eie = ref->inode_list;
@@ -272,7 +274,7 @@ static void prelim_ref_insert(const struct btrfs_fs_info *fs_info,
preftree->count++;
trace_btrfs_prelim_ref_insert(fs_info, newref, NULL, preftree->count);
rb_link_node(&newref->rbnode, parent, p);
- rb_insert_color(&newref->rbnode, root);
+ rb_insert_color_cached(&newref->rbnode, root, leftmost);
}
/*
@@ -283,11 +285,11 @@ static void prelim_release(struct preftree *preftree)
{
struct prelim_ref *ref, *next_ref;
- rbtree_postorder_for_each_entry_safe(ref, next_ref, &preftree->root,
- rbnode)
+ rbtree_postorder_for_each_entry_safe(ref, next_ref,
+ &preftree->root.rb_root, rbnode)
free_pref(ref);
- preftree->root = RB_ROOT;
+ preftree->root = RB_ROOT_CACHED;
preftree->count = 0;
}
@@ -627,7 +629,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
* freeing the entire indirect tree when we're done. In some test
* cases, the tree can grow quite large (~200k objects).
*/
- while ((rnode = rb_first(&preftrees->indirect.root))) {
+ while ((rnode = rb_first_cached(&preftrees->indirect.root))) {
struct prelim_ref *ref;
ref = rb_entry(rnode, struct prelim_ref, rbnode);
@@ -637,7 +639,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
goto out;
}
- rb_erase(&ref->rbnode, &preftrees->indirect.root);
+ rb_erase_cached(&ref->rbnode, &preftrees->indirect.root);
preftrees->indirect.count--;
if (ref->count == 0) {
@@ -717,9 +719,9 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info,
struct preftree *tree = &preftrees->indirect_missing_keys;
struct rb_node *node;
- while ((node = rb_first(&tree->root))) {
+ while ((node = rb_first_cached(&tree->root))) {
ref = rb_entry(node, struct prelim_ref, rbnode);
- rb_erase(node, &tree->root);
+ rb_erase_cached(node, &tree->root);
BUG_ON(ref->parent); /* should not be a direct ref */
BUG_ON(ref->key_for_search.type);
@@ -769,7 +771,7 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
btrfs_disk_key_to_cpu(&tmp_op_key, &extent_op->key);
spin_lock(&head->lock);
- for (n = rb_first(&head->ref_tree); n; n = rb_next(n)) {
+ for (n = rb_first_cached(&head->ref_tree); n; n = rb_next(n)) {
node = rb_entry(n, struct btrfs_delayed_ref_node,
ref_node);
if (node->seq > seq)
@@ -1229,14 +1231,14 @@ again:
if (ret)
goto out;
- WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect_missing_keys.root));
+ WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect_missing_keys.root.rb_root));
ret = resolve_indirect_refs(fs_info, path, time_seq, &preftrees,
extent_item_pos, total_refs, sc, ignore_offset);
if (ret)
goto out;
- WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect.root));
+ WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect.root.rb_root));
/*
* This walks the tree of merged and resolved refs. Tree blocks are
@@ -1245,7 +1247,7 @@ again:
*
* We release the entire tree in one go before returning.
*/
- node = rb_first(&preftrees.direct.root);
+ node = rb_first_cached(&preftrees.direct.root);
while (node) {
ref = rb_entry(node, struct prelim_ref, rbnode);
node = rb_next(&ref->rbnode);
@@ -1468,7 +1470,7 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr)
struct seq_list elem = SEQ_LIST_INIT(elem);
int ret = 0;
struct share_check shared = {
- .root_objectid = root->objectid,
+ .root_objectid = root->root_key.objectid,
.inum = inum,
.share_count = 0,
};
@@ -2031,7 +2033,8 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
/* path must be released before calling iterate()! */
btrfs_debug(fs_root->fs_info,
"following ref at offset %u for inode %llu in tree %llu",
- cur, found_key.objectid, fs_root->objectid);
+ cur, found_key.objectid,
+ fs_root->root_key.objectid);
ret = iterate(parent, name_len,
(unsigned long)(iref + 1), eb, ctx);
if (ret)
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 1343ac57b438..97d91e55b70a 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -206,7 +206,7 @@ static inline struct btrfs_inode *BTRFS_I(const struct inode *inode)
static inline unsigned long btrfs_inode_hash(u64 objectid,
const struct btrfs_root *root)
{
- u64 h = objectid ^ (root->objectid * GOLDEN_RATIO_PRIME);
+ u64 h = objectid ^ (root->root_key.objectid * GOLDEN_RATIO_PRIME);
#if BITS_PER_LONG == 32
h = (h >> 32) ^ (h & 0xffffffff);
@@ -339,15 +339,15 @@ static inline void btrfs_print_data_csum_error(struct btrfs_inode *inode,
struct btrfs_root *root = inode->root;
/* Output minus objectid, which is more meaningful */
- if (root->objectid >= BTRFS_LAST_FREE_OBJECTID)
+ if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID)
btrfs_warn_rl(root->fs_info,
"csum failed root %lld ino %lld off %llu csum 0x%08x expected csum 0x%08x mirror %d",
- root->objectid, btrfs_ino(inode),
+ root->root_key.objectid, btrfs_ino(inode),
logical_start, csum, csum_expected, mirror_num);
else
btrfs_warn_rl(root->fs_info,
"csum failed root %llu ino %llu off %llu csum 0x%08x expected csum 0x%08x mirror %d",
- root->objectid, btrfs_ino(inode),
+ root->root_key.objectid, btrfs_ino(inode),
logical_start, csum, csum_expected, mirror_num);
}
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 833cf3c35b4d..2e43fba44035 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -1594,6 +1594,7 @@ static int btrfsic_read_block(struct btrfsic_state *state,
{
unsigned int num_pages;
unsigned int i;
+ size_t size;
u64 dev_bytenr;
int ret;
@@ -1608,9 +1609,8 @@ static int btrfsic_read_block(struct btrfsic_state *state,
num_pages = (block_ctx->len + (u64)PAGE_SIZE - 1) >>
PAGE_SHIFT;
- block_ctx->mem_to_free = kcalloc(sizeof(*block_ctx->datav) +
- sizeof(*block_ctx->pagev),
- num_pages, GFP_NOFS);
+ size = sizeof(*block_ctx->datav) + sizeof(*block_ctx->pagev);
+ block_ctx->mem_to_free = kcalloc(num_pages, size, GFP_NOFS);
if (!block_ctx->mem_to_free)
return -ENOMEM;
block_ctx->datav = block_ctx->mem_to_free;
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 9bfa66592aa7..2955a4ea2fa8 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -437,10 +437,8 @@ static noinline int add_ra_bio_pages(struct inode *inode,
if (pg_index > end_index)
break;
- rcu_read_lock();
- page = radix_tree_lookup(&mapping->i_pages, pg_index);
- rcu_read_unlock();
- if (page && !radix_tree_exceptional_entry(page)) {
+ page = xa_load(&mapping->i_pages, pg_index);
+ if (page && !xa_is_value(page)) {
misses++;
if (misses > 4)
break;
@@ -528,7 +526,6 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
int mirror_num, unsigned long bio_flags)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct extent_io_tree *tree;
struct extent_map_tree *em_tree;
struct compressed_bio *cb;
unsigned long compressed_len;
@@ -545,7 +542,6 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
int faili = 0;
u32 *sums;
- tree = &BTRFS_I(inode)->io_tree;
em_tree = &BTRFS_I(inode)->extent_tree;
/* we need the actual starting offset of this extent in the file */
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index d436fb4c002e..539901fb5165 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -52,42 +52,6 @@ noinline void btrfs_set_path_blocking(struct btrfs_path *p)
}
}
-/*
- * reset all the locked nodes in the patch to spinning locks.
- *
- * held is used to keep lockdep happy, when lockdep is enabled
- * we set held to a blocking lock before we go around and
- * retake all the spinlocks in the path. You can safely use NULL
- * for held
- */
-noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
- struct extent_buffer *held, int held_rw)
-{
- int i;
-
- if (held) {
- btrfs_set_lock_blocking_rw(held, held_rw);
- if (held_rw == BTRFS_WRITE_LOCK)
- held_rw = BTRFS_WRITE_LOCK_BLOCKING;
- else if (held_rw == BTRFS_READ_LOCK)
- held_rw = BTRFS_READ_LOCK_BLOCKING;
- }
- btrfs_set_path_blocking(p);
-
- for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
- if (p->nodes[i] && p->locks[i]) {
- btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]);
- if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING)
- p->locks[i] = BTRFS_WRITE_LOCK;
- else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING)
- p->locks[i] = BTRFS_READ_LOCK;
- }
- }
-
- if (held)
- btrfs_clear_lock_blocking_rw(held, held_rw);
-}
-
/* this also releases the path */
void btrfs_free_path(struct btrfs_path *p)
{
@@ -207,7 +171,7 @@ static void add_root_to_dirty_list(struct btrfs_root *root)
spin_lock(&fs_info->trans_lock);
if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
/* Want the extent tree to be the last on the list */
- if (root->objectid == BTRFS_EXTENT_TREE_OBJECTID)
+ if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
list_move_tail(&root->dirty_list,
&fs_info->dirty_cowonly_roots);
else
@@ -1050,9 +1014,26 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
parent_start = parent->start;
+ /*
+ * If we are COWing a node/leaf from the extent, chunk or device trees,
+ * make sure that we do not finish block group creation of pending block
+ * groups. We do this to avoid a deadlock.
+ * COWing can result in allocation of a new chunk, and flushing pending
+ * block groups (btrfs_create_pending_block_groups()) can be triggered
+ * when finishing allocation of a new chunk. Creation of a pending block
+ * group modifies the extent, chunk and device trees, therefore we could
+ * deadlock with ourselves since we are holding a lock on an extent
+ * buffer that btrfs_create_pending_block_groups() may try to COW later.
+ */
+ if (root == fs_info->extent_root ||
+ root == fs_info->chunk_root ||
+ root == fs_info->dev_root)
+ trans->can_flush_pending_bgs = false;
+
cow = btrfs_alloc_tree_block(trans, root, parent_start,
root->root_key.objectid, &disk_key, level,
search_start, empty_size);
+ trans->can_flush_pending_bgs = true;
if (IS_ERR(cow))
return PTR_ERR(cow);
@@ -1306,7 +1287,6 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
}
}
- btrfs_clear_path_blocking(path, NULL, BTRFS_READ_LOCK);
btrfs_tree_read_unlock_blocking(eb);
free_extent_buffer(eb);
@@ -1815,8 +1795,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
int orig_slot = path->slots[level];
u64 orig_ptr;
- if (level == 0)
- return 0;
+ ASSERT(level > 0);
mid = path->nodes[level];
@@ -2483,7 +2462,6 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
btrfs_set_path_blocking(p);
reada_for_balance(fs_info, p, level);
sret = split_node(trans, root, p, level);
- btrfs_clear_path_blocking(p, NULL, 0);
BUG_ON(sret > 0);
if (sret) {
@@ -2504,7 +2482,6 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
btrfs_set_path_blocking(p);
reada_for_balance(fs_info, p, level);
sret = balance_level(trans, root, p, level);
- btrfs_clear_path_blocking(p, NULL, 0);
if (sret) {
ret = sret;
@@ -2789,7 +2766,10 @@ again:
}
cow_done:
p->nodes[level] = b;
- btrfs_clear_path_blocking(p, NULL, 0);
+ /*
+ * Leave path with blocking locks to avoid massive
+ * lock context switch, this is made on purpose.
+ */
/*
* we have a lock on b and as long as we aren't changing
@@ -2871,8 +2851,6 @@ cow_done:
if (!err) {
btrfs_set_path_blocking(p);
btrfs_tree_lock(b);
- btrfs_clear_path_blocking(p, b,
- BTRFS_WRITE_LOCK);
}
p->locks[level] = BTRFS_WRITE_LOCK;
} else {
@@ -2880,8 +2858,6 @@ cow_done:
if (!err) {
btrfs_set_path_blocking(p);
btrfs_tree_read_lock(b);
- btrfs_clear_path_blocking(p, b,
- BTRFS_READ_LOCK);
}
p->locks[level] = BTRFS_READ_LOCK;
}
@@ -2900,7 +2876,6 @@ cow_done:
btrfs_set_path_blocking(p);
err = split_leaf(trans, root, key,
p, ins_len, ret == 0);
- btrfs_clear_path_blocking(p, NULL, 0);
BUG_ON(err > 0);
if (err) {
@@ -2910,7 +2885,7 @@ cow_done:
}
if (!p->search_for_split)
unlock_up(p, level, lowest_unlock,
- min_write_lock_level, &write_lock_level);
+ min_write_lock_level, NULL);
goto done;
}
}
@@ -2961,13 +2936,16 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
again:
b = get_old_root(root, time_seq);
+ if (!b) {
+ ret = -EIO;
+ goto done;
+ }
level = btrfs_header_level(b);
p->locks[level] = BTRFS_READ_LOCK;
while (b) {
level = btrfs_header_level(b);
p->nodes[level] = b;
- btrfs_clear_path_blocking(p, NULL, 0);
/*
* we have a lock on b and as long as we aren't changing
@@ -3013,8 +2991,6 @@ again:
if (!err) {
btrfs_set_path_blocking(p);
btrfs_tree_read_lock(b);
- btrfs_clear_path_blocking(p, b,
- BTRFS_READ_LOCK);
}
b = tree_mod_log_rewind(fs_info, p, b, time_seq);
if (!b) {
@@ -5198,7 +5174,6 @@ find_next_key:
path->locks[level - 1] = BTRFS_READ_LOCK;
path->nodes[level - 1] = cur;
unlock_up(path, level, 1, 0, NULL);
- btrfs_clear_path_blocking(path, NULL, 0);
}
out:
path->keep_locks = keep_locks;
@@ -5783,8 +5758,6 @@ again:
if (!ret) {
btrfs_set_path_blocking(path);
btrfs_tree_read_lock(next);
- btrfs_clear_path_blocking(path, next,
- BTRFS_READ_LOCK);
}
next_rw_lock = BTRFS_READ_LOCK;
}
@@ -5820,8 +5793,6 @@ again:
if (!ret) {
btrfs_set_path_blocking(path);
btrfs_tree_read_lock(next);
- btrfs_clear_path_blocking(path, next,
- BTRFS_READ_LOCK);
}
next_rw_lock = BTRFS_READ_LOCK;
}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2cddfe7806a4..68ca41dbbef3 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -41,12 +41,6 @@ extern struct kmem_cache *btrfs_path_cachep;
extern struct kmem_cache *btrfs_free_space_cachep;
struct btrfs_ordered_sum;
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-#define STATIC noinline
-#else
-#define STATIC static noinline
-#endif
-
#define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */
#define BTRFS_MAX_MIRRORS 3
@@ -367,11 +361,13 @@ struct btrfs_dev_replace {
struct mutex lock_finishing_cancel_unmount;
rwlock_t lock;
- atomic_t read_locks;
atomic_t blocking_readers;
wait_queue_head_t read_lock_wq;
struct btrfs_scrub_progress scrub_progress;
+
+ struct percpu_counter bio_counter;
+ wait_queue_head_t replace_wait;
};
/* For raid type sysfs entries */
@@ -1094,9 +1090,6 @@ struct btrfs_fs_info {
/* device replace state */
struct btrfs_dev_replace dev_replace;
- struct percpu_counter bio_counter;
- wait_queue_head_t replace_wait;
-
struct semaphore uuid_tree_rescan_sem;
/* Used to reclaim the metadata space in the background. */
@@ -1202,18 +1195,12 @@ struct btrfs_root {
int last_log_commit;
pid_t log_start_pid;
- u64 objectid;
u64 last_trans;
u32 type;
u64 highest_objectid;
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
- /* only used with CONFIG_BTRFS_FS_RUN_SANITY_TESTS is enabled */
- u64 alloc_bytenr;
-#endif
-
u64 defrag_trans_start;
struct btrfs_key defrag_progress;
struct btrfs_key defrag_max;
@@ -1286,6 +1273,10 @@ struct btrfs_root {
spinlock_t qgroup_meta_rsv_lock;
u64 qgroup_meta_rsv_pertrans;
u64 qgroup_meta_rsv_prealloc;
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ u64 alloc_bytenr;
+#endif
};
struct btrfs_file_private {
@@ -2607,10 +2598,8 @@ static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_fs_info *fs_info,
return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items;
}
-int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info);
-int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info);
+int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans);
+int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans);
void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
const u64 start);
void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg);
@@ -2771,7 +2760,7 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
enum btrfs_reserve_flush_enum flush);
int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
struct btrfs_block_rsv *dst_rsv, u64 num_bytes,
- int update_size);
+ bool update_size);
int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *dest, u64 num_bytes,
int min_factor);
@@ -2877,8 +2866,6 @@ void btrfs_release_path(struct btrfs_path *p);
struct btrfs_path *btrfs_alloc_path(void);
void btrfs_free_path(struct btrfs_path *p);
void btrfs_set_path_blocking(struct btrfs_path *p);
-void btrfs_clear_path_blocking(struct btrfs_path *p,
- struct extent_buffer *held, int held_rw);
void btrfs_unlock_up_safe(struct btrfs_path *p, int level);
int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
@@ -3021,8 +3008,7 @@ int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info,
/* dir-item.c */
int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
const char *name, int name_len);
-int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, const char *name,
+int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const char *name,
int name_len, struct btrfs_inode *dir,
struct btrfs_key *location, u8 type, u64 index);
struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
@@ -3180,8 +3166,8 @@ void __cold btrfs_destroy_cachep(void);
struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
struct btrfs_root *root, int *was_new);
struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
- struct page *page, size_t pg_offset,
- u64 start, u64 end, int create);
+ struct page *page, size_t pg_offset,
+ u64 start, u64 end, int create);
int btrfs_update_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *inode);
@@ -3201,9 +3187,6 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
u64 start, u64 num_bytes, u64 min_size,
loff_t actual_len, u64 *alloc_hint);
extern const struct dentry_operations btrfs_dentry_operations;
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-void btrfs_test_inode_set_ops(struct inode *inode);
-#endif
/* ioctl.c */
long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
@@ -3716,18 +3699,19 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
/* Sanity test specific functions */
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+void btrfs_test_inode_set_ops(struct inode *inode);
void btrfs_test_destroy_inode(struct inode *inode);
-#endif
static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
{
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
- if (unlikely(test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO,
- &fs_info->fs_state)))
- return 1;
-#endif
+ return test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
+}
+#else
+static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
+{
return 0;
}
+#endif
static inline void cond_wake_up(struct wait_queue_head *wq)
{
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index f51b509f2d9b..c669f250d4a0 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -42,8 +42,8 @@ static inline void btrfs_init_delayed_node(
delayed_node->root = root;
delayed_node->inode_id = inode_id;
refcount_set(&delayed_node->refs, 0);
- delayed_node->ins_root = RB_ROOT;
- delayed_node->del_root = RB_ROOT;
+ delayed_node->ins_root = RB_ROOT_CACHED;
+ delayed_node->del_root = RB_ROOT_CACHED;
mutex_init(&delayed_node->mutex);
INIT_LIST_HEAD(&delayed_node->n_list);
INIT_LIST_HEAD(&delayed_node->p_list);
@@ -390,7 +390,7 @@ static struct btrfs_delayed_item *__btrfs_lookup_delayed_insertion_item(
struct btrfs_delayed_node *delayed_node,
struct btrfs_key *key)
{
- return __btrfs_lookup_delayed_item(&delayed_node->ins_root, key,
+ return __btrfs_lookup_delayed_item(&delayed_node->ins_root.rb_root, key,
NULL, NULL);
}
@@ -400,9 +400,10 @@ static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
{
struct rb_node **p, *node;
struct rb_node *parent_node = NULL;
- struct rb_root *root;
+ struct rb_root_cached *root;
struct btrfs_delayed_item *item;
int cmp;
+ bool leftmost = true;
if (action == BTRFS_DELAYED_INSERTION_ITEM)
root = &delayed_node->ins_root;
@@ -410,7 +411,7 @@ static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
root = &delayed_node->del_root;
else
BUG();
- p = &root->rb_node;
+ p = &root->rb_root.rb_node;
node = &ins->rb_node;
while (*p) {
@@ -419,16 +420,18 @@ static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
rb_node);
cmp = btrfs_comp_cpu_keys(&item->key, &ins->key);
- if (cmp < 0)
+ if (cmp < 0) {
p = &(*p)->rb_right;
- else if (cmp > 0)
+ leftmost = false;
+ } else if (cmp > 0) {
p = &(*p)->rb_left;
- else
+ } else {
return -EEXIST;
+ }
}
rb_link_node(node, parent_node, p);
- rb_insert_color(node, root);
+ rb_insert_color_cached(node, root, leftmost);
ins->delayed_node = delayed_node;
ins->ins_or_del = action;
@@ -468,7 +471,7 @@ static void finish_one_item(struct btrfs_delayed_root *delayed_root)
static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item)
{
- struct rb_root *root;
+ struct rb_root_cached *root;
struct btrfs_delayed_root *delayed_root;
delayed_root = delayed_item->delayed_node->root->fs_info->delayed_root;
@@ -482,7 +485,7 @@ static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item)
else
root = &delayed_item->delayed_node->del_root;
- rb_erase(&delayed_item->rb_node, root);
+ rb_erase_cached(&delayed_item->rb_node, root);
delayed_item->delayed_node->count--;
finish_one_item(delayed_root);
@@ -503,7 +506,7 @@ static struct btrfs_delayed_item *__btrfs_first_delayed_insertion_item(
struct rb_node *p;
struct btrfs_delayed_item *item = NULL;
- p = rb_first(&delayed_node->ins_root);
+ p = rb_first_cached(&delayed_node->ins_root);
if (p)
item = rb_entry(p, struct btrfs_delayed_item, rb_node);
@@ -516,7 +519,7 @@ static struct btrfs_delayed_item *__btrfs_first_delayed_deletion_item(
struct rb_node *p;
struct btrfs_delayed_item *item = NULL;
- p = rb_first(&delayed_node->del_root);
+ p = rb_first_cached(&delayed_node->del_root);
if (p)
item = rb_entry(p, struct btrfs_delayed_item, rb_node);
@@ -559,7 +562,7 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
* reserved space when starting a transaction. So no need to reserve
* qgroup space here.
*/
- ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1);
+ ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, true);
if (!ret) {
trace_btrfs_space_reservation(fs_info, "delayed_item",
item->key.objectid,
@@ -647,7 +650,7 @@ static int btrfs_delayed_inode_reserve_metadata(
return ret;
}
- ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1);
+ ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, true);
if (!ret) {
trace_btrfs_space_reservation(fs_info, "delayed_inode",
btrfs_ino(inode), num_bytes, 1);
@@ -762,9 +765,6 @@ static int btrfs_batch_insert_items(struct btrfs_root *root,
i++;
}
- /* reset all the locked nodes in the patch to spinning locks. */
- btrfs_clear_path_blocking(path, NULL, 0);
-
/* insert the keys of the items */
setup_items_for_insert(root, path, keys, data_size,
total_data_size, total_size, nitems);
@@ -1462,7 +1462,7 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
if (unlikely(ret)) {
btrfs_err(trans->fs_info,
"err add delayed dir index item(name: %.*s) into the insertion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)",
- name_len, name, delayed_node->root->objectid,
+ name_len, name, delayed_node->root->root_key.objectid,
delayed_node->inode_id, ret);
BUG();
}
@@ -1533,7 +1533,8 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
if (unlikely(ret)) {
btrfs_err(trans->fs_info,
"err add delayed dir index item(index: %llu) into the deletion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)",
- index, node->root->objectid, node->inode_id, ret);
+ index, node->root->root_key.objectid,
+ node->inode_id, ret);
BUG();
}
mutex_unlock(&node->mutex);
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index 33536cd681d4..74ae226ffaf0 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -50,8 +50,8 @@ struct btrfs_delayed_node {
* is waiting to be dealt with by the async worker.
*/
struct list_head p_list;
- struct rb_root ins_root;
- struct rb_root del_root;
+ struct rb_root_cached ins_root;
+ struct rb_root_cached del_root;
struct mutex mutex;
struct btrfs_inode_item inode_item;
refcount_t refs;
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 62ff545ba1f7..9301b3ad9217 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -101,14 +101,15 @@ static int comp_refs(struct btrfs_delayed_ref_node *ref1,
}
/* insert a new ref to head ref rbtree */
-static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root,
+static struct btrfs_delayed_ref_head *htree_insert(struct rb_root_cached *root,
struct rb_node *node)
{
- struct rb_node **p = &root->rb_node;
+ struct rb_node **p = &root->rb_root.rb_node;
struct rb_node *parent_node = NULL;
struct btrfs_delayed_ref_head *entry;
struct btrfs_delayed_ref_head *ins;
u64 bytenr;
+ bool leftmost = true;
ins = rb_entry(node, struct btrfs_delayed_ref_head, href_node);
bytenr = ins->bytenr;
@@ -117,26 +118,29 @@ static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root,
entry = rb_entry(parent_node, struct btrfs_delayed_ref_head,
href_node);
- if (bytenr < entry->bytenr)
+ if (bytenr < entry->bytenr) {
p = &(*p)->rb_left;
- else if (bytenr > entry->bytenr)
+ } else if (bytenr > entry->bytenr) {
p = &(*p)->rb_right;
- else
+ leftmost = false;
+ } else {
return entry;
+ }
}
rb_link_node(node, parent_node, p);
- rb_insert_color(node, root);
+ rb_insert_color_cached(node, root, leftmost);
return NULL;
}
-static struct btrfs_delayed_ref_node* tree_insert(struct rb_root *root,
+static struct btrfs_delayed_ref_node* tree_insert(struct rb_root_cached *root,
struct btrfs_delayed_ref_node *ins)
{
- struct rb_node **p = &root->rb_node;
+ struct rb_node **p = &root->rb_root.rb_node;
struct rb_node *node = &ins->ref_node;
struct rb_node *parent_node = NULL;
struct btrfs_delayed_ref_node *entry;
+ bool leftmost = true;
while (*p) {
int comp;
@@ -145,29 +149,46 @@ static struct btrfs_delayed_ref_node* tree_insert(struct rb_root *root,
entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
ref_node);
comp = comp_refs(ins, entry, true);
- if (comp < 0)
+ if (comp < 0) {
p = &(*p)->rb_left;
- else if (comp > 0)
+ } else if (comp > 0) {
p = &(*p)->rb_right;
- else
+ leftmost = false;
+ } else {
return entry;
+ }
}
rb_link_node(node, parent_node, p);
- rb_insert_color(node, root);
+ rb_insert_color_cached(node, root, leftmost);
return NULL;
}
+static struct btrfs_delayed_ref_head *find_first_ref_head(
+ struct btrfs_delayed_ref_root *dr)
+{
+ struct rb_node *n;
+ struct btrfs_delayed_ref_head *entry;
+
+ n = rb_first_cached(&dr->href_root);
+ if (!n)
+ return NULL;
+
+ entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node);
+
+ return entry;
+}
+
/*
- * find an head entry based on bytenr. This returns the delayed ref
- * head if it was able to find one, or NULL if nothing was in that spot.
- * If return_bigger is given, the next bigger entry is returned if no exact
- * match is found.
+ * Find a head entry based on bytenr. This returns the delayed ref head if it
+ * was able to find one, or NULL if nothing was in that spot. If return_bigger
+ * is given, the next bigger entry is returned if no exact match is found.
*/
-static struct btrfs_delayed_ref_head *
-find_ref_head(struct rb_root *root, u64 bytenr,
- int return_bigger)
+static struct btrfs_delayed_ref_head *find_ref_head(
+ struct btrfs_delayed_ref_root *dr, u64 bytenr,
+ bool return_bigger)
{
+ struct rb_root *root = &dr->href_root.rb_root;
struct rb_node *n;
struct btrfs_delayed_ref_head *entry;
@@ -187,22 +208,18 @@ find_ref_head(struct rb_root *root, u64 bytenr,
if (bytenr > entry->bytenr) {
n = rb_next(&entry->href_node);
if (!n)
- n = rb_first(root);
+ return NULL;
entry = rb_entry(n, struct btrfs_delayed_ref_head,
href_node);
- return entry;
}
return entry;
}
return NULL;
}
-int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
+int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_head *head)
{
- struct btrfs_delayed_ref_root *delayed_refs;
-
- delayed_refs = &trans->transaction->delayed_refs;
lockdep_assert_held(&delayed_refs->lock);
if (mutex_trylock(&head->mutex))
return 0;
@@ -227,7 +244,7 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *ref)
{
lockdep_assert_held(&head->lock);
- rb_erase(&ref->ref_node, &head->ref_tree);
+ rb_erase_cached(&ref->ref_node, &head->ref_tree);
RB_CLEAR_NODE(&ref->ref_node);
if (!list_empty(&ref->add_list))
list_del(&ref->add_list);
@@ -296,7 +313,7 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
lockdep_assert_held(&head->lock);
- if (RB_EMPTY_ROOT(&head->ref_tree))
+ if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
return;
/* We don't have too many refs to merge for data. */
@@ -314,7 +331,8 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
spin_unlock(&fs_info->tree_mod_seq_lock);
again:
- for (node = rb_first(&head->ref_tree); node; node = rb_next(node)) {
+ for (node = rb_first_cached(&head->ref_tree); node;
+ node = rb_next(node)) {
ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
if (seq && ref->seq >= seq)
continue;
@@ -345,40 +363,29 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq)
return ret;
}
-struct btrfs_delayed_ref_head *
-btrfs_select_ref_head(struct btrfs_trans_handle *trans)
+struct btrfs_delayed_ref_head *btrfs_select_ref_head(
+ struct btrfs_delayed_ref_root *delayed_refs)
{
- struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_delayed_ref_head *head;
- u64 start;
- bool loop = false;
-
- delayed_refs = &trans->transaction->delayed_refs;
again:
- start = delayed_refs->run_delayed_start;
- head = find_ref_head(&delayed_refs->href_root, start, 1);
- if (!head && !loop) {
+ head = find_ref_head(delayed_refs, delayed_refs->run_delayed_start,
+ true);
+ if (!head && delayed_refs->run_delayed_start != 0) {
delayed_refs->run_delayed_start = 0;
- start = 0;
- loop = true;
- head = find_ref_head(&delayed_refs->href_root, start, 1);
- if (!head)
- return NULL;
- } else if (!head && loop) {
- return NULL;
+ head = find_first_ref_head(delayed_refs);
}
+ if (!head)
+ return NULL;
while (head->processing) {
struct rb_node *node;
node = rb_next(&head->href_node);
if (!node) {
- if (loop)
+ if (delayed_refs->run_delayed_start == 0)
return NULL;
delayed_refs->run_delayed_start = 0;
- start = 0;
- loop = true;
goto again;
}
head = rb_entry(node, struct btrfs_delayed_ref_head,
@@ -569,7 +576,7 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
head_ref->must_insert_reserved = must_insert_reserved;
head_ref->is_data = is_data;
head_ref->is_system = is_system;
- head_ref->ref_tree = RB_ROOT;
+ head_ref->ref_tree = RB_ROOT_CACHED;
INIT_LIST_HEAD(&head_ref->ref_add_list);
RB_CLEAR_NODE(&head_ref->href_node);
head_ref->processing = 0;
@@ -903,7 +910,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_head *
btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 bytenr)
{
- return find_ref_head(&delayed_refs->href_root, bytenr, 0);
+ return find_ref_head(delayed_refs, bytenr, false);
}
void __cold btrfs_delayed_ref_exit(void)
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index d9f2a4ebd5db..8e20c5cb5404 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -79,7 +79,7 @@ struct btrfs_delayed_ref_head {
struct mutex mutex;
spinlock_t lock;
- struct rb_root ref_tree;
+ struct rb_root_cached ref_tree;
/* accumulate add BTRFS_ADD_DELAYED_REF nodes to this ref_add_list. */
struct list_head ref_add_list;
@@ -148,7 +148,7 @@ struct btrfs_delayed_data_ref {
struct btrfs_delayed_ref_root {
/* head ref rbtree */
- struct rb_root href_root;
+ struct rb_root_cached href_root;
/* dirty extent records */
struct rb_root dirty_extent_root;
@@ -255,7 +255,7 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *
btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
u64 bytenr);
-int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
+int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_head *head);
static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head)
{
@@ -263,8 +263,8 @@ static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head)
}
-struct btrfs_delayed_ref_head *
-btrfs_select_ref_head(struct btrfs_trans_handle *trans);
+struct btrfs_delayed_ref_head *btrfs_select_ref_head(
+ struct btrfs_delayed_ref_root *delayed_refs);
int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq);
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index dec01970d8c5..2aa48aecc52b 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -382,14 +382,6 @@ out:
return ret;
}
-void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info)
-{
- struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
-
- dev_replace->committed_cursor_left =
- dev_replace->cursor_left_last_write_of_item;
-}
-
static char* btrfs_dev_name(struct btrfs_device *device)
{
if (!device || test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
@@ -408,11 +400,12 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
int ret;
struct btrfs_device *tgt_device = NULL;
struct btrfs_device *src_device = NULL;
+ bool need_unlock;
- ret = btrfs_find_device_by_devspec(fs_info, srcdevid,
- srcdev_name, &src_device);
- if (ret)
- return ret;
+ src_device = btrfs_find_device_by_devspec(fs_info, srcdevid,
+ srcdev_name);
+ if (IS_ERR(src_device))
+ return PTR_ERR(src_device);
ret = btrfs_init_dev_replace_tgtdev(fs_info, tgtdev_name,
src_device, &tgt_device);
@@ -432,6 +425,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
return PTR_ERR(trans);
}
+ need_unlock = true;
btrfs_dev_replace_write_lock(dev_replace);
switch (dev_replace->replace_state) {
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
@@ -440,6 +434,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+ ASSERT(0);
ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED;
goto leave;
}
@@ -470,6 +465,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
atomic64_set(&dev_replace->num_write_errors, 0);
atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
btrfs_dev_replace_write_unlock(dev_replace);
+ need_unlock = false;
ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device);
if (ret)
@@ -481,7 +477,12 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
+ need_unlock = true;
btrfs_dev_replace_write_lock(dev_replace);
+ dev_replace->replace_state =
+ BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED;
+ dev_replace->srcdev = NULL;
+ dev_replace->tgtdev = NULL;
goto leave;
}
@@ -503,9 +504,8 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
return ret;
leave:
- dev_replace->srcdev = NULL;
- dev_replace->tgtdev = NULL;
- btrfs_dev_replace_write_unlock(dev_replace);
+ if (need_unlock)
+ btrfs_dev_replace_write_unlock(dev_replace);
btrfs_destroy_dev_replace_tgtdev(tgt_device);
return ret;
}
@@ -545,8 +545,8 @@ int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info,
static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info)
{
set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
- wait_event(fs_info->replace_wait, !percpu_counter_sum(
- &fs_info->bio_counter));
+ wait_event(fs_info->dev_replace.replace_wait, !percpu_counter_sum(
+ &fs_info->dev_replace.bio_counter));
}
/*
@@ -555,7 +555,7 @@ static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info)
static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info)
{
clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
- wake_up(&fs_info->replace_wait);
+ wake_up(&fs_info->dev_replace.replace_wait);
}
static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
@@ -961,13 +961,10 @@ int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
void btrfs_dev_replace_read_lock(struct btrfs_dev_replace *dev_replace)
{
read_lock(&dev_replace->lock);
- atomic_inc(&dev_replace->read_locks);
}
void btrfs_dev_replace_read_unlock(struct btrfs_dev_replace *dev_replace)
{
- ASSERT(atomic_read(&dev_replace->read_locks) > 0);
- atomic_dec(&dev_replace->read_locks);
read_unlock(&dev_replace->lock);
}
@@ -985,7 +982,6 @@ again:
void btrfs_dev_replace_write_unlock(struct btrfs_dev_replace *dev_replace)
{
- ASSERT(atomic_read(&dev_replace->blocking_readers) == 0);
write_unlock(&dev_replace->lock);
}
@@ -994,45 +990,31 @@ void btrfs_dev_replace_set_lock_blocking(
struct btrfs_dev_replace *dev_replace)
{
/* only set blocking for read lock */
- ASSERT(atomic_read(&dev_replace->read_locks) > 0);
atomic_inc(&dev_replace->blocking_readers);
read_unlock(&dev_replace->lock);
}
-/* acquire read lock and dec blocking cnt */
-void btrfs_dev_replace_clear_lock_blocking(
- struct btrfs_dev_replace *dev_replace)
-{
- /* only set blocking for read lock */
- ASSERT(atomic_read(&dev_replace->read_locks) > 0);
- ASSERT(atomic_read(&dev_replace->blocking_readers) > 0);
- read_lock(&dev_replace->lock);
- /* Barrier implied by atomic_dec_and_test */
- if (atomic_dec_and_test(&dev_replace->blocking_readers))
- cond_wake_up_nomb(&dev_replace->read_lock_wq);
-}
-
void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
{
- percpu_counter_inc(&fs_info->bio_counter);
+ percpu_counter_inc(&fs_info->dev_replace.bio_counter);
}
void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount)
{
- percpu_counter_sub(&fs_info->bio_counter, amount);
- cond_wake_up_nomb(&fs_info->replace_wait);
+ percpu_counter_sub(&fs_info->dev_replace.bio_counter, amount);
+ cond_wake_up_nomb(&fs_info->dev_replace.replace_wait);
}
void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info)
{
while (1) {
- percpu_counter_inc(&fs_info->bio_counter);
+ percpu_counter_inc(&fs_info->dev_replace.bio_counter);
if (likely(!test_bit(BTRFS_FS_STATE_DEV_REPLACING,
&fs_info->fs_state)))
break;
btrfs_bio_counter_dec(fs_info);
- wait_event(fs_info->replace_wait,
+ wait_event(fs_info->dev_replace.replace_wait,
!test_bit(BTRFS_FS_STATE_DEV_REPLACING,
&fs_info->fs_state));
}
diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h
index b6d4206188bb..795c551f5b5e 100644
--- a/fs/btrfs/dev-replace.h
+++ b/fs/btrfs/dev-replace.h
@@ -11,7 +11,6 @@ struct btrfs_ioctl_dev_replace_args;
int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info);
int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
-void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info);
int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_dev_replace_args *args);
int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
@@ -28,12 +27,5 @@ void btrfs_dev_replace_read_unlock(struct btrfs_dev_replace *dev_replace);
void btrfs_dev_replace_write_lock(struct btrfs_dev_replace *dev_replace);
void btrfs_dev_replace_write_unlock(struct btrfs_dev_replace *dev_replace);
void btrfs_dev_replace_set_lock_blocking(struct btrfs_dev_replace *dev_replace);
-void btrfs_dev_replace_clear_lock_blocking(
- struct btrfs_dev_replace *dev_replace);
-
-static inline void btrfs_dev_replace_stats_inc(atomic64_t *stat_value)
-{
- atomic64_inc(stat_value);
-}
#endif
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index a678b07fcf01..8de74d835dba 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -105,13 +105,13 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
* to use for the second index (if one is created).
* Will return 0 or -ENOMEM
*/
-int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
- *root, const char *name, int name_len,
- struct btrfs_inode *dir, struct btrfs_key *location,
- u8 type, u64 index)
+int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const char *name,
+ int name_len, struct btrfs_inode *dir,
+ struct btrfs_key *location, u8 type, u64 index)
{
int ret = 0;
int ret2 = 0;
+ struct btrfs_root *root = dir->root;
struct btrfs_path *path;
struct btrfs_dir_item *dir_item;
struct extent_buffer *leaf;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 05dc3c17cb62..b0ab41da91d1 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -125,8 +125,8 @@ struct async_submit_bio {
* Different roots are used for different purposes and may nest inside each
* other and they require separate keysets. As lockdep keys should be
* static, assign keysets according to the purpose of the root as indicated
- * by btrfs_root->objectid. This ensures that all special purpose roots
- * have separate keysets.
+ * by btrfs_root->root_key.objectid. This ensures that all special purpose
+ * roots have separate keysets.
*
* Lock-nesting across peer nodes is always done with the immediate parent
* node locked thus preventing deadlock. As lockdep doesn't know this, use
@@ -1148,7 +1148,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
root->state = 0;
root->orphan_cleanup_state = 0;
- root->objectid = objectid;
root->last_trans = 0;
root->highest_objectid = 0;
root->nr_delalloc_inodes = 0;
@@ -2156,9 +2155,8 @@ static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
{
mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
rwlock_init(&fs_info->dev_replace.lock);
- atomic_set(&fs_info->dev_replace.read_locks, 0);
atomic_set(&fs_info->dev_replace.blocking_readers, 0);
- init_waitqueue_head(&fs_info->replace_wait);
+ init_waitqueue_head(&fs_info->dev_replace.replace_wait);
init_waitqueue_head(&fs_info->dev_replace.read_lock_wq);
}
@@ -2648,7 +2646,8 @@ int open_ctree(struct super_block *sb,
goto fail_dirty_metadata_bytes;
}
- ret = percpu_counter_init(&fs_info->bio_counter, 0, GFP_KERNEL);
+ ret = percpu_counter_init(&fs_info->dev_replace.bio_counter, 0,
+ GFP_KERNEL);
if (ret) {
err = ret;
goto fail_delalloc_bytes;
@@ -3309,7 +3308,7 @@ fail_iput:
iput(fs_info->btree_inode);
fail_bio_counter:
- percpu_counter_destroy(&fs_info->bio_counter);
+ percpu_counter_destroy(&fs_info->dev_replace.bio_counter);
fail_delalloc_bytes:
percpu_counter_destroy(&fs_info->delalloc_bytes);
fail_dirty_metadata_bytes:
@@ -3977,6 +3976,7 @@ void close_ctree(struct btrfs_fs_info *fs_info)
kthread_stop(fs_info->transaction_kthread);
kthread_stop(fs_info->cleaner_kthread);
+ ASSERT(list_empty(&fs_info->delayed_iputs));
set_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags);
btrfs_free_qgroup_config(fs_info);
@@ -4018,7 +4018,7 @@ void close_ctree(struct btrfs_fs_info *fs_info)
percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
percpu_counter_destroy(&fs_info->delalloc_bytes);
- percpu_counter_destroy(&fs_info->bio_counter);
+ percpu_counter_destroy(&fs_info->dev_replace.bio_counter);
cleanup_srcu_struct(&fs_info->subvol_srcu);
btrfs_free_stripe_hash_table(fs_info);
@@ -4204,7 +4204,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
return ret;
}
- while ((node = rb_first(&delayed_refs->href_root)) != NULL) {
+ while ((node = rb_first_cached(&delayed_refs->href_root)) != NULL) {
struct btrfs_delayed_ref_head *head;
struct rb_node *n;
bool pin_bytes = false;
@@ -4222,11 +4222,11 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
continue;
}
spin_lock(&head->lock);
- while ((n = rb_first(&head->ref_tree)) != NULL) {
+ while ((n = rb_first_cached(&head->ref_tree)) != NULL) {
ref = rb_entry(n, struct btrfs_delayed_ref_node,
ref_node);
ref->in_tree = 0;
- rb_erase(&ref->ref_node, &head->ref_tree);
+ rb_erase_cached(&ref->ref_node, &head->ref_tree);
RB_CLEAR_NODE(&ref->ref_node);
if (!list_empty(&ref->add_list))
list_del(&ref->add_list);
@@ -4240,7 +4240,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
if (head->processing == 0)
delayed_refs->num_heads_ready--;
atomic_dec(&delayed_refs->num_entries);
- rb_erase(&head->href_node, &delayed_refs->href_root);
+ rb_erase_cached(&head->href_node, &delayed_refs->href_root);
RB_CLEAR_NODE(&head->href_node);
spin_unlock(&head->lock);
spin_unlock(&delayed_refs->lock);
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 1f3755b3a37a..ddf28ecf17f9 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -33,7 +33,7 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
type = FILEID_BTRFS_WITHOUT_PARENT;
fid->objectid = btrfs_ino(BTRFS_I(inode));
- fid->root_objectid = BTRFS_I(inode)->root->objectid;
+ fid->root_objectid = BTRFS_I(inode)->root->root_key.objectid;
fid->gen = inode->i_generation;
if (parent) {
@@ -41,7 +41,7 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
fid->parent_objectid = BTRFS_I(parent)->location.objectid;
fid->parent_gen = parent->i_generation;
- parent_root_id = BTRFS_I(parent)->root->objectid;
+ parent_root_id = BTRFS_I(parent)->root->root_key.objectid;
if (parent_root_id != fid->root_objectid) {
fid->parent_root_objectid = parent_root_id;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 2d9074295d7f..a1febf155747 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2366,6 +2366,9 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
insert_reserved);
else
BUG();
+ if (ret && insert_reserved)
+ btrfs_pin_extent(trans->fs_info, node->bytenr,
+ node->num_bytes, 1);
return ret;
}
@@ -2374,7 +2377,7 @@ select_delayed_ref(struct btrfs_delayed_ref_head *head)
{
struct btrfs_delayed_ref_node *ref;
- if (RB_EMPTY_ROOT(&head->ref_tree))
+ if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
return NULL;
/*
@@ -2387,7 +2390,7 @@ select_delayed_ref(struct btrfs_delayed_ref_head *head)
return list_first_entry(&head->ref_add_list,
struct btrfs_delayed_ref_node, add_list);
- ref = rb_entry(rb_first(&head->ref_tree),
+ ref = rb_entry(rb_first_cached(&head->ref_tree),
struct btrfs_delayed_ref_node, ref_node);
ASSERT(list_empty(&ref->add_list));
return ref;
@@ -2448,13 +2451,13 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
spin_unlock(&head->lock);
spin_lock(&delayed_refs->lock);
spin_lock(&head->lock);
- if (!RB_EMPTY_ROOT(&head->ref_tree) || head->extent_op) {
+ if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) {
spin_unlock(&head->lock);
spin_unlock(&delayed_refs->lock);
return 1;
}
delayed_refs->num_heads--;
- rb_erase(&head->href_node, &delayed_refs->href_root);
+ rb_erase_cached(&head->href_node, &delayed_refs->href_root);
RB_CLEAR_NODE(&head->href_node);
spin_unlock(&head->lock);
spin_unlock(&delayed_refs->lock);
@@ -2502,102 +2505,66 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
return 0;
}
-/*
- * Returns 0 on success or if called with an already aborted transaction.
- * Returns -ENOMEM or -EIO on failure and will abort the transaction.
- */
-static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
- unsigned long nr)
+static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
+ struct btrfs_trans_handle *trans)
+{
+ struct btrfs_delayed_ref_root *delayed_refs =
+ &trans->transaction->delayed_refs;
+ struct btrfs_delayed_ref_head *head = NULL;
+ int ret;
+
+ spin_lock(&delayed_refs->lock);
+ head = btrfs_select_ref_head(delayed_refs);
+ if (!head) {
+ spin_unlock(&delayed_refs->lock);
+ return head;
+ }
+
+ /*
+ * Grab the lock that says we are going to process all the refs for
+ * this head
+ */
+ ret = btrfs_delayed_ref_lock(delayed_refs, head);
+ spin_unlock(&delayed_refs->lock);
+
+ /*
+ * We may have dropped the spin lock to get the head mutex lock, and
+ * that might have given someone else time to free the head. If that's
+ * true, it has been removed from our list and we can move on.
+ */
+ if (ret == -EAGAIN)
+ head = ERR_PTR(-EAGAIN);
+
+ return head;
+}
+
+static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_head *locked_ref,
+ unsigned long *run_refs)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_delayed_ref_root *delayed_refs;
- struct btrfs_delayed_ref_node *ref;
- struct btrfs_delayed_ref_head *locked_ref = NULL;
struct btrfs_delayed_extent_op *extent_op;
- ktime_t start = ktime_get();
- int ret;
- unsigned long count = 0;
- unsigned long actual_count = 0;
+ struct btrfs_delayed_ref_node *ref;
int must_insert_reserved = 0;
+ int ret;
delayed_refs = &trans->transaction->delayed_refs;
- while (1) {
- if (!locked_ref) {
- if (count >= nr)
- break;
- spin_lock(&delayed_refs->lock);
- locked_ref = btrfs_select_ref_head(trans);
- if (!locked_ref) {
- spin_unlock(&delayed_refs->lock);
- break;
- }
-
- /* grab the lock that says we are going to process
- * all the refs for this head */
- ret = btrfs_delayed_ref_lock(trans, locked_ref);
- spin_unlock(&delayed_refs->lock);
- /*
- * we may have dropped the spin lock to get the head
- * mutex lock, and that might have given someone else
- * time to free the head. If that's true, it has been
- * removed from our list and we can move on.
- */
- if (ret == -EAGAIN) {
- locked_ref = NULL;
- count++;
- continue;
- }
- }
+ lockdep_assert_held(&locked_ref->mutex);
+ lockdep_assert_held(&locked_ref->lock);
- /*
- * We need to try and merge add/drops of the same ref since we
- * can run into issues with relocate dropping the implicit ref
- * and then it being added back again before the drop can
- * finish. If we merged anything we need to re-loop so we can
- * get a good ref.
- * Or we can get node references of the same type that weren't
- * merged when created due to bumps in the tree mod seq, and
- * we need to merge them to prevent adding an inline extent
- * backref before dropping it (triggering a BUG_ON at
- * insert_inline_extent_backref()).
- */
- spin_lock(&locked_ref->lock);
- btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
-
- ref = select_delayed_ref(locked_ref);
-
- if (ref && ref->seq &&
+ while ((ref = select_delayed_ref(locked_ref))) {
+ if (ref->seq &&
btrfs_check_delayed_seq(fs_info, ref->seq)) {
spin_unlock(&locked_ref->lock);
unselect_delayed_ref_head(delayed_refs, locked_ref);
- locked_ref = NULL;
- cond_resched();
- count++;
- continue;
- }
-
- /*
- * We're done processing refs in this ref_head, clean everything
- * up and move on to the next ref_head.
- */
- if (!ref) {
- ret = cleanup_ref_head(trans, locked_ref);
- if (ret > 0 ) {
- /* We dropped our lock, we need to loop. */
- ret = 0;
- continue;
- } else if (ret) {
- return ret;
- }
- locked_ref = NULL;
- count++;
- continue;
+ return -EAGAIN;
}
- actual_count++;
+ (*run_refs)++;
ref->in_tree = 0;
- rb_erase(&ref->ref_node, &locked_ref->ref_tree);
+ rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree);
RB_CLEAR_NODE(&ref->ref_node);
if (!list_empty(&ref->add_list))
list_del(&ref->add_list);
@@ -2619,8 +2586,8 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
atomic_dec(&delayed_refs->num_entries);
/*
- * Record the must-insert_reserved flag before we drop the spin
- * lock.
+ * Record the must_insert_reserved flag before we drop the
+ * spin lock.
*/
must_insert_reserved = locked_ref->must_insert_reserved;
locked_ref->must_insert_reserved = 0;
@@ -2642,10 +2609,90 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
}
btrfs_put_delayed_ref(ref);
- count++;
cond_resched();
+
+ spin_lock(&locked_ref->lock);
+ btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
}
+ return 0;
+}
+
+/*
+ * Returns 0 on success or if called with an already aborted transaction.
+ * Returns -ENOMEM or -EIO on failure and will abort the transaction.
+ */
+static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
+ unsigned long nr)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_delayed_ref_head *locked_ref = NULL;
+ ktime_t start = ktime_get();
+ int ret;
+ unsigned long count = 0;
+ unsigned long actual_count = 0;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ do {
+ if (!locked_ref) {
+ locked_ref = btrfs_obtain_ref_head(trans);
+ if (IS_ERR_OR_NULL(locked_ref)) {
+ if (PTR_ERR(locked_ref) == -EAGAIN) {
+ continue;
+ } else {
+ break;
+ }
+ }
+ count++;
+ }
+ /*
+ * We need to try and merge add/drops of the same ref since we
+ * can run into issues with relocate dropping the implicit ref
+ * and then it being added back again before the drop can
+ * finish. If we merged anything we need to re-loop so we can
+ * get a good ref.
+ * Or we can get node references of the same type that weren't
+ * merged when created due to bumps in the tree mod seq, and
+ * we need to merge them to prevent adding an inline extent
+ * backref before dropping it (triggering a BUG_ON at
+ * insert_inline_extent_backref()).
+ */
+ spin_lock(&locked_ref->lock);
+ btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
+
+ ret = btrfs_run_delayed_refs_for_head(trans, locked_ref,
+ &actual_count);
+ if (ret < 0 && ret != -EAGAIN) {
+ /*
+ * Error, btrfs_run_delayed_refs_for_head already
+ * unlocked everything so just bail out
+ */
+ return ret;
+ } else if (!ret) {
+ /*
+ * Success, perform the usual cleanup of a processed
+ * head
+ */
+ ret = cleanup_ref_head(trans, locked_ref);
+ if (ret > 0 ) {
+ /* We dropped our lock, we need to loop. */
+ ret = 0;
+ continue;
+ } else if (ret) {
+ return ret;
+ }
+ }
+
+ /*
+ * Either success case or btrfs_run_delayed_refs_for_head
+ * returned -EAGAIN, meaning we need to select another head
+ */
+
+ locked_ref = NULL;
+ cond_resched();
+ } while ((nr != -1 && count < nr) || locked_ref);
+
/*
* We don't want to include ref heads since we can have empty ref heads
* and those will drastically skew our runtime down since we just do
@@ -2745,9 +2792,9 @@ u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes)
return num_csums;
}
-int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info)
+int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_block_rsv *global_rsv;
u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
u64 csum_bytes = trans->transaction->delayed_refs.pending_csums;
@@ -2782,8 +2829,7 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
return ret;
}
-int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info)
+int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans)
{
u64 num_entries =
atomic_read(&trans->transaction->delayed_refs.num_entries);
@@ -2791,14 +2837,14 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
u64 val;
smp_mb();
- avg_runtime = fs_info->avg_delayed_ref_runtime;
+ avg_runtime = trans->fs_info->avg_delayed_ref_runtime;
val = num_entries * avg_runtime;
if (val >= NSEC_PER_SEC)
return 1;
if (val >= NSEC_PER_SEC / 2)
return 2;
- return btrfs_check_space_for_delayed_refs(trans, fs_info);
+ return btrfs_check_space_for_delayed_refs(trans);
}
struct async_delayed_refs {
@@ -2911,7 +2957,6 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *head;
int ret;
int run_all = count == (unsigned long)-1;
- bool can_flush_pending_bgs = trans->can_flush_pending_bgs;
/* We'll clean this up in btrfs_cleanup_transaction */
if (trans->aborted)
@@ -2928,7 +2973,6 @@ again:
#ifdef SCRAMBLE_DELAYED_REFS
delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
#endif
- trans->can_flush_pending_bgs = false;
ret = __btrfs_run_delayed_refs(trans, count);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
@@ -2940,7 +2984,7 @@ again:
btrfs_create_pending_block_groups(trans);
spin_lock(&delayed_refs->lock);
- node = rb_first(&delayed_refs->href_root);
+ node = rb_first_cached(&delayed_refs->href_root);
if (!node) {
spin_unlock(&delayed_refs->lock);
goto out;
@@ -2959,7 +3003,6 @@ again:
goto again;
}
out:
- trans->can_flush_pending_bgs = can_flush_pending_bgs;
return 0;
}
@@ -3040,7 +3083,8 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
* XXX: We should replace this with a proper search function in the
* future.
*/
- for (node = rb_first(&head->ref_tree); node; node = rb_next(node)) {
+ for (node = rb_first_cached(&head->ref_tree); node;
+ node = rb_next(node)) {
ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
/* If it's a shared ref we know a cross reference exists */
if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
@@ -3139,7 +3183,6 @@ int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
{
struct btrfs_path *path;
int ret;
- int ret2;
path = btrfs_alloc_path();
if (!path)
@@ -3151,17 +3194,9 @@ int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
if (ret && ret != -ENOENT)
goto out;
- ret2 = check_delayed_ref(root, path, objectid,
- offset, bytenr);
- } while (ret2 == -EAGAIN);
-
- if (ret2 && ret2 != -ENOENT) {
- ret = ret2;
- goto out;
- }
+ ret = check_delayed_ref(root, path, objectid, offset, bytenr);
+ } while (ret == -EAGAIN);
- if (ret != -ENOENT || ret2 != -ENOENT)
- ret = 0;
out:
btrfs_free_path(path);
if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
@@ -4533,6 +4568,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
goto out;
} else {
ret = 1;
+ space_info->max_extent_size = 0;
}
space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
@@ -4554,11 +4590,9 @@ out:
* the block groups that were made dirty during the lifetime of the
* transaction.
*/
- if (trans->can_flush_pending_bgs &&
- trans->chunk_bytes_reserved >= (u64)SZ_2M) {
+ if (trans->chunk_bytes_reserved >= (u64)SZ_2M)
btrfs_create_pending_block_groups(trans);
- btrfs_trans_release_chunk_metadata(trans);
- }
+
return ret;
}
@@ -5284,7 +5318,7 @@ static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
}
static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
- u64 num_bytes, int update_size)
+ u64 num_bytes, bool update_size)
{
spin_lock(&block_rsv->lock);
block_rsv->reserved += num_bytes;
@@ -5316,7 +5350,7 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
global_rsv->full = 0;
spin_unlock(&global_rsv->lock);
- block_rsv_add_bytes(dest, num_bytes, 1);
+ block_rsv_add_bytes(dest, num_bytes, true);
return 0;
}
@@ -5479,7 +5513,7 @@ static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src,
struct btrfs_block_rsv *dst, u64 num_bytes,
- int update_size)
+ bool update_size)
{
int ret;
@@ -5539,10 +5573,8 @@ int btrfs_block_rsv_add(struct btrfs_root *root,
return 0;
ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
- if (!ret) {
- block_rsv_add_bytes(block_rsv, num_bytes, 1);
- return 0;
- }
+ if (!ret)
+ block_rsv_add_bytes(block_rsv, num_bytes, true);
return ret;
}
@@ -5587,7 +5619,7 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
if (!ret) {
- block_rsv_add_bytes(block_rsv, num_bytes, 0);
+ block_rsv_add_bytes(block_rsv, num_bytes, false);
return 0;
}
@@ -5629,7 +5661,7 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
return ret;
ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
if (!ret) {
- block_rsv_add_bytes(block_rsv, num_bytes, 0);
+ block_rsv_add_bytes(block_rsv, num_bytes, false);
trace_btrfs_space_reservation(root->fs_info, "delalloc",
btrfs_ino(inode), num_bytes, 1);
@@ -5835,7 +5867,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
BTRFS_RESERVE_FLUSH_ALL);
if (ret == -ENOSPC && use_global_rsv)
- ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, 1);
+ ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, true);
if (ret && qgroup_num_bytes)
btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
@@ -6399,10 +6431,6 @@ static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
} else {
cache->reserved += num_bytes;
space_info->bytes_reserved += num_bytes;
-
- trace_btrfs_space_reservation(cache->fs_info,
- "space_info", space_info->flags,
- ram_bytes, 0);
space_info->bytes_may_use -= ram_bytes;
if (delalloc)
cache->delalloc_bytes += num_bytes;
@@ -6424,11 +6452,10 @@ static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
* reserve set to 0 in order to clear the reservation.
*/
-static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
- u64 num_bytes, int delalloc)
+static void btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
+ u64 num_bytes, int delalloc)
{
struct btrfs_space_info *space_info = cache->space_info;
- int ret = 0;
spin_lock(&space_info->lock);
spin_lock(&cache->lock);
@@ -6436,12 +6463,12 @@ static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
space_info->bytes_readonly += num_bytes;
cache->reserved -= num_bytes;
space_info->bytes_reserved -= num_bytes;
+ space_info->max_extent_size = 0;
if (delalloc)
cache->delalloc_bytes -= num_bytes;
spin_unlock(&cache->lock);
spin_unlock(&space_info->lock);
- return ret;
}
void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info)
{
@@ -6925,7 +6952,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
goto out_delayed_unlock;
spin_lock(&head->lock);
- if (!RB_EMPTY_ROOT(&head->ref_tree))
+ if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root))
goto out;
if (head->extent_op) {
@@ -6946,7 +6973,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
* at this point we have a head with no other entries. Go
* ahead and process it.
*/
- rb_erase(&head->href_node, &delayed_refs->href_root);
+ rb_erase_cached(&head->href_node, &delayed_refs->href_root);
RB_CLEAR_NODE(&head->href_node);
atomic_dec(&delayed_refs->num_entries);
@@ -7233,6 +7260,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *block_group = NULL;
u64 search_start = 0;
u64 max_extent_size = 0;
+ u64 max_free_space = 0;
u64 empty_cluster = 0;
struct btrfs_space_info *space_info;
int loop = 0;
@@ -7528,8 +7556,8 @@ unclustered_alloc:
spin_lock(&ctl->tree_lock);
if (ctl->free_space <
num_bytes + empty_cluster + empty_size) {
- if (ctl->free_space > max_extent_size)
- max_extent_size = ctl->free_space;
+ max_free_space = max(max_free_space,
+ ctl->free_space);
spin_unlock(&ctl->tree_lock);
goto loop;
}
@@ -7696,6 +7724,8 @@ loop:
}
out:
if (ret == -ENOSPC) {
+ if (!max_extent_size)
+ max_extent_size = max_free_space;
spin_lock(&space_info->lock);
space_info->max_extent_size = max_extent_size;
spin_unlock(&space_info->lock);
@@ -7977,21 +8007,14 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
}
path = btrfs_alloc_path();
- if (!path) {
- btrfs_free_and_pin_reserved_extent(fs_info,
- extent_key.objectid,
- fs_info->nodesize);
+ if (!path)
return -ENOMEM;
- }
path->leave_spinning = 1;
ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
&extent_key, size);
if (ret) {
btrfs_free_path(path);
- btrfs_free_and_pin_reserved_extent(fs_info,
- extent_key.objectid,
- fs_info->nodesize);
return ret;
}
@@ -8119,6 +8142,19 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
if (IS_ERR(buf))
return buf;
+ /*
+ * Extra safety check in case the extent tree is corrupted and extent
+ * allocator chooses to use a tree block which is already used and
+ * locked.
+ */
+ if (buf->lock_owner == current->pid) {
+ btrfs_err_rl(fs_info,
+"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
+ buf->start, btrfs_header_owner(buf), current->pid);
+ free_extent_buffer(buf);
+ return ERR_PTR(-EUCLEAN);
+ }
+
btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
btrfs_tree_lock(buf);
clean_tree_block(fs_info, buf);
@@ -8215,7 +8251,7 @@ try_reserve:
static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv, u32 blocksize)
{
- block_rsv_add_bytes(block_rsv, blocksize, 0);
+ block_rsv_add_bytes(block_rsv, blocksize, false);
block_rsv_release_bytes(fs_info, block_rsv, NULL, 0, NULL);
}
@@ -8642,7 +8678,13 @@ skip:
parent = 0;
}
- if (need_account) {
+ /*
+ * Reloc tree doesn't contribute to qgroup numbers, and we have
+ * already accounted them at merge time (replace_path),
+ * thus we could skip expensive subtree trace here.
+ */
+ if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
+ need_account) {
ret = btrfs_qgroup_trace_subtree(trans, next,
generation, level - 1);
if (ret) {
@@ -8763,15 +8805,14 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
if (eb == root->node) {
if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
parent = eb->start;
- else
- BUG_ON(root->root_key.objectid !=
- btrfs_header_owner(eb));
+ else if (root->root_key.objectid != btrfs_header_owner(eb))
+ goto owner_mismatch;
} else {
if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
parent = path->nodes[level + 1]->start;
- else
- BUG_ON(root->root_key.objectid !=
- btrfs_header_owner(path->nodes[level + 1]));
+ else if (root->root_key.objectid !=
+ btrfs_header_owner(path->nodes[level + 1]))
+ goto owner_mismatch;
}
btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
@@ -8779,6 +8820,11 @@ out:
wc->refs[level] = 0;
wc->flags[level] = 0;
return 0;
+
+owner_mismatch:
+ btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu",
+ btrfs_header_owner(eb), root->root_key.objectid);
+ return -EUCLEAN;
}
static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
@@ -8832,6 +8878,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
ret = walk_up_proc(trans, root, path, wc);
if (ret > 0)
return 0;
+ if (ret < 0)
+ return ret;
if (path->locks[level]) {
btrfs_tree_unlock_rw(path->nodes[level],
@@ -8875,7 +8923,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
int level;
bool root_dropped = false;
- btrfs_debug(fs_info, "Drop subvolume %llu", root->objectid);
+ btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid);
path = btrfs_alloc_path();
if (!path) {
@@ -9613,6 +9661,7 @@ void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
block_group = btrfs_lookup_first_block_group(info, last);
while (block_group) {
+ wait_block_group_cache_done(block_group);
spin_lock(&block_group->lock);
if (block_group->iref)
break;
@@ -10074,15 +10123,19 @@ error:
void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_block_group_cache *block_group, *tmp;
+ struct btrfs_block_group_cache *block_group;
struct btrfs_root *extent_root = fs_info->extent_root;
struct btrfs_block_group_item item;
struct btrfs_key key;
int ret = 0;
- bool can_flush_pending_bgs = trans->can_flush_pending_bgs;
- trans->can_flush_pending_bgs = false;
- list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
+ if (!trans->can_flush_pending_bgs)
+ return;
+
+ while (!list_empty(&trans->new_bgs)) {
+ block_group = list_first_entry(&trans->new_bgs,
+ struct btrfs_block_group_cache,
+ bg_list);
if (ret)
goto next;
@@ -10103,7 +10156,7 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
next:
list_del_init(&block_group->bg_list);
}
- trans->can_flush_pending_bgs = can_flush_pending_bgs;
+ btrfs_trans_release_chunk_metadata(trans);
}
int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
@@ -10753,14 +10806,16 @@ int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
* We don't want a transaction for this since the discard may take a
* substantial amount of time. We don't require that a transaction be
* running, but we do need to take a running transaction into account
- * to ensure that we're not discarding chunks that were released in
- * the current transaction.
+ * to ensure that we're not discarding chunks that were released or
+ * allocated in the current transaction.
*
* Holding the chunks lock will prevent other threads from allocating
* or releasing chunks, but it won't prevent a running transaction
* from committing and releasing the memory that the pending chunks
* list head uses. For that, we need to take a reference to the
- * transaction.
+ * transaction and hold the commit root sem. We only need to hold
+ * it while performing the free space search since we have already
+ * held back allocations.
*/
static int btrfs_trim_free_extents(struct btrfs_device *device,
u64 minlen, u64 *trimmed)
@@ -10770,6 +10825,10 @@ static int btrfs_trim_free_extents(struct btrfs_device *device,
*trimmed = 0;
+ /* Discard not supported = nothing to do. */
+ if (!blk_queue_discard(bdev_get_queue(device->bdev)))
+ return 0;
+
/* Not writeable = nothing to do. */
if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
return 0;
@@ -10787,9 +10846,13 @@ static int btrfs_trim_free_extents(struct btrfs_device *device,
ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
if (ret)
- return ret;
+ break;
- down_read(&fs_info->commit_root_sem);
+ ret = down_read_killable(&fs_info->commit_root_sem);
+ if (ret) {
+ mutex_unlock(&fs_info->chunk_mutex);
+ break;
+ }
spin_lock(&fs_info->trans_lock);
trans = fs_info->running_transaction;
@@ -10797,13 +10860,17 @@ static int btrfs_trim_free_extents(struct btrfs_device *device,
refcount_inc(&trans->use_count);
spin_unlock(&fs_info->trans_lock);
+ if (!trans)
+ up_read(&fs_info->commit_root_sem);
+
ret = find_free_dev_extent_start(trans, device, minlen, start,
&start, &len);
- if (trans)
+ if (trans) {
+ up_read(&fs_info->commit_root_sem);
btrfs_put_transaction(trans);
+ }
if (ret) {
- up_read(&fs_info->commit_root_sem);
mutex_unlock(&fs_info->chunk_mutex);
if (ret == -ENOSPC)
ret = 0;
@@ -10811,7 +10878,6 @@ static int btrfs_trim_free_extents(struct btrfs_device *device,
}
ret = btrfs_issue_discard(device->bdev, start, len, &bytes);
- up_read(&fs_info->commit_root_sem);
mutex_unlock(&fs_info->chunk_mutex);
if (ret)
@@ -10831,6 +10897,15 @@ static int btrfs_trim_free_extents(struct btrfs_device *device,
return ret;
}
+/*
+ * Trim the whole filesystem by:
+ * 1) trimming the free space in each block group
+ * 2) trimming the unallocated space on each device
+ *
+ * This will also continue trimming even if a block group or device encounters
+ * an error. The return value will be the last error, or 0 if nothing bad
+ * happens.
+ */
int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
{
struct btrfs_block_group_cache *cache = NULL;
@@ -10840,18 +10915,14 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
u64 start;
u64 end;
u64 trimmed = 0;
- u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
+ u64 bg_failed = 0;
+ u64 dev_failed = 0;
+ int bg_ret = 0;
+ int dev_ret = 0;
int ret = 0;
- /*
- * try to trim all FS space, our block group may start from non-zero.
- */
- if (range->len == total_bytes)
- cache = btrfs_lookup_first_block_group(fs_info, range->start);
- else
- cache = btrfs_lookup_block_group(fs_info, range->start);
-
- while (cache) {
+ cache = btrfs_lookup_first_block_group(fs_info, range->start);
+ for (; cache; cache = next_block_group(fs_info, cache)) {
if (cache->key.objectid >= (range->start + range->len)) {
btrfs_put_block_group(cache);
break;
@@ -10865,13 +10936,15 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
if (!block_group_cache_done(cache)) {
ret = cache_block_group(cache, 0);
if (ret) {
- btrfs_put_block_group(cache);
- break;
+ bg_failed++;
+ bg_ret = ret;
+ continue;
}
ret = wait_block_group_cache_done(cache);
if (ret) {
- btrfs_put_block_group(cache);
- break;
+ bg_failed++;
+ bg_ret = ret;
+ continue;
}
}
ret = btrfs_trim_block_group(cache,
@@ -10882,28 +10955,40 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
trimmed += group_trimmed;
if (ret) {
- btrfs_put_block_group(cache);
- break;
+ bg_failed++;
+ bg_ret = ret;
+ continue;
}
}
-
- cache = next_block_group(fs_info, cache);
}
+ if (bg_failed)
+ btrfs_warn(fs_info,
+ "failed to trim %llu block group(s), last error %d",
+ bg_failed, bg_ret);
mutex_lock(&fs_info->fs_devices->device_list_mutex);
- devices = &fs_info->fs_devices->alloc_list;
- list_for_each_entry(device, devices, dev_alloc_list) {
+ devices = &fs_info->fs_devices->devices;
+ list_for_each_entry(device, devices, dev_list) {
ret = btrfs_trim_free_extents(device, range->minlen,
&group_trimmed);
- if (ret)
+ if (ret) {
+ dev_failed++;
+ dev_ret = ret;
break;
+ }
trimmed += group_trimmed;
}
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ if (dev_failed)
+ btrfs_warn(fs_info,
+ "failed to trim %llu device(s), last error %d",
+ dev_failed, dev_ret);
range->len = trimmed;
- return ret;
+ if (bg_ret)
+ return bg_ret;
+ return dev_ret;
}
/*
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 4dd6faab02bb..d228f706ff3e 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1424,20 +1424,15 @@ int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
struct extent_state **cached_state)
{
struct extent_state *state;
- struct rb_node *n;
int ret = 1;
spin_lock(&tree->lock);
if (cached_state && *cached_state) {
state = *cached_state;
if (state->end == start - 1 && extent_state_in_tree(state)) {
- n = rb_next(&state->rb_node);
- while (n) {
- state = rb_entry(n, struct extent_state,
- rb_node);
+ while ((state = next_state(state)) != NULL) {
if (state->state & bits)
goto got_it;
- n = rb_next(n);
}
free_extent_state(*cached_state);
*cached_state = NULL;
@@ -1568,7 +1563,7 @@ static noinline int lock_delalloc_pages(struct inode *inode,
*
* 1 is returned if we find something, 0 if nothing was in the tree
*/
-STATIC u64 find_lock_delalloc_range(struct inode *inode,
+static noinline_for_stack u64 find_lock_delalloc_range(struct inode *inode,
struct extent_io_tree *tree,
struct page *locked_page, u64 *start,
u64 *end, u64 max_bytes)
@@ -1648,6 +1643,17 @@ out_failed:
return found;
}
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+u64 btrfs_find_lock_delalloc_range(struct inode *inode,
+ struct extent_io_tree *tree,
+ struct page *locked_page, u64 *start,
+ u64 *end, u64 max_bytes)
+{
+ return find_lock_delalloc_range(inode, tree, locked_page, start, end,
+ max_bytes);
+}
+#endif
+
static int __process_pages_contig(struct address_space *mapping,
struct page *locked_page,
pgoff_t start_index, pgoff_t end_index,
@@ -3778,7 +3784,7 @@ int btree_write_cache_pages(struct address_space *mapping,
pgoff_t index;
pgoff_t end; /* Inclusive */
int scanned = 0;
- int tag;
+ xa_mark_t tag;
pagevec_init(&pvec);
if (wbc->range_cyclic) {
@@ -3903,7 +3909,7 @@ static int extent_write_cache_pages(struct address_space *mapping,
pgoff_t done_index;
int range_whole = 0;
int scanned = 0;
- int tag;
+ xa_mark_t tag;
/*
* We have to hold onto the inode so that ordered extents can do their
@@ -5153,11 +5159,9 @@ void clear_extent_buffer_dirty(struct extent_buffer *eb)
clear_page_dirty_for_io(page);
xa_lock_irq(&page->mapping->i_pages);
- if (!PageDirty(page)) {
- radix_tree_tag_clear(&page->mapping->i_pages,
- page_index(page),
- PAGECACHE_TAG_DIRTY);
- }
+ if (!PageDirty(page))
+ __xa_clear_mark(&page->mapping->i_pages,
+ page_index(page), PAGECACHE_TAG_DIRTY);
xa_unlock_irq(&page->mapping->i_pages);
ClearPageError(page);
unlock_page(page);
@@ -5165,11 +5169,11 @@ void clear_extent_buffer_dirty(struct extent_buffer *eb)
WARN_ON(atomic_read(&eb->refs) == 0);
}
-int set_extent_buffer_dirty(struct extent_buffer *eb)
+bool set_extent_buffer_dirty(struct extent_buffer *eb)
{
int i;
int num_pages;
- int was_dirty = 0;
+ bool was_dirty;
check_buffer_tree_ref(eb);
@@ -5179,8 +5183,15 @@ int set_extent_buffer_dirty(struct extent_buffer *eb)
WARN_ON(atomic_read(&eb->refs) == 0);
WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
+ if (!was_dirty)
+ for (i = 0; i < num_pages; i++)
+ set_page_dirty(eb->pages[i]);
+
+#ifdef CONFIG_BTRFS_DEBUG
for (i = 0; i < num_pages; i++)
- set_page_dirty(eb->pages[i]);
+ ASSERT(PageDirty(eb->pages[i]));
+#endif
+
return was_dirty;
}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index b4d03e677e1d..369daa5d4f73 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -479,7 +479,7 @@ void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start,
void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start,
unsigned long pos, unsigned long len);
void clear_extent_buffer_dirty(struct extent_buffer *eb);
-int set_extent_buffer_dirty(struct extent_buffer *eb);
+bool set_extent_buffer_dirty(struct extent_buffer *eb);
void set_extent_buffer_uptodate(struct extent_buffer *eb);
void clear_extent_buffer_uptodate(struct extent_buffer *eb);
int extent_buffer_under_io(struct extent_buffer *eb);
@@ -546,7 +546,7 @@ int free_io_failure(struct extent_io_tree *failure_tree,
struct extent_io_tree *io_tree,
struct io_failure_record *rec);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-noinline u64 find_lock_delalloc_range(struct inode *inode,
+u64 btrfs_find_lock_delalloc_range(struct inode *inode,
struct extent_io_tree *tree,
struct page *locked_page, u64 *start,
u64 *end, u64 max_bytes);
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 6648d55e5339..7eea8b6e2cd3 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -34,7 +34,7 @@ void __cold extent_map_exit(void)
*/
void extent_map_tree_init(struct extent_map_tree *tree)
{
- tree->map = RB_ROOT;
+ tree->map = RB_ROOT_CACHED;
INIT_LIST_HEAD(&tree->modified_extents);
rwlock_init(&tree->lock);
}
@@ -90,24 +90,27 @@ static u64 range_end(u64 start, u64 len)
return start + len;
}
-static int tree_insert(struct rb_root *root, struct extent_map *em)
+static int tree_insert(struct rb_root_cached *root, struct extent_map *em)
{
- struct rb_node **p = &root->rb_node;
+ struct rb_node **p = &root->rb_root.rb_node;
struct rb_node *parent = NULL;
struct extent_map *entry = NULL;
struct rb_node *orig_parent = NULL;
u64 end = range_end(em->start, em->len);
+ bool leftmost = true;
while (*p) {
parent = *p;
entry = rb_entry(parent, struct extent_map, rb_node);
- if (em->start < entry->start)
+ if (em->start < entry->start) {
p = &(*p)->rb_left;
- else if (em->start >= extent_map_end(entry))
+ } else if (em->start >= extent_map_end(entry)) {
p = &(*p)->rb_right;
- else
+ leftmost = false;
+ } else {
return -EEXIST;
+ }
}
orig_parent = parent;
@@ -130,7 +133,7 @@ static int tree_insert(struct rb_root *root, struct extent_map *em)
return -EEXIST;
rb_link_node(&em->rb_node, orig_parent, p);
- rb_insert_color(&em->rb_node, root);
+ rb_insert_color_cached(&em->rb_node, root, leftmost);
return 0;
}
@@ -242,7 +245,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
em->mod_start = merge->mod_start;
em->generation = max(em->generation, merge->generation);
- rb_erase(&merge->rb_node, &tree->map);
+ rb_erase_cached(&merge->rb_node, &tree->map);
RB_CLEAR_NODE(&merge->rb_node);
free_extent_map(merge);
}
@@ -254,7 +257,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
if (rb && mergable_maps(em, merge)) {
em->len += merge->len;
em->block_len += merge->block_len;
- rb_erase(&merge->rb_node, &tree->map);
+ rb_erase_cached(&merge->rb_node, &tree->map);
RB_CLEAR_NODE(&merge->rb_node);
em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start;
em->generation = max(em->generation, merge->generation);
@@ -367,7 +370,7 @@ __lookup_extent_mapping(struct extent_map_tree *tree,
struct rb_node *next = NULL;
u64 end = range_end(start, len);
- rb_node = __tree_search(&tree->map, start, &prev, &next);
+ rb_node = __tree_search(&tree->map.rb_root, start, &prev, &next);
if (!rb_node) {
if (prev)
rb_node = prev;
@@ -428,16 +431,13 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
* Removes @em from @tree. No reference counts are dropped, and no checks
* are done to see if the range is in use
*/
-int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
+void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
{
- int ret = 0;
-
WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
- rb_erase(&em->rb_node, &tree->map);
+ rb_erase_cached(&em->rb_node, &tree->map);
if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
list_del_init(&em->list);
RB_CLEAR_NODE(&em->rb_node);
- return ret;
}
void replace_extent_mapping(struct extent_map_tree *tree,
@@ -449,7 +449,7 @@ void replace_extent_mapping(struct extent_map_tree *tree,
ASSERT(extent_map_in_tree(cur));
if (!test_bit(EXTENT_FLAG_LOGGING, &cur->flags))
list_del_init(&cur->list);
- rb_replace_node(&cur->rb_node, &new->rb_node, &tree->map);
+ rb_replace_node_cached(&cur->rb_node, &new->rb_node, &tree->map);
RB_CLEAR_NODE(&cur->rb_node);
setup_extent_mapping(tree, new, modified);
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 25d985e7532a..31977ffd6190 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -49,7 +49,7 @@ struct extent_map {
};
struct extent_map_tree {
- struct rb_root map;
+ struct rb_root_cached map;
struct list_head modified_extents;
rwlock_t lock;
};
@@ -78,7 +78,7 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
u64 start, u64 len);
int add_extent_mapping(struct extent_map_tree *tree,
struct extent_map *em, int modified);
-int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em);
+void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em);
void replace_extent_mapping(struct extent_map_tree *tree,
struct extent_map *cur,
struct extent_map *new,
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 2be00e873e92..97c7a086f7bd 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -531,6 +531,14 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages,
end_of_last_block = start_pos + num_bytes - 1;
+ /*
+ * The pages may have already been dirty, clear out old accounting so
+ * we can set things up properly
+ */
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos, end_of_last_block,
+ EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, cached);
+
if (!btrfs_is_free_space_inode(BTRFS_I(inode))) {
if (start_pos >= isize &&
!(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)) {
@@ -1500,18 +1508,27 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
}
if (ordered)
btrfs_put_ordered_extent(ordered);
- clear_extent_bit(&inode->io_tree, start_pos, last_pos,
- EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
- 0, 0, cached_state);
+
*lockstart = start_pos;
*lockend = last_pos;
ret = 1;
}
+ /*
+ * It's possible the pages are dirty right now, but we don't want
+ * to clean them yet because copy_from_user may catch a page fault
+ * and we might have to fall back to one page at a time. If that
+ * happens, we'll unlock these pages and we'd have a window where
+ * reclaim could sneak in and drop the once-dirty page on the floor
+ * without writing it.
+ *
+ * We have the pages locked and the extent range locked, so there's
+ * no way someone can start IO on any dirty pages in this range.
+ *
+ * We'll call btrfs_dirty_pages() later on, and that will flip around
+ * delalloc bits and dirty the pages as required.
+ */
for (i = 0; i < num_pages; i++) {
- if (clear_page_dirty_for_io(pages[i]))
- account_page_redirty(pages[i]);
set_page_extent_mapped(pages[i]);
WARN_ON(!PageLocked(pages[i]));
}
@@ -2061,6 +2078,14 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
goto out;
inode_lock(inode);
+
+ /*
+ * We take the dio_sem here because the tree log stuff can race with
+ * lockless dio writes and get an extent map logged for an extent we
+ * never waited on. We need it this high up for lockdep reasons.
+ */
+ down_write(&BTRFS_I(inode)->dio_sem);
+
atomic_inc(&root->log_batch);
/*
@@ -2069,6 +2094,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
*/
ret = btrfs_wait_ordered_range(inode, start, len);
if (ret) {
+ up_write(&BTRFS_I(inode)->dio_sem);
inode_unlock(inode);
goto out;
}
@@ -2092,6 +2118,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* checked called fsync.
*/
ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err);
+ up_write(&BTRFS_I(inode)->dio_sem);
inode_unlock(inode);
goto out;
}
@@ -2110,6 +2137,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
+ up_write(&BTRFS_I(inode)->dio_sem);
inode_unlock(inode);
goto out;
}
@@ -2131,6 +2159,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* file again, but that will end up using the synchronization
* inside btrfs_sync_log to keep things safe.
*/
+ up_write(&BTRFS_I(inode)->dio_sem);
inode_unlock(inode);
/*
@@ -2544,7 +2573,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
}
ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
- min_size, 0);
+ min_size, false);
BUG_ON(ret);
trans->block_rsv = rsv;
@@ -2594,7 +2623,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
}
ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
- rsv, min_size, 0);
+ rsv, min_size, false);
BUG_ON(ret); /* shouldn't happen */
trans->block_rsv = rsv;
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 0adf38b00fa0..4ba0aedc878b 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -10,6 +10,7 @@
#include <linux/math64.h>
#include <linux/ratelimit.h>
#include <linux/error-injection.h>
+#include <linux/sched/mm.h>
#include "ctree.h"
#include "free-space-cache.h"
#include "transaction.h"
@@ -47,6 +48,7 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
struct btrfs_free_space_header *header;
struct extent_buffer *leaf;
struct inode *inode = NULL;
+ unsigned nofs_flag;
int ret;
key.objectid = BTRFS_FREE_SPACE_OBJECTID;
@@ -68,7 +70,13 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
btrfs_disk_key_to_cpu(&location, &disk_key);
btrfs_release_path(path);
+ /*
+ * We are often under a trans handle at this point, so we need to make
+ * sure NOFS is set to keep us from deadlocking.
+ */
+ nofs_flag = memalloc_nofs_save();
inode = btrfs_iget(fs_info->sb, &location, root, NULL);
+ memalloc_nofs_restore(nofs_flag);
if (IS_ERR(inode))
return inode;
@@ -1679,6 +1687,8 @@ static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
bitmap_clear(info->bitmap, start, count);
info->bytes -= bytes;
+ if (info->max_extent_size > ctl->unit)
+ info->max_extent_size = 0;
}
static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
@@ -1762,6 +1772,13 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl,
return -1;
}
+static inline u64 get_max_extent_size(struct btrfs_free_space *entry)
+{
+ if (entry->bitmap)
+ return entry->max_extent_size;
+ return entry->bytes;
+}
+
/* Cache the size of the max extent in bytes */
static struct btrfs_free_space *
find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
@@ -1783,8 +1800,8 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
for (node = &entry->offset_index; node; node = rb_next(node)) {
entry = rb_entry(node, struct btrfs_free_space, offset_index);
if (entry->bytes < *bytes) {
- if (entry->bytes > *max_extent_size)
- *max_extent_size = entry->bytes;
+ *max_extent_size = max(get_max_extent_size(entry),
+ *max_extent_size);
continue;
}
@@ -1802,8 +1819,8 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
}
if (entry->bytes < *bytes + align_off) {
- if (entry->bytes > *max_extent_size)
- *max_extent_size = entry->bytes;
+ *max_extent_size = max(get_max_extent_size(entry),
+ *max_extent_size);
continue;
}
@@ -1815,8 +1832,10 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
*offset = tmp;
*bytes = size;
return entry;
- } else if (size > *max_extent_size) {
- *max_extent_size = size;
+ } else {
+ *max_extent_size =
+ max(get_max_extent_size(entry),
+ *max_extent_size);
}
continue;
}
@@ -2110,8 +2129,7 @@ new_bitmap:
out:
if (info) {
- if (info->bitmap)
- kfree(info->bitmap);
+ kfree(info->bitmap);
kmem_cache_free(btrfs_free_space_cachep, info);
}
@@ -2440,6 +2458,7 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
struct rb_node *n;
int count = 0;
+ spin_lock(&ctl->tree_lock);
for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
info = rb_entry(n, struct btrfs_free_space, offset_index);
if (info->bytes >= bytes && !block_group->ro)
@@ -2448,6 +2467,7 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
info->offset, info->bytes,
(info->bitmap) ? "yes" : "no");
}
+ spin_unlock(&ctl->tree_lock);
btrfs_info(fs_info, "block group has cluster?: %s",
list_empty(&block_group->cluster_list) ? "no" : "yes");
btrfs_info(fs_info,
@@ -2676,8 +2696,8 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
err = search_bitmap(ctl, entry, &search_start, &search_bytes, true);
if (err) {
- if (search_bytes > *max_extent_size)
- *max_extent_size = search_bytes;
+ *max_extent_size = max(get_max_extent_size(entry),
+ *max_extent_size);
return 0;
}
@@ -2714,8 +2734,9 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
entry = rb_entry(node, struct btrfs_free_space, offset_index);
while (1) {
- if (entry->bytes < bytes && entry->bytes > *max_extent_size)
- *max_extent_size = entry->bytes;
+ if (entry->bytes < bytes)
+ *max_extent_size = max(get_max_extent_size(entry),
+ *max_extent_size);
if (entry->bytes < bytes ||
(!entry->bitmap && entry->offset < min_start)) {
@@ -3601,8 +3622,7 @@ again:
if (info)
kmem_cache_free(btrfs_free_space_cachep, info);
- if (map)
- kfree(map);
+ kfree(map);
return 0;
}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3ea5339603cf..d3df5b52278c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -64,7 +64,6 @@ static const struct inode_operations btrfs_dir_ro_inode_operations;
static const struct inode_operations btrfs_special_inode_operations;
static const struct inode_operations btrfs_file_inode_operations;
static const struct address_space_operations btrfs_aops;
-static const struct address_space_operations btrfs_symlink_aops;
static const struct file_operations btrfs_dir_file_operations;
static const struct extent_io_ops btrfs_extent_io_ops;
@@ -503,6 +502,7 @@ again:
pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
if (!pages) {
/* just bail out to the uncompressed code */
+ nr_pages = 0;
goto cont;
}
@@ -2750,12 +2750,9 @@ static void relink_file_extents(struct new_sa_defrag_extent *new)
struct btrfs_path *path;
struct sa_defrag_extent_backref *backref;
struct sa_defrag_extent_backref *prev = NULL;
- struct inode *inode;
struct rb_node *node;
int ret;
- inode = new->inode;
-
path = btrfs_alloc_path();
if (!path)
return;
@@ -2944,6 +2941,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
bool truncated = false;
bool range_locked = false;
bool clear_new_delalloc_bytes = false;
+ bool clear_reserved_extent = true;
if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
!test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
@@ -3047,10 +3045,12 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
logical_len, logical_len,
compress_type, 0, 0,
BTRFS_FILE_EXTENT_REG);
- if (!ret)
+ if (!ret) {
+ clear_reserved_extent = false;
btrfs_release_delalloc_bytes(fs_info,
ordered_extent->start,
ordered_extent->disk_len);
+ }
}
unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
ordered_extent->file_offset, ordered_extent->len,
@@ -3111,8 +3111,13 @@ out:
* wrong we need to return the space for this ordered extent
* back to the allocator. We only free the extent in the
* truncated case if we didn't write out the extent at all.
+ *
+ * If we made it past insert_reserved_file_extent before we
+ * errored out then we don't need to do this as the accounting
+ * has already been done.
*/
if ((ret || !logical_len) &&
+ clear_reserved_extent &&
!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
!test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
btrfs_free_reserved_extent(fs_info,
@@ -3471,8 +3476,6 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
/* this will do delete_inode and everything for us */
iput(inode);
- if (ret)
- goto out;
}
/* release the path since we're done with it */
btrfs_release_path(path);
@@ -3738,7 +3741,7 @@ cache_acl:
case S_IFLNK:
inode->i_op = &btrfs_symlink_inode_operations;
inode_nohighmem(inode);
- inode->i_mapping->a_ops = &btrfs_symlink_aops;
+ inode->i_mapping->a_ops = &btrfs_aops;
break;
default:
inode->i_op = &btrfs_special_inode_operations;
@@ -3910,12 +3913,8 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
path->leave_spinning = 1;
di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
name, name_len, -1);
- if (IS_ERR(di)) {
- ret = PTR_ERR(di);
- goto err;
- }
- if (!di) {
- ret = -ENOENT;
+ if (IS_ERR_OR_NULL(di)) {
+ ret = di ? PTR_ERR(di) : -ENOENT;
goto err;
}
leaf = path->nodes[0];
@@ -4075,10 +4074,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
name, name_len, -1);
if (IS_ERR_OR_NULL(di)) {
- if (!di)
- ret = -ENOENT;
- else
- ret = PTR_ERR(di);
+ ret = di ? PTR_ERR(di) : -ENOENT;
goto out;
}
@@ -4270,18 +4266,17 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
* again is not run concurrently.
*/
spin_lock(&dest->root_item_lock);
- root_flags = btrfs_root_flags(&dest->root_item);
- if (dest->send_in_progress == 0) {
- btrfs_set_root_flags(&dest->root_item,
- root_flags | BTRFS_ROOT_SUBVOL_DEAD);
- spin_unlock(&dest->root_item_lock);
- } else {
+ if (dest->send_in_progress) {
spin_unlock(&dest->root_item_lock);
btrfs_warn(fs_info,
"attempt to delete subvolume %llu during send",
dest->root_key.objectid);
return -EPERM;
}
+ root_flags = btrfs_root_flags(&dest->root_item);
+ btrfs_set_root_flags(&dest->root_item,
+ root_flags | BTRFS_ROOT_SUBVOL_DEAD);
+ spin_unlock(&dest->root_item_lock);
down_write(&fs_info->subvol_sem);
@@ -4727,7 +4722,7 @@ delete:
btrfs_abort_transaction(trans, ret);
break;
}
- if (btrfs_should_throttle_delayed_refs(trans, fs_info))
+ if (btrfs_should_throttle_delayed_refs(trans))
btrfs_async_run_delayed_refs(fs_info,
trans->delayed_ref_updates * 2,
trans->transid, 0);
@@ -4736,8 +4731,7 @@ delete:
extent_num_bytes)) {
should_end = true;
}
- if (btrfs_should_throttle_delayed_refs(trans,
- fs_info))
+ if (btrfs_should_throttle_delayed_refs(trans))
should_throttle = true;
}
}
@@ -5235,10 +5229,10 @@ static void evict_inode_truncate_pages(struct inode *inode)
truncate_inode_pages_final(&inode->i_data);
write_lock(&map_tree->lock);
- while (!RB_EMPTY_ROOT(&map_tree->map)) {
+ while (!RB_EMPTY_ROOT(&map_tree->map.rb_root)) {
struct extent_map *em;
- node = rb_first(&map_tree->map);
+ node = rb_first_cached(&map_tree->map);
em = rb_entry(node, struct extent_map, rb_node);
clear_bit(EXTENT_FLAG_PINNED, &em->flags);
clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
@@ -5274,11 +5268,13 @@ static void evict_inode_truncate_pages(struct inode *inode)
struct extent_state *cached_state = NULL;
u64 start;
u64 end;
+ unsigned state_flags;
node = rb_first(&io_tree->state);
state = rb_entry(node, struct extent_state, rb_node);
start = state->start;
end = state->end;
+ state_flags = state->state;
spin_unlock(&io_tree->lock);
lock_extent_bits(io_tree, start, end, &cached_state);
@@ -5291,7 +5287,7 @@ static void evict_inode_truncate_pages(struct inode *inode)
*
* Note, end is the bytenr of last byte, so we need + 1 here.
*/
- if (state->state & EXTENT_DELALLOC)
+ if (state_flags & EXTENT_DELALLOC)
btrfs_qgroup_free_data(inode, NULL, start, end - start + 1);
clear_extent_bit(io_tree, start, end,
@@ -5306,8 +5302,7 @@ static void evict_inode_truncate_pages(struct inode *inode)
}
static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
- struct btrfs_block_rsv *rsv,
- u64 min_size)
+ struct btrfs_block_rsv *rsv)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
@@ -5317,7 +5312,7 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
struct btrfs_trans_handle *trans;
int ret;
- ret = btrfs_block_rsv_refill(root, rsv, min_size,
+ ret = btrfs_block_rsv_refill(root, rsv, rsv->size,
BTRFS_RESERVE_FLUSH_LIMIT);
if (ret && ++failures > 2) {
@@ -5334,8 +5329,8 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
* Try to steal from the global reserve if there is space for
* it.
*/
- if (!btrfs_check_space_for_delayed_refs(trans, fs_info) &&
- !btrfs_block_rsv_migrate(global_rsv, rsv, min_size, 0))
+ if (!btrfs_check_space_for_delayed_refs(trans) &&
+ !btrfs_block_rsv_migrate(global_rsv, rsv, rsv->size, false))
return trans;
/* If not, commit and try again. */
@@ -5351,7 +5346,6 @@ void btrfs_evict_inode(struct inode *inode)
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_block_rsv *rsv;
- u64 min_size;
int ret;
trace_btrfs_inode_evict(inode);
@@ -5361,8 +5355,6 @@ void btrfs_evict_inode(struct inode *inode)
return;
}
- min_size = btrfs_calc_trunc_metadata_size(fs_info, 1);
-
evict_inode_truncate_pages(inode);
if (inode->i_nlink &&
@@ -5373,9 +5365,6 @@ void btrfs_evict_inode(struct inode *inode)
if (is_bad_inode(inode))
goto no_delete;
- /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */
- if (!special_file(inode->i_mode))
- btrfs_wait_ordered_range(inode, 0, (u64)-1);
btrfs_free_io_failure_record(BTRFS_I(inode), 0, (u64)-1);
@@ -5395,13 +5384,13 @@ void btrfs_evict_inode(struct inode *inode)
rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
if (!rsv)
goto no_delete;
- rsv->size = min_size;
+ rsv->size = btrfs_calc_trunc_metadata_size(fs_info, 1);
rsv->failfast = 1;
btrfs_i_size_write(BTRFS_I(inode), 0);
while (1) {
- trans = evict_refill_and_join(root, rsv, min_size);
+ trans = evict_refill_and_join(root, rsv);
if (IS_ERR(trans))
goto free_rsv;
@@ -5426,7 +5415,7 @@ void btrfs_evict_inode(struct inode *inode)
* If it turns out that we are dropping too many of these, we might want
* to add a mechanism for retrying these after a commit.
*/
- trans = evict_refill_and_join(root, rsv, min_size);
+ trans = evict_refill_and_join(root, rsv);
if (!IS_ERR(trans)) {
trans->block_rsv = rsv;
btrfs_orphan_del(trans, BTRFS_I(inode));
@@ -5471,12 +5460,8 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(BTRFS_I(dir)),
name, namelen, 0);
- if (!di) {
- ret = -ENOENT;
- goto out;
- }
- if (IS_ERR(di)) {
- ret = PTR_ERR(di);
+ if (IS_ERR_OR_NULL(di)) {
+ ret = di ? PTR_ERR(di) : -ENOENT;
goto out;
}
@@ -5790,16 +5775,10 @@ static int btrfs_dentry_delete(const struct dentry *dentry)
static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags)
{
- struct inode *inode;
-
- inode = btrfs_lookup_dentry(dir, dentry);
- if (IS_ERR(inode)) {
- if (PTR_ERR(inode) == -ENOENT)
- inode = NULL;
- else
- return ERR_CAST(inode);
- }
+ struct inode *inode = btrfs_lookup_dentry(dir, dentry);
+ if (inode == ERR_PTR(-ENOENT))
+ inode = NULL;
return d_splice_alias(inode, dentry);
}
@@ -6390,8 +6369,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
if (ret)
return ret;
- ret = btrfs_insert_dir_item(trans, root, name, name_len,
- parent_inode, &key,
+ ret = btrfs_insert_dir_item(trans, name, name_len, parent_inode, &key,
btrfs_inode_type(&inode->vfs_inode), index);
if (ret == -EEXIST || ret == -EOVERFLOW)
goto fail_dir_item;
@@ -6584,7 +6562,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
int drop_inode = 0;
/* do not allow sys_link's with other subvols of the same device */
- if (root->objectid != BTRFS_I(inode)->root->objectid)
+ if (root->root_key.objectid != BTRFS_I(inode)->root->root_key.objectid)
return -EXDEV;
if (inode->i_nlink >= BTRFS_LINK_MAX)
@@ -6777,9 +6755,9 @@ static noinline int uncompress_inline(struct btrfs_path *path,
* This also copies inline extents directly into the page.
*/
struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
- struct page *page,
- size_t pg_offset, u64 start, u64 len,
- int create)
+ struct page *page,
+ size_t pg_offset, u64 start, u64 len,
+ int create)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
int ret;
@@ -6823,19 +6801,21 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
em->len = (u64)-1;
em->block_len = (u64)-1;
+ path = btrfs_alloc_path();
if (!path) {
- path = btrfs_alloc_path();
- if (!path) {
- err = -ENOMEM;
- goto out;
- }
- /*
- * Chances are we'll be called again, so go ahead and do
- * readahead
- */
- path->reada = READA_FORWARD;
+ err = -ENOMEM;
+ goto out;
}
+ /* Chances are we'll be called again, so go ahead and do readahead */
+ path->reada = READA_FORWARD;
+
+ /*
+ * Unless we're going to uncompress the inline extent, no sleep would
+ * happen.
+ */
+ path->leave_spinning = 1;
+
ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0);
if (ret < 0) {
err = ret;
@@ -6938,6 +6918,8 @@ next:
em->orig_block_len = em->len;
em->orig_start = em->start;
ptr = btrfs_file_extent_inline_start(item) + extent_offset;
+
+ btrfs_set_path_blocking(path);
if (!PageUptodate(page)) {
if (btrfs_file_extent_compression(leaf, item) !=
BTRFS_COMPRESS_NONE) {
@@ -6985,10 +6967,10 @@ insert:
err = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
write_unlock(&em_tree->lock);
out:
+ btrfs_free_path(path);
trace_btrfs_get_extent(root, inode, em);
- btrfs_free_path(path);
if (err) {
free_extent_map(em);
return ERR_PTR(err);
@@ -9021,7 +9003,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
/* Migrate the slack space for the truncate to our reserve */
ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
- min_size, 0);
+ min_size, false);
BUG_ON(ret);
/*
@@ -9058,7 +9040,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
btrfs_block_rsv_release(fs_info, rsv, -1);
ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
- rsv, min_size, 0);
+ rsv, min_size, false);
BUG_ON(ret); /* shouldn't happen */
trans->block_rsv = rsv;
}
@@ -10191,7 +10173,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
inode->i_op = &btrfs_symlink_inode_operations;
inode_nohighmem(inode);
- inode->i_mapping->a_ops = &btrfs_symlink_aops;
+ inode->i_mapping->a_ops = &btrfs_aops;
inode_set_bytes(inode, name_len);
btrfs_i_size_write(BTRFS_I(inode), name_len);
err = btrfs_update_inode(trans, root, inode);
@@ -10567,13 +10549,6 @@ static const struct address_space_operations btrfs_aops = {
.error_remove_page = generic_error_remove_page,
};
-static const struct address_space_operations btrfs_symlink_aops = {
- .readpage = btrfs_readpage,
- .writepage = btrfs_writepage,
- .invalidatepage = btrfs_invalidatepage,
- .releasepage = btrfs_releasepage,
-};
-
static const struct inode_operations btrfs_file_inode_operations = {
.getattr = btrfs_getattr,
.setattr = btrfs_setattr,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index d60b6caf09e8..a990a9045139 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -491,7 +491,6 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
struct fstrim_range range;
u64 minlen = ULLONG_MAX;
u64 num_devices = 0;
- u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
int ret;
if (!capable(CAP_SYS_ADMIN))
@@ -515,11 +514,15 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
return -EOPNOTSUPP;
if (copy_from_user(&range, arg, sizeof(range)))
return -EFAULT;
- if (range.start > total_bytes ||
- range.len < fs_info->sb->s_blocksize)
+
+ /*
+ * NOTE: Don't truncate the range using super->total_bytes. Bytenr of
+ * block group is in the logical address space, which can be any
+ * sectorsize aligned bytenr in the range [0, U64_MAX].
+ */
+ if (range.len < fs_info->sb->s_blocksize)
return -EINVAL;
- range.len = min(range.len, total_bytes - range.start);
range.minlen = max(range.minlen, minlen);
ret = btrfs_trim_fs(fs_info, &range);
if (ret < 0)
@@ -686,8 +689,7 @@ static noinline int create_subvol(struct inode *dir,
goto fail;
}
- ret = btrfs_insert_dir_item(trans, root,
- name, namelen, BTRFS_I(dir), &key,
+ ret = btrfs_insert_dir_item(trans, name, namelen, BTRFS_I(dir), &key,
BTRFS_FT_DIR, index);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -1324,7 +1326,7 @@ again:
if (i_done != page_cnt) {
spin_lock(&BTRFS_I(inode)->lock);
- BTRFS_I(inode)->outstanding_extents++;
+ btrfs_mod_outstanding_extents(BTRFS_I(inode), 1);
spin_unlock(&BTRFS_I(inode)->lock);
btrfs_delalloc_release_space(inode, data_reserved,
start_index << PAGE_SHIFT,
@@ -4393,7 +4395,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
ret = PTR_ERR(new_root);
goto out;
}
- if (!is_fstree(new_root->objectid)) {
+ if (!is_fstree(new_root->root_key.objectid)) {
ret = -ENOENT;
goto out;
}
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index d4917c0cddf5..45868fd76209 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1416,13 +1416,14 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
if (!qgroup) {
ret = -ENOENT;
goto out;
- } else {
- /* check if there are no children of this qgroup */
- if (!list_empty(&qgroup->members)) {
- ret = -EBUSY;
- goto out;
- }
}
+
+ /* Check if there are no children of this qgroup */
+ if (!list_empty(&qgroup->members)) {
+ ret = -EBUSY;
+ goto out;
+ }
+
ret = del_qgroup_item(trans, qgroupid);
if (ret && ret != -ENOENT)
goto out;
@@ -1712,6 +1713,416 @@ static int adjust_slots_upwards(struct btrfs_path *path, int root_level)
return 0;
}
+/*
+ * Helper function to trace a subtree tree block swap.
+ *
+ * The swap will happen in highest tree block, but there may be a lot of
+ * tree blocks involved.
+ *
+ * For example:
+ * OO = Old tree blocks
+ * NN = New tree blocks allocated during balance
+ *
+ * File tree (257) Reloc tree for 257
+ * L2 OO NN
+ * / \ / \
+ * L1 OO OO (a) OO NN (a)
+ * / \ / \ / \ / \
+ * L0 OO OO OO OO OO OO NN NN
+ * (b) (c) (b) (c)
+ *
+ * When calling qgroup_trace_extent_swap(), we will pass:
+ * @src_eb = OO(a)
+ * @dst_path = [ nodes[1] = NN(a), nodes[0] = NN(c) ]
+ * @dst_level = 0
+ * @root_level = 1
+ *
+ * In that case, qgroup_trace_extent_swap() will search from OO(a) to
+ * reach OO(c), then mark both OO(c) and NN(c) as qgroup dirty.
+ *
+ * The main work of qgroup_trace_extent_swap() can be split into 3 parts:
+ *
+ * 1) Tree search from @src_eb
+ * It should acts as a simplified btrfs_search_slot().
+ * The key for search can be extracted from @dst_path->nodes[dst_level]
+ * (first key).
+ *
+ * 2) Mark the final tree blocks in @src_path and @dst_path qgroup dirty
+ * NOTE: In above case, OO(a) and NN(a) won't be marked qgroup dirty.
+ * They should be marked during preivous (@dst_level = 1) iteration.
+ *
+ * 3) Mark file extents in leaves dirty
+ * We don't have good way to pick out new file extents only.
+ * So we still follow the old method by scanning all file extents in
+ * the leave.
+ *
+ * This function can free us from keeping two pathes, thus later we only need
+ * to care about how to iterate all new tree blocks in reloc tree.
+ */
+static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans,
+ struct extent_buffer *src_eb,
+ struct btrfs_path *dst_path,
+ int dst_level, int root_level,
+ bool trace_leaf)
+{
+ struct btrfs_key key;
+ struct btrfs_path *src_path;
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ u32 nodesize = fs_info->nodesize;
+ int cur_level = root_level;
+ int ret;
+
+ BUG_ON(dst_level > root_level);
+ /* Level mismatch */
+ if (btrfs_header_level(src_eb) != root_level)
+ return -EINVAL;
+
+ src_path = btrfs_alloc_path();
+ if (!src_path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (dst_level)
+ btrfs_node_key_to_cpu(dst_path->nodes[dst_level], &key, 0);
+ else
+ btrfs_item_key_to_cpu(dst_path->nodes[dst_level], &key, 0);
+
+ /* For src_path */
+ extent_buffer_get(src_eb);
+ src_path->nodes[root_level] = src_eb;
+ src_path->slots[root_level] = dst_path->slots[root_level];
+ src_path->locks[root_level] = 0;
+
+ /* A simplified version of btrfs_search_slot() */
+ while (cur_level >= dst_level) {
+ struct btrfs_key src_key;
+ struct btrfs_key dst_key;
+
+ if (src_path->nodes[cur_level] == NULL) {
+ struct btrfs_key first_key;
+ struct extent_buffer *eb;
+ int parent_slot;
+ u64 child_gen;
+ u64 child_bytenr;
+
+ eb = src_path->nodes[cur_level + 1];
+ parent_slot = src_path->slots[cur_level + 1];
+ child_bytenr = btrfs_node_blockptr(eb, parent_slot);
+ child_gen = btrfs_node_ptr_generation(eb, parent_slot);
+ btrfs_node_key_to_cpu(eb, &first_key, parent_slot);
+
+ eb = read_tree_block(fs_info, child_bytenr, child_gen,
+ cur_level, &first_key);
+ if (IS_ERR(eb)) {
+ ret = PTR_ERR(eb);
+ goto out;
+ } else if (!extent_buffer_uptodate(eb)) {
+ free_extent_buffer(eb);
+ ret = -EIO;
+ goto out;
+ }
+
+ src_path->nodes[cur_level] = eb;
+
+ btrfs_tree_read_lock(eb);
+ btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ src_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING;
+ }
+
+ src_path->slots[cur_level] = dst_path->slots[cur_level];
+ if (cur_level) {
+ btrfs_node_key_to_cpu(dst_path->nodes[cur_level],
+ &dst_key, dst_path->slots[cur_level]);
+ btrfs_node_key_to_cpu(src_path->nodes[cur_level],
+ &src_key, src_path->slots[cur_level]);
+ } else {
+ btrfs_item_key_to_cpu(dst_path->nodes[cur_level],
+ &dst_key, dst_path->slots[cur_level]);
+ btrfs_item_key_to_cpu(src_path->nodes[cur_level],
+ &src_key, src_path->slots[cur_level]);
+ }
+ /* Content mismatch, something went wrong */
+ if (btrfs_comp_cpu_keys(&dst_key, &src_key)) {
+ ret = -ENOENT;
+ goto out;
+ }
+ cur_level--;
+ }
+
+ /*
+ * Now both @dst_path and @src_path have been populated, record the tree
+ * blocks for qgroup accounting.
+ */
+ ret = btrfs_qgroup_trace_extent(trans, src_path->nodes[dst_level]->start,
+ nodesize, GFP_NOFS);
+ if (ret < 0)
+ goto out;
+ ret = btrfs_qgroup_trace_extent(trans,
+ dst_path->nodes[dst_level]->start,
+ nodesize, GFP_NOFS);
+ if (ret < 0)
+ goto out;
+
+ /* Record leaf file extents */
+ if (dst_level == 0 && trace_leaf) {
+ ret = btrfs_qgroup_trace_leaf_items(trans, src_path->nodes[0]);
+ if (ret < 0)
+ goto out;
+ ret = btrfs_qgroup_trace_leaf_items(trans, dst_path->nodes[0]);
+ }
+out:
+ btrfs_free_path(src_path);
+ return ret;
+}
+
+/*
+ * Helper function to do recursive generation-aware depth-first search, to
+ * locate all new tree blocks in a subtree of reloc tree.
+ *
+ * E.g. (OO = Old tree blocks, NN = New tree blocks, whose gen == last_snapshot)
+ * reloc tree
+ * L2 NN (a)
+ * / \
+ * L1 OO NN (b)
+ * / \ / \
+ * L0 OO OO OO NN
+ * (c) (d)
+ * If we pass:
+ * @dst_path = [ nodes[1] = NN(b), nodes[0] = NULL ],
+ * @cur_level = 1
+ * @root_level = 1
+ *
+ * We will iterate through tree blocks NN(b), NN(d) and info qgroup to trace
+ * above tree blocks along with their counter parts in file tree.
+ * While during search, old tree blocsk OO(c) will be skiped as tree block swap
+ * won't affect OO(c).
+ */
+static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans,
+ struct extent_buffer *src_eb,
+ struct btrfs_path *dst_path,
+ int cur_level, int root_level,
+ u64 last_snapshot, bool trace_leaf)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct extent_buffer *eb;
+ bool need_cleanup = false;
+ int ret = 0;
+ int i;
+
+ /* Level sanity check */
+ if (cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL ||
+ root_level < 0 || root_level >= BTRFS_MAX_LEVEL ||
+ root_level < cur_level) {
+ btrfs_err_rl(fs_info,
+ "%s: bad levels, cur_level=%d root_level=%d",
+ __func__, cur_level, root_level);
+ return -EUCLEAN;
+ }
+
+ /* Read the tree block if needed */
+ if (dst_path->nodes[cur_level] == NULL) {
+ struct btrfs_key first_key;
+ int parent_slot;
+ u64 child_gen;
+ u64 child_bytenr;
+
+ /*
+ * dst_path->nodes[root_level] must be initialized before
+ * calling this function.
+ */
+ if (cur_level == root_level) {
+ btrfs_err_rl(fs_info,
+ "%s: dst_path->nodes[%d] not initialized, root_level=%d cur_level=%d",
+ __func__, root_level, root_level, cur_level);
+ return -EUCLEAN;
+ }
+
+ /*
+ * We need to get child blockptr/gen from parent before we can
+ * read it.
+ */
+ eb = dst_path->nodes[cur_level + 1];
+ parent_slot = dst_path->slots[cur_level + 1];
+ child_bytenr = btrfs_node_blockptr(eb, parent_slot);
+ child_gen = btrfs_node_ptr_generation(eb, parent_slot);
+ btrfs_node_key_to_cpu(eb, &first_key, parent_slot);
+
+ /* This node is old, no need to trace */
+ if (child_gen < last_snapshot)
+ goto out;
+
+ eb = read_tree_block(fs_info, child_bytenr, child_gen,
+ cur_level, &first_key);
+ if (IS_ERR(eb)) {
+ ret = PTR_ERR(eb);
+ goto out;
+ } else if (!extent_buffer_uptodate(eb)) {
+ free_extent_buffer(eb);
+ ret = -EIO;
+ goto out;
+ }
+
+ dst_path->nodes[cur_level] = eb;
+ dst_path->slots[cur_level] = 0;
+
+ btrfs_tree_read_lock(eb);
+ btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ dst_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING;
+ need_cleanup = true;
+ }
+
+ /* Now record this tree block and its counter part for qgroups */
+ ret = qgroup_trace_extent_swap(trans, src_eb, dst_path, cur_level,
+ root_level, trace_leaf);
+ if (ret < 0)
+ goto cleanup;
+
+ eb = dst_path->nodes[cur_level];
+
+ if (cur_level > 0) {
+ /* Iterate all child tree blocks */
+ for (i = 0; i < btrfs_header_nritems(eb); i++) {
+ /* Skip old tree blocks as they won't be swapped */
+ if (btrfs_node_ptr_generation(eb, i) < last_snapshot)
+ continue;
+ dst_path->slots[cur_level] = i;
+
+ /* Recursive call (at most 7 times) */
+ ret = qgroup_trace_new_subtree_blocks(trans, src_eb,
+ dst_path, cur_level - 1, root_level,
+ last_snapshot, trace_leaf);
+ if (ret < 0)
+ goto cleanup;
+ }
+ }
+
+cleanup:
+ if (need_cleanup) {
+ /* Clean up */
+ btrfs_tree_unlock_rw(dst_path->nodes[cur_level],
+ dst_path->locks[cur_level]);
+ free_extent_buffer(dst_path->nodes[cur_level]);
+ dst_path->nodes[cur_level] = NULL;
+ dst_path->slots[cur_level] = 0;
+ dst_path->locks[cur_level] = 0;
+ }
+out:
+ return ret;
+}
+
+/*
+ * Inform qgroup to trace subtree swap used in balance.
+ *
+ * Unlike btrfs_qgroup_trace_subtree(), this function will only trace
+ * new tree blocks whose generation is equal to (or larger than) @last_snapshot.
+ *
+ * Will go down the tree block pointed by @dst_eb (pointed by @dst_parent and
+ * @dst_slot), and find any tree blocks whose generation is at @last_snapshot,
+ * and then go down @src_eb (pointed by @src_parent and @src_slot) to find
+ * the conterpart of the tree block, then mark both tree blocks as qgroup dirty,
+ * and skip all tree blocks whose generation is smaller than last_snapshot.
+ *
+ * This would skip tons of tree blocks of original btrfs_qgroup_trace_subtree(),
+ * which could be the cause of very slow balance if the file tree is large.
+ *
+ * @src_parent, @src_slot: pointer to src (file tree) eb.
+ * @dst_parent, @dst_slot: pointer to dst (reloc tree) eb.
+ */
+int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group_cache *bg_cache,
+ struct extent_buffer *src_parent, int src_slot,
+ struct extent_buffer *dst_parent, int dst_slot,
+ u64 last_snapshot)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_path *dst_path = NULL;
+ struct btrfs_key first_key;
+ struct extent_buffer *src_eb = NULL;
+ struct extent_buffer *dst_eb = NULL;
+ bool trace_leaf = false;
+ u64 child_gen;
+ u64 child_bytenr;
+ int level;
+ int ret;
+
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ return 0;
+
+ /* Check parameter order */
+ if (btrfs_node_ptr_generation(src_parent, src_slot) >
+ btrfs_node_ptr_generation(dst_parent, dst_slot)) {
+ btrfs_err_rl(fs_info,
+ "%s: bad parameter order, src_gen=%llu dst_gen=%llu", __func__,
+ btrfs_node_ptr_generation(src_parent, src_slot),
+ btrfs_node_ptr_generation(dst_parent, dst_slot));
+ return -EUCLEAN;
+ }
+
+ /*
+ * Only trace leaf if we're relocating data block groups, this could
+ * reduce tons of data extents tracing for meta/sys bg relocation.
+ */
+ if (bg_cache->flags & BTRFS_BLOCK_GROUP_DATA)
+ trace_leaf = true;
+ /* Read out real @src_eb, pointed by @src_parent and @src_slot */
+ child_bytenr = btrfs_node_blockptr(src_parent, src_slot);
+ child_gen = btrfs_node_ptr_generation(src_parent, src_slot);
+ btrfs_node_key_to_cpu(src_parent, &first_key, src_slot);
+
+ src_eb = read_tree_block(fs_info, child_bytenr, child_gen,
+ btrfs_header_level(src_parent) - 1, &first_key);
+ if (IS_ERR(src_eb)) {
+ ret = PTR_ERR(src_eb);
+ goto out;
+ }
+
+ /* Read out real @dst_eb, pointed by @src_parent and @src_slot */
+ child_bytenr = btrfs_node_blockptr(dst_parent, dst_slot);
+ child_gen = btrfs_node_ptr_generation(dst_parent, dst_slot);
+ btrfs_node_key_to_cpu(dst_parent, &first_key, dst_slot);
+
+ dst_eb = read_tree_block(fs_info, child_bytenr, child_gen,
+ btrfs_header_level(dst_parent) - 1, &first_key);
+ if (IS_ERR(dst_eb)) {
+ ret = PTR_ERR(dst_eb);
+ goto out;
+ }
+
+ if (!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ level = btrfs_header_level(dst_eb);
+ dst_path = btrfs_alloc_path();
+ if (!dst_path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /* For dst_path */
+ extent_buffer_get(dst_eb);
+ dst_path->nodes[level] = dst_eb;
+ dst_path->slots[level] = 0;
+ dst_path->locks[level] = 0;
+
+ /* Do the generation-aware breadth-first search */
+ ret = qgroup_trace_new_subtree_blocks(trans, src_eb, dst_path, level,
+ level, last_snapshot, trace_leaf);
+ if (ret < 0)
+ goto out;
+ ret = 0;
+
+out:
+ free_extent_buffer(src_eb);
+ free_extent_buffer(dst_eb);
+ btrfs_free_path(dst_path);
+ if (ret < 0)
+ fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ return ret;
+}
+
int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
struct extent_buffer *root_eb,
u64 root_gen, int root_level)
@@ -2132,6 +2543,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
struct btrfs_delayed_ref_root *delayed_refs;
struct ulist *new_roots = NULL;
struct rb_node *node;
+ u64 num_dirty_extents = 0;
u64 qgroup_to_skip;
int ret = 0;
@@ -2141,6 +2553,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
record = rb_entry(node, struct btrfs_qgroup_extent_record,
node);
+ num_dirty_extents++;
trace_btrfs_qgroup_account_extents(fs_info, record);
if (!ret) {
@@ -2186,6 +2599,8 @@ cleanup:
kfree(record);
}
+ trace_qgroup_num_dirty_extents(fs_info, trans->transid,
+ num_dirty_extents);
return ret;
}
@@ -2897,6 +3312,7 @@ qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info)
qgroup->rfer_cmpr = 0;
qgroup->excl = 0;
qgroup->excl_cmpr = 0;
+ qgroup_dirty(fs_info, qgroup);
}
spin_unlock(&fs_info->qgroup_lock);
}
@@ -3004,7 +3420,7 @@ int btrfs_qgroup_reserve_data(struct inode *inode,
int ret;
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) ||
- !is_fstree(root->objectid) || len == 0)
+ !is_fstree(root->root_key.objectid) || len == 0)
return 0;
/* @reserved parameter is mandatory for qgroup */
@@ -3090,7 +3506,7 @@ static int qgroup_free_reserved_data(struct inode *inode,
goto out;
freed += changeset.bytes_changed;
}
- btrfs_qgroup_free_refroot(root->fs_info, root->objectid, freed,
+ btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid, freed,
BTRFS_QGROUP_RSV_DATA);
ret = freed;
out:
@@ -3106,6 +3522,10 @@ static int __btrfs_qgroup_release_data(struct inode *inode,
int trace_op = QGROUP_RELEASE;
int ret;
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED,
+ &BTRFS_I(inode)->root->fs_info->flags))
+ return 0;
+
/* In release case, we shouldn't have @reserved */
WARN_ON(!free && reserved);
if (free && reserved)
@@ -3122,7 +3542,7 @@ static int __btrfs_qgroup_release_data(struct inode *inode,
changeset.bytes_changed, trace_op);
if (free)
btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info,
- BTRFS_I(inode)->root->objectid,
+ BTRFS_I(inode)->root->root_key.objectid,
changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
ret = changeset.bytes_changed;
out:
@@ -3215,7 +3635,7 @@ int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
int ret;
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
- !is_fstree(root->objectid) || num_bytes == 0)
+ !is_fstree(root->root_key.objectid) || num_bytes == 0)
return 0;
BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
@@ -3240,13 +3660,13 @@ void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root)
struct btrfs_fs_info *fs_info = root->fs_info;
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
- !is_fstree(root->objectid))
+ !is_fstree(root->root_key.objectid))
return;
/* TODO: Update trace point to handle such free */
trace_qgroup_meta_free_all_pertrans(root);
/* Special value -1 means to free all reserved space */
- btrfs_qgroup_free_refroot(fs_info, root->objectid, (u64)-1,
+ btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid, (u64)-1,
BTRFS_QGROUP_RSV_META_PERTRANS);
}
@@ -3256,7 +3676,7 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
struct btrfs_fs_info *fs_info = root->fs_info;
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
- !is_fstree(root->objectid))
+ !is_fstree(root->root_key.objectid))
return;
/*
@@ -3267,7 +3687,8 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
num_bytes = sub_root_meta_rsv(root, num_bytes, type);
BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
trace_qgroup_meta_reserve(root, type, -(s64)num_bytes);
- btrfs_qgroup_free_refroot(fs_info, root->objectid, num_bytes, type);
+ btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid,
+ num_bytes, type);
}
static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root,
@@ -3321,13 +3742,13 @@ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes)
struct btrfs_fs_info *fs_info = root->fs_info;
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
- !is_fstree(root->objectid))
+ !is_fstree(root->root_key.objectid))
return;
/* Same as btrfs_qgroup_free_meta_prealloc() */
num_bytes = sub_root_meta_rsv(root, num_bytes,
BTRFS_QGROUP_RSV_META_PREALLOC);
trace_qgroup_meta_convert(root, num_bytes);
- qgroup_convert_meta(fs_info, root->objectid, num_bytes);
+ qgroup_convert_meta(fs_info, root->root_key.objectid, num_bytes);
}
/*
@@ -3354,7 +3775,7 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode)
inode->i_ino, unode->val, unode->aux);
}
btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info,
- BTRFS_I(inode)->root->objectid,
+ BTRFS_I(inode)->root->root_key.objectid,
changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
}
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 54b8bb282c0e..d8f78f5ab854 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -236,6 +236,12 @@ int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
struct extent_buffer *root_eb,
u64 root_gen, int root_level);
+
+int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group_cache *bg_cache,
+ struct extent_buffer *src_parent, int src_slot,
+ struct extent_buffer *dst_parent, int dst_slot,
+ u64 last_snapshot);
int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
u64 num_bytes, struct ulist *old_roots,
struct ulist *new_roots);
@@ -249,6 +255,8 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
static inline void btrfs_qgroup_free_delayed_ref(struct btrfs_fs_info *fs_info,
u64 ref_root, u64 num_bytes)
{
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ return;
trace_btrfs_qgroup_free_delayed_ref(fs_info, ref_root, num_bytes);
btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes,
BTRFS_QGROUP_RSV_DATA);
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index e5b9e596bb92..d69fbfb30aa9 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -732,7 +732,7 @@ int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, u64 num_bytes,
INIT_LIST_HEAD(&ra->list);
ra->action = action;
- ra->root = root->objectid;
+ ra->root = root->root_key.objectid;
/*
* This is an allocation, preallocate the block_entry in case we haven't
@@ -787,8 +787,8 @@ int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, u64 num_bytes,
* one we want to lookup below when we modify the
* re->num_refs.
*/
- ref_root = root->objectid;
- re->root_objectid = root->objectid;
+ ref_root = root->root_key.objectid;
+ re->root_objectid = root->root_key.objectid;
re->num_refs = 0;
}
@@ -862,7 +862,7 @@ int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, u64 num_bytes,
* didn't thik of some other corner case.
*/
btrfs_err(fs_info, "failed to find root %llu for %llu",
- root->objectid, be->bytenr);
+ root->root_key.objectid, be->bytenr);
dump_block_entry(fs_info, be);
dump_ref_action(fs_info, ra);
kfree(ra);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 8783a1776540..924116f654a1 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -648,8 +648,8 @@ struct backref_node *build_backref_tree(struct reloc_control *rc,
int level, u64 bytenr)
{
struct backref_cache *cache = &rc->backref_cache;
- struct btrfs_path *path1;
- struct btrfs_path *path2;
+ struct btrfs_path *path1; /* For searching extent root */
+ struct btrfs_path *path2; /* For searching parent of TREE_BLOCK_REF */
struct extent_buffer *eb;
struct btrfs_root *root;
struct backref_node *cur;
@@ -662,7 +662,7 @@ struct backref_node *build_backref_tree(struct reloc_control *rc,
struct btrfs_key key;
unsigned long end;
unsigned long ptr;
- LIST_HEAD(list);
+ LIST_HEAD(list); /* Pending edge list, upper node needs to be checked */
LIST_HEAD(useless);
int cowonly;
int ret;
@@ -778,6 +778,10 @@ again:
key.type != BTRFS_SHARED_BLOCK_REF_KEY);
}
+ /*
+ * Parent node found and matches current inline ref, no need to
+ * rebuild this node for this inline ref.
+ */
if (exist &&
((key.type == BTRFS_TREE_BLOCK_REF_KEY &&
exist->owner == key.offset) ||
@@ -787,11 +791,12 @@ again:
goto next;
}
+ /* SHARED_BLOCK_REF means key.offset is the parent bytenr */
if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) {
if (key.objectid == key.offset) {
/*
- * only root blocks of reloc trees use
- * backref of this type.
+ * Only root blocks of reloc trees use backref
+ * pointing to itself.
*/
root = find_reloc_root(rc, cur->bytenr);
ASSERT(root);
@@ -840,7 +845,11 @@ again:
goto next;
}
- /* key.type == BTRFS_TREE_BLOCK_REF_KEY */
+ /*
+ * key.type == BTRFS_TREE_BLOCK_REF_KEY, inline ref offset
+ * means the root objectid. We need to search the tree to get
+ * its parent bytenr.
+ */
root = read_fs_root(rc->extent_root->fs_info, key.offset);
if (IS_ERR(root)) {
err = PTR_ERR(root);
@@ -863,10 +872,7 @@ again:
level = cur->level + 1;
- /*
- * searching the tree to find upper level blocks
- * reference the block.
- */
+ /* Search the tree to find parent blocks referring the block. */
path2->search_commit_root = 1;
path2->skip_locking = 1;
path2->lowest_level = level;
@@ -884,7 +890,8 @@ again:
cur->bytenr) {
btrfs_err(root->fs_info,
"couldn't find block (%llu) (level %d) in tree (%llu) with key (%llu %u %llu)",
- cur->bytenr, level - 1, root->objectid,
+ cur->bytenr, level - 1,
+ root->root_key.objectid,
node_key->objectid, node_key->type,
node_key->offset);
err = -ENOENT;
@@ -892,6 +899,8 @@ again:
}
lower = cur;
need_check = true;
+
+ /* Add all nodes and edges in the path */
for (; level < BTRFS_MAX_LEVEL; level++) {
if (!path2->nodes[level]) {
ASSERT(btrfs_root_bytenr(&root->root_item) ==
@@ -1281,7 +1290,7 @@ static void __del_reloc_root(struct btrfs_root *root)
struct mapping_node *node = NULL;
struct reloc_control *rc = fs_info->reloc_ctl;
- if (rc) {
+ if (rc && root->node) {
spin_lock(&rc->reloc_root_tree.lock);
rb_node = tree_search(&rc->reloc_root_tree.rb_root,
root->node->start);
@@ -1735,7 +1744,7 @@ int memcmp_node_keys(struct extent_buffer *eb, int slot,
* errors, a negative error number is returned.
*/
static noinline_for_stack
-int replace_path(struct btrfs_trans_handle *trans,
+int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc,
struct btrfs_root *dest, struct btrfs_root *src,
struct btrfs_path *path, struct btrfs_key *next_key,
int lowest_level, int max_level)
@@ -1879,14 +1888,9 @@ again:
* and tree block numbers, if current trans doesn't free
* data reloc tree inode.
*/
- ret = btrfs_qgroup_trace_subtree(trans, parent,
- btrfs_header_generation(parent),
- btrfs_header_level(parent));
- if (ret < 0)
- break;
- ret = btrfs_qgroup_trace_subtree(trans, path->nodes[level],
- btrfs_header_generation(path->nodes[level]),
- btrfs_header_level(path->nodes[level]));
+ ret = btrfs_qgroup_trace_subtree_swap(trans, rc->block_group,
+ parent, slot, path->nodes[level],
+ path->slots[level], last_snapshot);
if (ret < 0)
break;
@@ -2205,7 +2209,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
ret = 0;
} else {
- ret = replace_path(trans, root, reloc_root, path,
+ ret = replace_path(trans, rc, root, reloc_root, path,
&next_key, level, max_level);
}
if (ret < 0) {
@@ -2911,7 +2915,6 @@ static int get_tree_block_key(struct btrfs_fs_info *fs_info,
free_extent_buffer(eb);
return -EIO;
}
- WARN_ON(btrfs_header_level(eb) != block->level);
if (block->level == 0)
btrfs_item_key_to_cpu(eb, &block->key, 0);
else
@@ -2987,7 +2990,7 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
struct backref_node *node;
struct btrfs_path *path;
struct tree_block *block;
- struct rb_node *rb_node;
+ struct tree_block *next;
int ret;
int err = 0;
@@ -2997,29 +3000,23 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
goto out_free_blocks;
}
- rb_node = rb_first(blocks);
- while (rb_node) {
- block = rb_entry(rb_node, struct tree_block, rb_node);
+ /* Kick in readahead for tree blocks with missing keys */
+ rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
if (!block->key_ready)
readahead_tree_block(fs_info, block->bytenr);
- rb_node = rb_next(rb_node);
}
- rb_node = rb_first(blocks);
- while (rb_node) {
- block = rb_entry(rb_node, struct tree_block, rb_node);
+ /* Get first keys */
+ rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
if (!block->key_ready) {
err = get_tree_block_key(fs_info, block);
if (err)
goto out_free_path;
}
- rb_node = rb_next(rb_node);
}
- rb_node = rb_first(blocks);
- while (rb_node) {
- block = rb_entry(rb_node, struct tree_block, rb_node);
-
+ /* Do tree relocation */
+ rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
node = build_backref_tree(rc, &block->key,
block->level, block->bytenr);
if (IS_ERR(node)) {
@@ -3030,11 +3027,10 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
ret = relocate_tree_block(trans, rc, node, &block->key,
path);
if (ret < 0) {
- if (ret != -EAGAIN || rb_node == rb_first(blocks))
+ if (ret != -EAGAIN || &block->rb_node == rb_first(blocks))
err = ret;
goto out;
}
- rb_node = rb_next(rb_node);
}
out:
err = finish_pending_nodes(trans, rc, path, err);
@@ -4669,7 +4665,7 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
if (rc->merge_reloc_tree) {
ret = btrfs_block_rsv_migrate(&pending->block_rsv,
rc->block_rsv,
- rc->nodes_relocated, 1);
+ rc->nodes_relocated, true);
if (ret)
return ret;
}
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 3be1456b5116..902819d3cf41 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1124,7 +1124,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
if (scrub_write_page_to_dev_replace(sblock_other,
page_num) != 0) {
- btrfs_dev_replace_stats_inc(
+ atomic64_inc(
&fs_info->dev_replace.num_write_errors);
success = 0;
}
@@ -1564,8 +1564,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
if (btrfsic_submit_bio_wait(bio)) {
btrfs_dev_stat_inc_and_print(page_bad->dev,
BTRFS_DEV_STAT_WRITE_ERRS);
- btrfs_dev_replace_stats_inc(
- &fs_info->dev_replace.num_write_errors);
+ atomic64_inc(&fs_info->dev_replace.num_write_errors);
bio_put(bio);
return -EIO;
}
@@ -1592,8 +1591,7 @@ static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
ret = scrub_write_page_to_dev_replace(sblock, page_num);
if (ret)
- btrfs_dev_replace_stats_inc(
- &fs_info->dev_replace.num_write_errors);
+ atomic64_inc(&fs_info->dev_replace.num_write_errors);
}
}
@@ -1726,8 +1724,7 @@ static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
struct scrub_page *spage = sbio->pagev[i];
spage->io_error = 1;
- btrfs_dev_replace_stats_inc(&dev_replace->
- num_write_errors);
+ atomic64_inc(&dev_replace->num_write_errors);
}
}
@@ -3022,8 +3019,7 @@ out:
static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
struct map_lookup *map,
struct btrfs_device *scrub_dev,
- int num, u64 base, u64 length,
- int is_dev_replace)
+ int num, u64 base, u64 length)
{
struct btrfs_path *path, *ppath;
struct btrfs_fs_info *fs_info = sctx->fs_info;
@@ -3299,7 +3295,7 @@ again:
extent_physical = extent_logical - logical + physical;
extent_dev = scrub_dev;
extent_mirror_num = mirror_num;
- if (is_dev_replace)
+ if (sctx->is_dev_replace)
scrub_remap_extent(fs_info, extent_logical,
extent_len, &extent_physical,
&extent_dev,
@@ -3397,8 +3393,7 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
struct btrfs_device *scrub_dev,
u64 chunk_offset, u64 length,
u64 dev_offset,
- struct btrfs_block_group_cache *cache,
- int is_dev_replace)
+ struct btrfs_block_group_cache *cache)
{
struct btrfs_fs_info *fs_info = sctx->fs_info;
struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
@@ -3435,8 +3430,7 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
map->stripes[i].physical == dev_offset) {
ret = scrub_stripe(sctx, map, scrub_dev, i,
- chunk_offset, length,
- is_dev_replace);
+ chunk_offset, length);
if (ret)
goto out;
}
@@ -3449,8 +3443,7 @@ out:
static noinline_for_stack
int scrub_enumerate_chunks(struct scrub_ctx *sctx,
- struct btrfs_device *scrub_dev, u64 start, u64 end,
- int is_dev_replace)
+ struct btrfs_device *scrub_dev, u64 start, u64 end)
{
struct btrfs_dev_extent *dev_extent = NULL;
struct btrfs_path *path;
@@ -3544,7 +3537,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
*/
scrub_pause_on(fs_info);
ret = btrfs_inc_block_group_ro(cache);
- if (!ret && is_dev_replace) {
+ if (!ret && sctx->is_dev_replace) {
/*
* If we are doing a device replace wait for any tasks
* that started dellaloc right before we set the block
@@ -3609,7 +3602,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
dev_replace->item_needs_writeback = 1;
btrfs_dev_replace_write_unlock(&fs_info->dev_replace);
ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length,
- found_key.offset, cache, is_dev_replace);
+ found_key.offset, cache);
/*
* flush, submit all pending read and write bios, afterwards
@@ -3670,7 +3663,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
btrfs_put_block_group(cache);
if (ret)
break;
- if (is_dev_replace &&
+ if (sctx->is_dev_replace &&
atomic64_read(&dev_replace->num_write_errors) > 0) {
ret = -EIO;
break;
@@ -3893,8 +3886,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
}
if (!ret)
- ret = scrub_enumerate_chunks(sctx, dev, start, end,
- is_dev_replace);
+ ret = scrub_enumerate_chunks(sctx, dev, start, end);
wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
atomic_dec(&fs_info->scrubs_running);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index ba8950bfd9c7..094cc1444a90 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1186,9 +1186,9 @@ static int __clone_root_cmp_bsearch(const void *key, const void *elt)
u64 root = (u64)(uintptr_t)key;
struct clone_root *cr = (struct clone_root *)elt;
- if (root < cr->root->objectid)
+ if (root < cr->root->root_key.objectid)
return -1;
- if (root > cr->root->objectid)
+ if (root > cr->root->root_key.objectid)
return 1;
return 0;
}
@@ -1198,9 +1198,9 @@ static int __clone_root_cmp_sort(const void *e1, const void *e2)
struct clone_root *cr1 = (struct clone_root *)e1;
struct clone_root *cr2 = (struct clone_root *)e2;
- if (cr1->root->objectid < cr2->root->objectid)
+ if (cr1->root->root_key.objectid < cr2->root->root_key.objectid)
return -1;
- if (cr1->root->objectid > cr2->root->objectid)
+ if (cr1->root->root_key.objectid > cr2->root->root_key.objectid)
return 1;
return 0;
}
@@ -1693,12 +1693,8 @@ static int lookup_dir_item_inode(struct btrfs_root *root,
di = btrfs_lookup_dir_item(NULL, root, path,
dir, name, name_len, 0);
- if (!di) {
- ret = -ENOENT;
- goto out;
- }
- if (IS_ERR(di)) {
- ret = PTR_ERR(di);
+ if (IS_ERR_OR_NULL(di)) {
+ ret = di ? PTR_ERR(di) : -ENOENT;
goto out;
}
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
@@ -2346,7 +2342,7 @@ static int send_subvol_begin(struct send_ctx *sctx)
return -ENOMEM;
}
- key.objectid = send_root->objectid;
+ key.objectid = send_root->root_key.objectid;
key.type = BTRFS_ROOT_BACKREF_KEY;
key.offset = 0;
@@ -2362,7 +2358,7 @@ static int send_subvol_begin(struct send_ctx *sctx)
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
if (key.type != BTRFS_ROOT_BACKREF_KEY ||
- key.objectid != send_root->objectid) {
+ key.objectid != send_root->root_key.objectid) {
ret = -ENOENT;
goto out;
}
@@ -4907,8 +4903,8 @@ static int send_clone(struct send_ctx *sctx,
btrfs_debug(sctx->send_root->fs_info,
"send_clone offset=%llu, len=%d, clone_root=%llu, clone_inode=%llu, clone_offset=%llu",
- offset, len, clone_root->root->objectid, clone_root->ino,
- clone_root->offset);
+ offset, len, clone_root->root->root_key.objectid,
+ clone_root->ino, clone_root->offset);
p = fs_path_alloc();
if (!p)
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 6601c9aa5e35..b362b45dd757 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -2177,8 +2177,10 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]);
buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]);
/* Mask in the root object ID too, to disambiguate subvols */
- buf->f_fsid.val[0] ^= BTRFS_I(d_inode(dentry))->root->objectid >> 32;
- buf->f_fsid.val[1] ^= BTRFS_I(d_inode(dentry))->root->objectid;
+ buf->f_fsid.val[0] ^=
+ BTRFS_I(d_inode(dentry))->root->root_key.objectid >> 32;
+ buf->f_fsid.val[1] ^=
+ BTRFS_I(d_inode(dentry))->root->root_key.objectid;
return 0;
}
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index d9269a531a4d..9e0f4a01be14 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -106,7 +106,7 @@ static int test_find_delalloc(u32 sectorsize)
set_extent_delalloc(&tmp, 0, sectorsize - 1, 0, NULL);
start = 0;
end = 0;
- found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+ found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start,
&end, max_bytes);
if (!found) {
test_err("should have found at least one delalloc");
@@ -137,7 +137,7 @@ static int test_find_delalloc(u32 sectorsize)
set_extent_delalloc(&tmp, sectorsize, max_bytes - 1, 0, NULL);
start = test_start;
end = 0;
- found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+ found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start,
&end, max_bytes);
if (!found) {
test_err("couldn't find delalloc in our range");
@@ -171,7 +171,7 @@ static int test_find_delalloc(u32 sectorsize)
}
start = test_start;
end = 0;
- found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+ found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start,
&end, max_bytes);
if (found) {
test_err("found range when we shouldn't have");
@@ -192,7 +192,7 @@ static int test_find_delalloc(u32 sectorsize)
set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, 0, NULL);
start = test_start;
end = 0;
- found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+ found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start,
&end, max_bytes);
if (!found) {
test_err("didn't find our range");
@@ -233,7 +233,7 @@ static int test_find_delalloc(u32 sectorsize)
* this changes at any point in the future we will need to fix this
* tests expected behavior.
*/
- found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+ found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start,
&end, max_bytes);
if (!found) {
test_err("didn't find our range");
diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c
index 385a5316e4bf..bf15d3a7f20e 100644
--- a/fs/btrfs/tests/extent-map-tests.c
+++ b/fs/btrfs/tests/extent-map-tests.c
@@ -12,8 +12,8 @@ static void free_extent_map_tree(struct extent_map_tree *em_tree)
struct extent_map *em;
struct rb_node *node;
- while (!RB_EMPTY_ROOT(&em_tree->map)) {
- node = rb_first(&em_tree->map);
+ while (!RB_EMPTY_ROOT(&em_tree->map.rb_root)) {
+ node = rb_first_cached(&em_tree->map);
em = rb_entry(node, struct extent_map, rb_node);
remove_extent_mapping(em_tree, em);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 3b84f5015029..d1eeef9ec5da 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -44,7 +44,8 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
WARN_ON(refcount_read(&transaction->use_count) == 0);
if (refcount_dec_and_test(&transaction->use_count)) {
BUG_ON(!list_empty(&transaction->list));
- WARN_ON(!RB_EMPTY_ROOT(&transaction->delayed_refs.href_root));
+ WARN_ON(!RB_EMPTY_ROOT(
+ &transaction->delayed_refs.href_root.rb_root));
if (transaction->delayed_refs.pending_csums)
btrfs_err(transaction->fs_info,
"pending csums is %llu",
@@ -118,7 +119,7 @@ static noinline void switch_commit_roots(struct btrfs_transaction *trans)
list_del_init(&root->dirty_list);
free_extent_buffer(root->commit_root);
root->commit_root = btrfs_root_node(root);
- if (is_fstree(root->objectid))
+ if (is_fstree(root->root_key.objectid))
btrfs_unpin_free_ino(root);
clear_btree_io_tree(&root->dirty_log_pages);
}
@@ -245,7 +246,7 @@ loop:
memset(&cur_trans->delayed_refs, 0, sizeof(cur_trans->delayed_refs));
- cur_trans->delayed_refs.href_root = RB_ROOT;
+ cur_trans->delayed_refs.href_root = RB_ROOT_CACHED;
cur_trans->delayed_refs.dirty_extent_root = RB_ROOT;
atomic_set(&cur_trans->delayed_refs.num_entries, 0);
@@ -759,7 +760,7 @@ static int should_end_transaction(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- if (btrfs_check_space_for_delayed_refs(trans, fs_info))
+ if (btrfs_check_space_for_delayed_refs(trans))
return 1;
return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 5);
@@ -834,7 +835,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
trans->delayed_ref_updates = 0;
if (!trans->sync) {
must_run_delayed_refs =
- btrfs_should_throttle_delayed_refs(trans, info);
+ btrfs_should_throttle_delayed_refs(trans);
cur = max_t(unsigned long, cur, 32);
/*
@@ -1197,7 +1198,10 @@ again:
list_add_tail(&fs_info->extent_root->dirty_list,
&trans->transaction->switch_commits);
- btrfs_after_dev_replace_commit(fs_info);
+
+ /* Update dev-replace pointer once everything is committed */
+ fs_info->dev_replace.committed_cursor_left =
+ fs_info->dev_replace.cursor_left_last_write_of_item;
return 0;
}
@@ -1613,10 +1617,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
if (ret < 0)
goto fail;
- ret = btrfs_insert_dir_item(trans, parent_root,
- dentry->d_name.name, dentry->d_name.len,
- BTRFS_I(parent_inode), &key,
- BTRFS_FT_DIR, index);
+ ret = btrfs_insert_dir_item(trans, dentry->d_name.name,
+ dentry->d_name.len, BTRFS_I(parent_inode),
+ &key, BTRFS_FT_DIR, index);
/* We have check then name at the beginning, so it is impossible. */
BUG_ON(ret == -EEXIST || ret == -EOVERFLOW);
if (ret) {
@@ -1929,6 +1932,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
return ret;
}
+ btrfs_trans_release_metadata(trans);
+ trans->block_rsv = NULL;
+
/* make a pass through all the delayed refs we have so far
* any runnings procs may add more while we are here
*/
@@ -1938,9 +1944,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
return ret;
}
- btrfs_trans_release_metadata(trans);
- trans->block_rsv = NULL;
-
cur_trans = trans->transaction;
/*
@@ -2280,15 +2283,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
kmem_cache_free(btrfs_trans_handle_cachep, trans);
- /*
- * If fs has been frozen, we can not handle delayed iputs, otherwise
- * it'll result in deadlock about SB_FREEZE_FS.
- */
- if (current != fs_info->transaction_kthread &&
- current != fs_info->cleaner_kthread &&
- !test_bit(BTRFS_FS_FROZEN, &fs_info->flags))
- btrfs_run_delayed_iputs(fs_info);
-
return ret;
scrub_continue:
@@ -2330,7 +2324,7 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root)
list_del_init(&root->root_list);
spin_unlock(&fs_info->trans_lock);
- btrfs_debug(fs_info, "cleaner removing %llu", root->objectid);
+ btrfs_debug(fs_info, "cleaner removing %llu", root->root_key.objectid);
btrfs_kill_all_delayed_nodes(root);
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index db835635372f..cab0b1f1f741 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -487,6 +487,13 @@ static int check_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf,
u32 nritems = btrfs_header_nritems(leaf);
int slot;
+ if (btrfs_header_level(leaf) != 0) {
+ generic_err(fs_info, leaf, 0,
+ "invalid level for leaf, have %d expect 0",
+ btrfs_header_level(leaf));
+ return -EUCLEAN;
+ }
+
/*
* Extent buffers from a relocation tree have a owner field that
* corresponds to the subvolume tree they are based on. So just from an
@@ -645,9 +652,16 @@ int btrfs_check_node(struct btrfs_fs_info *fs_info, struct extent_buffer *node)
unsigned long nr = btrfs_header_nritems(node);
struct btrfs_key key, next_key;
int slot;
+ int level = btrfs_header_level(node);
u64 bytenr;
int ret = 0;
+ if (level <= 0 || level >= BTRFS_MAX_LEVEL) {
+ generic_err(fs_info, node, 0,
+ "invalid level for node, have %d expect [1, %d]",
+ level, BTRFS_MAX_LEVEL - 1);
+ return -EUCLEAN;
+ }
if (nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(fs_info)) {
btrfs_crit(fs_info,
"corrupt node: root=%llu block=%llu, nritems too %s, have %lu expect range [1,%u]",
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 3c2ae0e4f25a..e07f3376b7df 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -205,14 +205,11 @@ static int join_running_log_trans(struct btrfs_root *root)
* until you call btrfs_end_log_trans() or it makes any future
* log transactions wait until you call btrfs_end_log_trans()
*/
-int btrfs_pin_log_trans(struct btrfs_root *root)
+void btrfs_pin_log_trans(struct btrfs_root *root)
{
- int ret = -ENOENT;
-
mutex_lock(&root->log_mutex);
atomic_inc(&root->log_writers);
mutex_unlock(&root->log_mutex);
- return ret;
}
/*
@@ -258,6 +255,13 @@ struct walk_control {
/* what stage of the replay code we're currently in */
int stage;
+ /*
+ * Ignore any items from the inode currently being processed. Needs
+ * to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in
+ * the LOG_WALK_REPLAY_INODES stage.
+ */
+ bool ignore_cur_inode;
+
/* the root we are currently replaying */
struct btrfs_root *replay_dest;
@@ -2487,6 +2491,20 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
inode_item = btrfs_item_ptr(eb, i,
struct btrfs_inode_item);
+ /*
+ * If we have a tmpfile (O_TMPFILE) that got fsync'ed
+ * and never got linked before the fsync, skip it, as
+ * replaying it is pointless since it would be deleted
+ * later. We skip logging tmpfiles, but it's always
+ * possible we are replaying a log created with a kernel
+ * that used to log tmpfiles.
+ */
+ if (btrfs_inode_nlink(eb, inode_item) == 0) {
+ wc->ignore_cur_inode = true;
+ continue;
+ } else {
+ wc->ignore_cur_inode = false;
+ }
ret = replay_xattr_deletes(wc->trans, root, log,
path, key.objectid);
if (ret)
@@ -2524,16 +2542,8 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
root->fs_info->sectorsize);
ret = btrfs_drop_extents(wc->trans, root, inode,
from, (u64)-1, 1);
- /*
- * If the nlink count is zero here, the iput
- * will free the inode. We bump it to make
- * sure it doesn't get freed until the link
- * count fixup is done.
- */
if (!ret) {
- if (inode->i_nlink == 0)
- inc_nlink(inode);
- /* Update link count and nbytes. */
+ /* Update the inode's nbytes. */
ret = btrfs_update_inode(wc->trans,
root, inode);
}
@@ -2548,6 +2558,9 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
break;
}
+ if (wc->ignore_cur_inode)
+ continue;
+
if (key.type == BTRFS_DIR_INDEX_KEY &&
wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
ret = replay_one_dir_item(wc->trans, root, path,
@@ -3196,9 +3209,12 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
};
ret = walk_log_tree(trans, log, &wc);
- /* I don't think this can happen but just in case */
- if (ret)
- btrfs_abort_transaction(trans, ret);
+ if (ret) {
+ if (trans)
+ btrfs_abort_transaction(trans, ret);
+ else
+ btrfs_handle_fs_error(log->fs_info, ret, NULL);
+ }
while (1) {
ret = find_first_extent_bit(&log->dirty_log_pages,
@@ -4374,7 +4390,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
INIT_LIST_HEAD(&extents);
- down_write(&inode->dio_sem);
write_lock(&tree->lock);
test_gen = root->fs_info->last_trans_committed;
logged_start = start;
@@ -4440,7 +4455,6 @@ process:
}
WARN_ON(!list_empty(&extents));
write_unlock(&tree->lock);
- up_write(&inode->dio_sem);
btrfs_release_path(path);
if (!ret)
@@ -4636,7 +4650,8 @@ static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans,
ASSERT(len == i_size ||
(len == fs_info->sectorsize &&
btrfs_file_extent_compression(leaf, extent) !=
- BTRFS_COMPRESS_NONE));
+ BTRFS_COMPRESS_NONE) ||
+ (len < i_size && i_size < fs_info->sectorsize));
return 0;
}
@@ -5564,9 +5579,33 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
dir_inode = btrfs_iget(fs_info->sb, &inode_key,
root, NULL);
- /* If parent inode was deleted, skip it. */
- if (IS_ERR(dir_inode))
- continue;
+ /*
+ * If the parent inode was deleted, return an error to
+ * fallback to a transaction commit. This is to prevent
+ * getting an inode that was moved from one parent A to
+ * a parent B, got its former parent A deleted and then
+ * it got fsync'ed, from existing at both parents after
+ * a log replay (and the old parent still existing).
+ * Example:
+ *
+ * mkdir /mnt/A
+ * mkdir /mnt/B
+ * touch /mnt/B/bar
+ * sync
+ * mv /mnt/B/bar /mnt/A/bar
+ * mv -T /mnt/A /mnt/B
+ * fsync /mnt/B/bar
+ * <power fail>
+ *
+ * If we ignore the old parent B which got deleted,
+ * after a log replay we would have file bar linked
+ * at both parents and the old parent B would still
+ * exist.
+ */
+ if (IS_ERR(dir_inode)) {
+ ret = PTR_ERR(dir_inode);
+ goto out;
+ }
if (ctx)
ctx->log_new_dentries = false;
@@ -5640,7 +5679,13 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
if (ret)
goto end_no_trans;
- if (btrfs_inode_in_log(inode, trans->transid)) {
+ /*
+ * Skip already logged inodes or inodes corresponding to tmpfiles
+ * (since logging them is pointless, a link count of 0 means they
+ * will never be accessible).
+ */
+ if (btrfs_inode_in_log(inode, trans->transid) ||
+ inode->vfs_inode.i_nlink == 0) {
ret = BTRFS_NO_LOG_SYNC;
goto end_no_trans;
}
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 7ab9bb88a639..767765031e59 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -65,7 +65,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
const char *name, int name_len,
struct btrfs_inode *inode, u64 dirid);
void btrfs_end_log_trans(struct btrfs_root *root);
-int btrfs_pin_log_trans(struct btrfs_root *root);
+void btrfs_pin_log_trans(struct btrfs_root *root);
void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
struct btrfs_inode *dir, struct btrfs_inode *inode,
int for_rename);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f4405e430da6..f435d397019e 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1613,7 +1613,7 @@ static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
em_tree = &fs_info->mapping_tree.map_tree;
read_lock(&em_tree->lock);
- n = rb_last(&em_tree->map);
+ n = rb_last(&em_tree->map.rb_root);
if (n) {
em = rb_entry(n, struct extent_map, rb_node);
ret = em->start + em->len;
@@ -1854,6 +1854,24 @@ void btrfs_assign_next_active_device(struct btrfs_device *device,
fs_info->fs_devices->latest_bdev = next_device->bdev;
}
+/*
+ * Return btrfs_fs_devices::num_devices excluding the device that's being
+ * currently replaced.
+ */
+static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info)
+{
+ u64 num_devices = fs_info->fs_devices->num_devices;
+
+ btrfs_dev_replace_read_lock(&fs_info->dev_replace);
+ if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
+ ASSERT(num_devices > 1);
+ num_devices--;
+ }
+ btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
+
+ return num_devices;
+}
+
int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
u64 devid)
{
@@ -1865,22 +1883,22 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
mutex_lock(&uuid_mutex);
- num_devices = fs_devices->num_devices;
- btrfs_dev_replace_read_lock(&fs_info->dev_replace);
- if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
- WARN_ON(num_devices < 1);
- num_devices--;
- }
- btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
+ num_devices = btrfs_num_devices(fs_info);
ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
if (ret)
goto out;
- ret = btrfs_find_device_by_devspec(fs_info, devid, device_path,
- &device);
- if (ret)
+ device = btrfs_find_device_by_devspec(fs_info, devid, device_path);
+
+ if (IS_ERR(device)) {
+ if (PTR_ERR(device) == -ENOENT &&
+ strcmp(device_path, "missing") == 0)
+ ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
+ else
+ ret = PTR_ERR(device);
goto out;
+ }
if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
ret = BTRFS_ERROR_DEV_TGT_REPLACE;
@@ -2096,9 +2114,8 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
call_rcu(&tgtdev->rcu, free_device_rcu);
}
-static int btrfs_find_device_by_path(struct btrfs_fs_info *fs_info,
- const char *device_path,
- struct btrfs_device **device)
+static struct btrfs_device *btrfs_find_device_by_path(
+ struct btrfs_fs_info *fs_info, const char *device_path)
{
int ret = 0;
struct btrfs_super_block *disk_super;
@@ -2106,28 +2123,27 @@ static int btrfs_find_device_by_path(struct btrfs_fs_info *fs_info,
u8 *dev_uuid;
struct block_device *bdev;
struct buffer_head *bh;
+ struct btrfs_device *device;
- *device = NULL;
ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
fs_info->bdev_holder, 0, &bdev, &bh);
if (ret)
- return ret;
+ return ERR_PTR(ret);
disk_super = (struct btrfs_super_block *)bh->b_data;
devid = btrfs_stack_device_id(&disk_super->dev_item);
dev_uuid = disk_super->dev_item.uuid;
- *device = btrfs_find_device(fs_info, devid, dev_uuid, disk_super->fsid);
+ device = btrfs_find_device(fs_info, devid, dev_uuid, disk_super->fsid);
brelse(bh);
- if (!*device)
- ret = -ENOENT;
+ if (!device)
+ device = ERR_PTR(-ENOENT);
blkdev_put(bdev, FMODE_READ);
- return ret;
+ return device;
}
-int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info,
- const char *device_path,
- struct btrfs_device **device)
+static struct btrfs_device *btrfs_find_device_missing_or_by_path(
+ struct btrfs_fs_info *fs_info, const char *device_path)
{
- *device = NULL;
+ struct btrfs_device *device = NULL;
if (strcmp(device_path, "missing") == 0) {
struct list_head *devices;
struct btrfs_device *tmp;
@@ -2136,42 +2152,38 @@ int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info,
list_for_each_entry(tmp, devices, dev_list) {
if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
&tmp->dev_state) && !tmp->bdev) {
- *device = tmp;
+ device = tmp;
break;
}
}
- if (!*device)
- return BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
-
- return 0;
+ if (!device)
+ return ERR_PTR(-ENOENT);
} else {
- return btrfs_find_device_by_path(fs_info, device_path, device);
+ device = btrfs_find_device_by_path(fs_info, device_path);
}
+
+ return device;
}
/*
* Lookup a device given by device id, or the path if the id is 0.
*/
-int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid,
- const char *devpath,
- struct btrfs_device **device)
+struct btrfs_device *btrfs_find_device_by_devspec(
+ struct btrfs_fs_info *fs_info, u64 devid, const char *devpath)
{
- int ret;
+ struct btrfs_device *device;
if (devid) {
- ret = 0;
- *device = btrfs_find_device(fs_info, devid, NULL, NULL);
- if (!*device)
- ret = -ENOENT;
+ device = btrfs_find_device(fs_info, devid, NULL, NULL);
+ if (!device)
+ return ERR_PTR(-ENOENT);
} else {
if (!devpath || !devpath[0])
- return -EINVAL;
-
- ret = btrfs_find_device_missing_or_by_path(fs_info, devpath,
- device);
+ return ERR_PTR(-EINVAL);
+ device = btrfs_find_device_missing_or_by_path(fs_info, devpath);
}
- return ret;
+ return device;
}
/*
@@ -3679,7 +3691,7 @@ static int alloc_profile_is_valid(u64 flags, int extended)
return !extended; /* "0" is valid for usual profiles */
/* true if exactly one bit set */
- return (flags & (flags - 1)) == 0;
+ return is_power_of_2(flags);
}
static inline int balance_need_close(struct btrfs_fs_info *fs_info)
@@ -3740,13 +3752,8 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
}
}
- num_devices = fs_info->fs_devices->num_devices;
- btrfs_dev_replace_read_lock(&fs_info->dev_replace);
- if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
- BUG_ON(num_devices < 1);
- num_devices--;
- }
- btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
+ num_devices = btrfs_num_devices(fs_info);
+
allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE | BTRFS_BLOCK_GROUP_DUP;
if (num_devices > 1)
allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
@@ -5897,7 +5904,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
}
out:
if (dev_replace_is_ongoing) {
- btrfs_dev_replace_clear_lock_blocking(dev_replace);
+ ASSERT(atomic_read(&dev_replace->blocking_readers) > 0);
+ btrfs_dev_replace_read_lock(dev_replace);
+ /* Barrier implied by atomic_dec_and_test */
+ if (atomic_dec_and_test(&dev_replace->blocking_readers))
+ cond_wake_up_nomb(&dev_replace->read_lock_wq);
btrfs_dev_replace_read_unlock(dev_replace);
}
free_extent_map(em);
@@ -7438,7 +7449,7 @@ static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
int ret = 0;
read_lock(&em_tree->lock);
- for (node = rb_first(&em_tree->map); node; node = rb_next(node)) {
+ for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
em = rb_entry(node, struct extent_map, rb_node);
if (em->map_lookup->num_stripes !=
em->map_lookup->verified_stripes) {
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 23e9285d88de..aefce895e994 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -410,12 +410,9 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step);
void btrfs_assign_next_active_device(struct btrfs_device *device,
struct btrfs_device *this_dev);
-int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info,
- const char *device_path,
- struct btrfs_device **device);
-int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid,
- const char *devpath,
- struct btrfs_device **device);
+struct btrfs_device *btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info,
+ u64 devid,
+ const char *devpath);
struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
const u64 *devid,
const u8 *uuid);
diff --git a/fs/buffer.c b/fs/buffer.c
index 6f1ae3ac9789..d60d61e8ed7d 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -562,7 +562,7 @@ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
EXPORT_SYMBOL(mark_buffer_dirty_inode);
/*
- * Mark the page dirty, and set it dirty in the radix tree, and mark the inode
+ * Mark the page dirty, and set it dirty in the page cache, and mark the inode
* dirty.
*
* If warn is true, then emit a warning if the page is not uptodate and has
@@ -579,8 +579,8 @@ void __set_page_dirty(struct page *page, struct address_space *mapping,
if (page->mapping) { /* Race with truncate? */
WARN_ON_ONCE(warn && !PageUptodate(page));
account_page_dirtied(page, mapping);
- radix_tree_tag_set(&mapping->i_pages,
- page_index(page), PAGECACHE_TAG_DIRTY);
+ __xa_set_mark(&mapping->i_pages, page_index(page),
+ PAGECACHE_TAG_DIRTY);
}
xa_unlock_irqrestore(&mapping->i_pages, flags);
}
@@ -1050,7 +1050,7 @@ __getblk_slow(struct block_device *bdev, sector_t block,
* The relationship between dirty buffers and dirty pages:
*
* Whenever a page has any dirty buffers, the page's dirty bit is set, and
- * the page is tagged dirty in its radix tree.
+ * the page is tagged dirty in the page cache.
*
* At all times, the dirtiness of the buffers represents the dirtiness of
* subsections of the page. If the page has buffers, the page dirty bit is
@@ -1073,9 +1073,9 @@ __getblk_slow(struct block_device *bdev, sector_t block,
* mark_buffer_dirty - mark a buffer_head as needing writeout
* @bh: the buffer_head to mark dirty
*
- * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
- * backing page dirty, then tag the page as dirty in its address_space's radix
- * tree and then attach the address_space's inode to its superblock's dirty
+ * mark_buffer_dirty() will set the dirty bit against the buffer, then set
+ * its backing page dirty, then tag the page as dirty in the page cache
+ * and then attach the address_space's inode to its superblock's dirty
* inode list.
*
* mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock,
@@ -3060,11 +3060,6 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
*/
bio = bio_alloc(GFP_NOIO, 1);
- if (wbc) {
- wbc_init_bio(wbc, bio);
- wbc_account_io(wbc, bh->b_page, bh->b_size);
- }
-
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio_set_dev(bio, bh->b_bdev);
bio->bi_write_hint = write_hint;
@@ -3084,6 +3079,11 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
op_flags |= REQ_PRIO;
bio_set_op_attrs(bio, op, op_flags);
+ if (wbc) {
+ wbc_init_bio(wbc, bio);
+ wbc_account_io(wbc, bh->b_page, bh->b_size);
+ }
+
submit_bio(bio);
return 0;
}
diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c
index 027408d55aee..5f0103f40079 100644
--- a/fs/ceph/acl.c
+++ b/fs/ceph/acl.c
@@ -104,6 +104,11 @@ int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type)
struct timespec64 old_ctime = inode->i_ctime;
umode_t new_mode = inode->i_mode, old_mode = inode->i_mode;
+ if (ceph_snap(inode) != CEPH_NOSNAP) {
+ ret = -EROFS;
+ goto out;
+ }
+
switch (type) {
case ACL_TYPE_ACCESS:
name = XATTR_NAME_POSIX_ACL_ACCESS;
@@ -138,11 +143,6 @@ int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type)
goto out_free;
}
- if (ceph_snap(inode) != CEPH_NOSNAP) {
- ret = -EROFS;
- goto out_free;
- }
-
if (new_mode != old_mode) {
newattrs.ia_ctime = current_time(inode);
newattrs.ia_mode = new_mode;
@@ -206,10 +206,9 @@ int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
tmp_buf = kmalloc(max(val_size1, val_size2), GFP_KERNEL);
if (!tmp_buf)
goto out_err;
- pagelist = kmalloc(sizeof(struct ceph_pagelist), GFP_KERNEL);
+ pagelist = ceph_pagelist_alloc(GFP_KERNEL);
if (!pagelist)
goto out_err;
- ceph_pagelist_init(pagelist);
err = ceph_pagelist_reserve(pagelist, PAGE_SIZE);
if (err)
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 9c332a6f6667..8eade7a993c1 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -322,7 +322,7 @@ static int start_read(struct inode *inode, struct ceph_rw_context *rw_ctx,
/* caller of readpages does not hold buffer and read caps
* (fadvise, madvise and readahead cases) */
int want = CEPH_CAP_FILE_CACHE;
- ret = ceph_try_get_caps(ci, CEPH_CAP_FILE_RD, want, &got);
+ ret = ceph_try_get_caps(ci, CEPH_CAP_FILE_RD, want, true, &got);
if (ret < 0) {
dout("start_read %p, error getting cap\n", inode);
} else if (!(got & want)) {
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index dd7dfdd2ba13..f3496db4bb3e 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -519,9 +519,9 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
* -> we take mdsc->cap_delay_lock
*/
static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
- struct ceph_inode_info *ci)
+ struct ceph_inode_info *ci,
+ bool set_timeout)
{
- __cap_set_timeouts(mdsc, ci);
dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode,
ci->i_ceph_flags, ci->i_hold_caps_max);
if (!mdsc->stopping) {
@@ -531,6 +531,8 @@ static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
goto no_change;
list_del_init(&ci->i_cap_delay_list);
}
+ if (set_timeout)
+ __cap_set_timeouts(mdsc, ci);
list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
no_change:
spin_unlock(&mdsc->cap_delay_lock);
@@ -720,7 +722,7 @@ void ceph_add_cap(struct inode *inode,
dout(" issued %s, mds wanted %s, actual %s, queueing\n",
ceph_cap_string(issued), ceph_cap_string(wanted),
ceph_cap_string(actual_wanted));
- __cap_delay_requeue(mdsc, ci);
+ __cap_delay_requeue(mdsc, ci, true);
}
if (flags & CEPH_CAP_FLAG_AUTH) {
@@ -1647,7 +1649,7 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
(mask & CEPH_CAP_FILE_BUFFER))
dirty |= I_DIRTY_DATASYNC;
- __cap_delay_requeue(mdsc, ci);
+ __cap_delay_requeue(mdsc, ci, true);
return dirty;
}
@@ -2065,7 +2067,7 @@ ack:
/* Reschedule delayed caps release if we delayed anything */
if (delayed)
- __cap_delay_requeue(mdsc, ci);
+ __cap_delay_requeue(mdsc, ci, false);
spin_unlock(&ci->i_ceph_lock);
@@ -2125,7 +2127,7 @@ retry:
if (delayed) {
spin_lock(&ci->i_ceph_lock);
- __cap_delay_requeue(mdsc, ci);
+ __cap_delay_requeue(mdsc, ci, true);
spin_unlock(&ci->i_ceph_lock);
}
} else {
@@ -2671,17 +2673,18 @@ static void check_max_size(struct inode *inode, loff_t endoff)
ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
}
-int ceph_try_get_caps(struct ceph_inode_info *ci, int need, int want, int *got)
+int ceph_try_get_caps(struct ceph_inode_info *ci, int need, int want,
+ bool nonblock, int *got)
{
int ret, err = 0;
BUG_ON(need & ~CEPH_CAP_FILE_RD);
- BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO));
+ BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO|CEPH_CAP_FILE_SHARED));
ret = ceph_pool_perm_check(ci, need);
if (ret < 0)
return ret;
- ret = try_get_cap_refs(ci, need, want, 0, true, got, &err);
+ ret = try_get_cap_refs(ci, need, want, 0, nonblock, got, &err);
if (ret) {
if (err == -EAGAIN) {
ret = 0;
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 5dd433aa9b23..27cad84dab23 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/ceph/ceph_debug.h>
+#include <linux/ceph/striper.h>
#include <linux/module.h>
#include <linux/sched.h>
@@ -557,90 +558,26 @@ enum {
};
/*
- * Read a range of bytes striped over one or more objects. Iterate over
- * objects we stripe over. (That's not atomic, but good enough for now.)
- *
- * If we get a short result from the OSD, check against i_size; we need to
- * only return a short read to the caller if we hit EOF.
- */
-static int striped_read(struct inode *inode,
- u64 pos, u64 len,
- struct page **pages, int num_pages,
- int page_align, int *checkeof)
-{
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
- struct ceph_inode_info *ci = ceph_inode(inode);
- u64 this_len;
- loff_t i_size;
- int page_idx;
- int ret, read = 0;
- bool hit_stripe, was_short;
-
- /*
- * we may need to do multiple reads. not atomic, unfortunately.
- */
-more:
- this_len = len;
- page_idx = (page_align + read) >> PAGE_SHIFT;
- ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
- &ci->i_layout, pos, &this_len,
- ci->i_truncate_seq, ci->i_truncate_size,
- pages + page_idx, num_pages - page_idx,
- ((page_align + read) & ~PAGE_MASK));
- if (ret == -ENOENT)
- ret = 0;
- hit_stripe = this_len < len;
- was_short = ret >= 0 && ret < this_len;
- dout("striped_read %llu~%llu (read %u) got %d%s%s\n", pos, len, read,
- ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : "");
-
- i_size = i_size_read(inode);
- if (ret >= 0) {
- if (was_short && (pos + ret < i_size)) {
- int zlen = min(this_len - ret, i_size - pos - ret);
- int zoff = page_align + read + ret;
- dout(" zero gap %llu to %llu\n",
- pos + ret, pos + ret + zlen);
- ceph_zero_page_vector_range(zoff, zlen, pages);
- ret += zlen;
- }
-
- read += ret;
- pos += ret;
- len -= ret;
-
- /* hit stripe and need continue*/
- if (len && hit_stripe && pos < i_size)
- goto more;
- }
-
- if (read > 0) {
- ret = read;
- /* did we bounce off eof? */
- if (pos + len > i_size)
- *checkeof = CHECK_EOF;
- }
-
- dout("striped_read returns %d\n", ret);
- return ret;
-}
-
-/*
* Completely synchronous read and write methods. Direct from __user
* buffer to osd, or directly to user pages (if O_DIRECT).
*
- * If the read spans object boundary, just do multiple reads.
+ * If the read spans object boundary, just do multiple reads. (That's not
+ * atomic, but good enough for now.)
+ *
+ * If we get a short result from the OSD, check against i_size; we need to
+ * only return a short read to the caller if we hit EOF.
*/
static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
- int *checkeof)
+ int *retry_op)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
- struct page **pages;
- u64 off = iocb->ki_pos;
- int num_pages;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_osd_client *osdc = &fsc->client->osdc;
ssize_t ret;
- size_t len = iov_iter_count(to);
+ u64 off = iocb->ki_pos;
+ u64 len = iov_iter_count(to);
dout("sync_read on file %p %llu~%u %s\n", file, off, (unsigned)len,
(file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
@@ -653,61 +590,118 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
* but it will at least behave sensibly when they are
* in sequence.
*/
- ret = filemap_write_and_wait_range(inode->i_mapping, off,
- off + len);
+ ret = filemap_write_and_wait_range(inode->i_mapping, off, off + len);
if (ret < 0)
return ret;
- if (unlikely(iov_iter_is_pipe(to))) {
+ ret = 0;
+ while ((len = iov_iter_count(to)) > 0) {
+ struct ceph_osd_request *req;
+ struct page **pages;
+ int num_pages;
size_t page_off;
- ret = iov_iter_get_pages_alloc(to, &pages, len,
- &page_off);
- if (ret <= 0)
- return -ENOMEM;
- num_pages = DIV_ROUND_UP(ret + page_off, PAGE_SIZE);
+ u64 i_size;
+ bool more;
+
+ req = ceph_osdc_new_request(osdc, &ci->i_layout,
+ ci->i_vino, off, &len, 0, 1,
+ CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
+ NULL, ci->i_truncate_seq,
+ ci->i_truncate_size, false);
+ if (IS_ERR(req)) {
+ ret = PTR_ERR(req);
+ break;
+ }
+
+ more = len < iov_iter_count(to);
- ret = striped_read(inode, off, ret, pages, num_pages,
- page_off, checkeof);
- if (ret > 0) {
- iov_iter_advance(to, ret);
- off += ret;
+ if (unlikely(iov_iter_is_pipe(to))) {
+ ret = iov_iter_get_pages_alloc(to, &pages, len,
+ &page_off);
+ if (ret <= 0) {
+ ceph_osdc_put_request(req);
+ ret = -ENOMEM;
+ break;
+ }
+ num_pages = DIV_ROUND_UP(ret + page_off, PAGE_SIZE);
+ if (ret < len) {
+ len = ret;
+ osd_req_op_extent_update(req, 0, len);
+ more = false;
+ }
} else {
- iov_iter_advance(to, 0);
+ num_pages = calc_pages_for(off, len);
+ page_off = off & ~PAGE_MASK;
+ pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
+ if (IS_ERR(pages)) {
+ ceph_osdc_put_request(req);
+ ret = PTR_ERR(pages);
+ break;
+ }
}
- ceph_put_page_vector(pages, num_pages, false);
- } else {
- num_pages = calc_pages_for(off, len);
- pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
- if (IS_ERR(pages))
- return PTR_ERR(pages);
-
- ret = striped_read(inode, off, len, pages, num_pages,
- (off & ~PAGE_MASK), checkeof);
- if (ret > 0) {
- int l, k = 0;
- size_t left = ret;
-
- while (left) {
- size_t page_off = off & ~PAGE_MASK;
- size_t copy = min_t(size_t, left,
- PAGE_SIZE - page_off);
- l = copy_page_to_iter(pages[k++], page_off,
- copy, to);
- off += l;
- left -= l;
- if (l < copy)
+
+ osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_off,
+ false, false);
+ ret = ceph_osdc_start_request(osdc, req, false);
+ if (!ret)
+ ret = ceph_osdc_wait_request(osdc, req);
+ ceph_osdc_put_request(req);
+
+ i_size = i_size_read(inode);
+ dout("sync_read %llu~%llu got %zd i_size %llu%s\n",
+ off, len, ret, i_size, (more ? " MORE" : ""));
+
+ if (ret == -ENOENT)
+ ret = 0;
+ if (ret >= 0 && ret < len && (off + ret < i_size)) {
+ int zlen = min(len - ret, i_size - off - ret);
+ int zoff = page_off + ret;
+ dout("sync_read zero gap %llu~%llu\n",
+ off + ret, off + ret + zlen);
+ ceph_zero_page_vector_range(zoff, zlen, pages);
+ ret += zlen;
+ }
+
+ if (unlikely(iov_iter_is_pipe(to))) {
+ if (ret > 0) {
+ iov_iter_advance(to, ret);
+ off += ret;
+ } else {
+ iov_iter_advance(to, 0);
+ }
+ ceph_put_page_vector(pages, num_pages, false);
+ } else {
+ int idx = 0;
+ size_t left = ret > 0 ? ret : 0;
+ while (left > 0) {
+ size_t len, copied;
+ page_off = off & ~PAGE_MASK;
+ len = min_t(size_t, left, PAGE_SIZE - page_off);
+ copied = copy_page_to_iter(pages[idx++],
+ page_off, len, to);
+ off += copied;
+ left -= copied;
+ if (copied < len) {
+ ret = -EFAULT;
break;
+ }
}
+ ceph_release_page_vector(pages, num_pages);
}
- ceph_release_page_vector(pages, num_pages);
+
+ if (ret <= 0 || off >= i_size || !more)
+ break;
}
if (off > iocb->ki_pos) {
+ if (ret >= 0 &&
+ iov_iter_count(to) > 0 && off >= i_size_read(inode))
+ *retry_op = CHECK_EOF;
ret = off - iocb->ki_pos;
iocb->ki_pos = off;
}
- dout("sync_read result %zd\n", ret);
+ dout("sync_read result %zd retry_op %d\n", ret, *retry_op);
return ret;
}
@@ -865,7 +859,7 @@ static void ceph_aio_retry_work(struct work_struct *work)
}
spin_unlock(&ci->i_ceph_lock);
- req = ceph_osdc_alloc_request(orig_req->r_osdc, snapc, 2,
+ req = ceph_osdc_alloc_request(orig_req->r_osdc, snapc, 1,
false, GFP_NOFS);
if (!req) {
ret = -ENOMEM;
@@ -877,6 +871,11 @@ static void ceph_aio_retry_work(struct work_struct *work)
ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc);
ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid);
+ req->r_ops[0] = orig_req->r_ops[0];
+
+ req->r_mtime = aio_req->mtime;
+ req->r_data_offset = req->r_ops[0].extent.offset;
+
ret = ceph_osdc_alloc_messages(req, GFP_NOFS);
if (ret) {
ceph_osdc_put_request(req);
@@ -884,11 +883,6 @@ static void ceph_aio_retry_work(struct work_struct *work)
goto out;
}
- req->r_ops[0] = orig_req->r_ops[0];
-
- req->r_mtime = aio_req->mtime;
- req->r_data_offset = req->r_ops[0].extent.offset;
-
ceph_osdc_put_request(orig_req);
req->r_callback = ceph_aio_complete_req;
@@ -1734,7 +1728,6 @@ static long ceph_fallocate(struct file *file, int mode,
struct ceph_file_info *fi = file->private_data;
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_cap_flush *prealloc_cf;
int want, got = 0;
int dirty;
@@ -1742,10 +1735,7 @@ static long ceph_fallocate(struct file *file, int mode,
loff_t endoff = 0;
loff_t size;
- if ((offset + length) > max(i_size_read(inode), fsc->max_file_size))
- return -EFBIG;
-
- if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+ if (mode != (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
return -EOPNOTSUPP;
if (!S_ISREG(inode->i_mode))
@@ -1762,18 +1752,6 @@ static long ceph_fallocate(struct file *file, int mode,
goto unlock;
}
- if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) &&
- ceph_quota_is_max_bytes_exceeded(inode, offset + length)) {
- ret = -EDQUOT;
- goto unlock;
- }
-
- if (ceph_osdmap_flag(&fsc->client->osdc, CEPH_OSDMAP_FULL) &&
- !(mode & FALLOC_FL_PUNCH_HOLE)) {
- ret = -ENOSPC;
- goto unlock;
- }
-
if (ci->i_inline_version != CEPH_INLINE_NONE) {
ret = ceph_uninline_data(file, NULL);
if (ret < 0)
@@ -1781,12 +1759,12 @@ static long ceph_fallocate(struct file *file, int mode,
}
size = i_size_read(inode);
- if (!(mode & FALLOC_FL_KEEP_SIZE)) {
- endoff = offset + length;
- ret = inode_newsize_ok(inode, endoff);
- if (ret)
- goto unlock;
- }
+
+ /* Are we punching a hole beyond EOF? */
+ if (offset >= size)
+ goto unlock;
+ if ((offset + length) > size)
+ length = size - offset;
if (fi->fmode & CEPH_FILE_MODE_LAZY)
want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
@@ -1797,16 +1775,8 @@ static long ceph_fallocate(struct file *file, int mode,
if (ret < 0)
goto unlock;
- if (mode & FALLOC_FL_PUNCH_HOLE) {
- if (offset < size)
- ceph_zero_pagecache_range(inode, offset, length);
- ret = ceph_zero_objects(inode, offset, length);
- } else if (endoff > size) {
- truncate_pagecache_range(inode, size, -1);
- if (ceph_inode_set_size(inode, endoff))
- ceph_check_caps(ceph_inode(inode),
- CHECK_CAPS_AUTHONLY, NULL);
- }
+ ceph_zero_pagecache_range(inode, offset, length);
+ ret = ceph_zero_objects(inode, offset, length);
if (!ret) {
spin_lock(&ci->i_ceph_lock);
@@ -1816,9 +1786,6 @@ static long ceph_fallocate(struct file *file, int mode,
spin_unlock(&ci->i_ceph_lock);
if (dirty)
__mark_inode_dirty(inode, dirty);
- if ((endoff > size) &&
- ceph_quota_is_max_bytes_approaching(inode, endoff))
- ceph_check_caps(ci, CHECK_CAPS_NODELAY, NULL);
}
ceph_put_cap_refs(ci, got);
@@ -1828,6 +1795,300 @@ unlock:
return ret;
}
+/*
+ * This function tries to get FILE_WR capabilities for dst_ci and FILE_RD for
+ * src_ci. Two attempts are made to obtain both caps, and an error is return if
+ * this fails; zero is returned on success.
+ */
+static int get_rd_wr_caps(struct ceph_inode_info *src_ci,
+ loff_t src_endoff, int *src_got,
+ struct ceph_inode_info *dst_ci,
+ loff_t dst_endoff, int *dst_got)
+{
+ int ret = 0;
+ bool retrying = false;
+
+retry_caps:
+ ret = ceph_get_caps(dst_ci, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER,
+ dst_endoff, dst_got, NULL);
+ if (ret < 0)
+ return ret;
+
+ /*
+ * Since we're already holding the FILE_WR capability for the dst file,
+ * we would risk a deadlock by using ceph_get_caps. Thus, we'll do some
+ * retry dance instead to try to get both capabilities.
+ */
+ ret = ceph_try_get_caps(src_ci, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_SHARED,
+ false, src_got);
+ if (ret <= 0) {
+ /* Start by dropping dst_ci caps and getting src_ci caps */
+ ceph_put_cap_refs(dst_ci, *dst_got);
+ if (retrying) {
+ if (!ret)
+ /* ceph_try_get_caps masks EAGAIN */
+ ret = -EAGAIN;
+ return ret;
+ }
+ ret = ceph_get_caps(src_ci, CEPH_CAP_FILE_RD,
+ CEPH_CAP_FILE_SHARED, src_endoff,
+ src_got, NULL);
+ if (ret < 0)
+ return ret;
+ /*... drop src_ci caps too, and retry */
+ ceph_put_cap_refs(src_ci, *src_got);
+ retrying = true;
+ goto retry_caps;
+ }
+ return ret;
+}
+
+static void put_rd_wr_caps(struct ceph_inode_info *src_ci, int src_got,
+ struct ceph_inode_info *dst_ci, int dst_got)
+{
+ ceph_put_cap_refs(src_ci, src_got);
+ ceph_put_cap_refs(dst_ci, dst_got);
+}
+
+/*
+ * This function does several size-related checks, returning an error if:
+ * - source file is smaller than off+len
+ * - destination file size is not OK (inode_newsize_ok())
+ * - max bytes quotas is exceeded
+ */
+static int is_file_size_ok(struct inode *src_inode, struct inode *dst_inode,
+ loff_t src_off, loff_t dst_off, size_t len)
+{
+ loff_t size, endoff;
+
+ size = i_size_read(src_inode);
+ /*
+ * Don't copy beyond source file EOF. Instead of simply setting length
+ * to (size - src_off), just drop to VFS default implementation, as the
+ * local i_size may be stale due to other clients writing to the source
+ * inode.
+ */
+ if (src_off + len > size) {
+ dout("Copy beyond EOF (%llu + %zu > %llu)\n",
+ src_off, len, size);
+ return -EOPNOTSUPP;
+ }
+ size = i_size_read(dst_inode);
+
+ endoff = dst_off + len;
+ if (inode_newsize_ok(dst_inode, endoff))
+ return -EOPNOTSUPP;
+
+ if (ceph_quota_is_max_bytes_exceeded(dst_inode, endoff))
+ return -EDQUOT;
+
+ return 0;
+}
+
+static ssize_t ceph_copy_file_range(struct file *src_file, loff_t src_off,
+ struct file *dst_file, loff_t dst_off,
+ size_t len, unsigned int flags)
+{
+ struct inode *src_inode = file_inode(src_file);
+ struct inode *dst_inode = file_inode(dst_file);
+ struct ceph_inode_info *src_ci = ceph_inode(src_inode);
+ struct ceph_inode_info *dst_ci = ceph_inode(dst_inode);
+ struct ceph_cap_flush *prealloc_cf;
+ struct ceph_object_locator src_oloc, dst_oloc;
+ struct ceph_object_id src_oid, dst_oid;
+ loff_t endoff = 0, size;
+ ssize_t ret = -EIO;
+ u64 src_objnum, dst_objnum, src_objoff, dst_objoff;
+ u32 src_objlen, dst_objlen, object_size;
+ int src_got = 0, dst_got = 0, err, dirty;
+ bool do_final_copy = false;
+
+ if (src_inode == dst_inode)
+ return -EINVAL;
+ if (ceph_snap(dst_inode) != CEPH_NOSNAP)
+ return -EROFS;
+
+ /*
+ * Some of the checks below will return -EOPNOTSUPP, which will force a
+ * fallback to the default VFS copy_file_range implementation. This is
+ * desirable in several cases (for ex, the 'len' is smaller than the
+ * size of the objects, or in cases where that would be more
+ * efficient).
+ */
+
+ if (ceph_test_mount_opt(ceph_inode_to_client(src_inode), NOCOPYFROM))
+ return -EOPNOTSUPP;
+
+ if ((src_ci->i_layout.stripe_unit != dst_ci->i_layout.stripe_unit) ||
+ (src_ci->i_layout.stripe_count != dst_ci->i_layout.stripe_count) ||
+ (src_ci->i_layout.object_size != dst_ci->i_layout.object_size))
+ return -EOPNOTSUPP;
+
+ if (len < src_ci->i_layout.object_size)
+ return -EOPNOTSUPP; /* no remote copy will be done */
+
+ prealloc_cf = ceph_alloc_cap_flush();
+ if (!prealloc_cf)
+ return -ENOMEM;
+
+ /* Start by sync'ing the source file */
+ ret = file_write_and_wait_range(src_file, src_off, (src_off + len));
+ if (ret < 0)
+ goto out;
+
+ /*
+ * We need FILE_WR caps for dst_ci and FILE_RD for src_ci as other
+ * clients may have dirty data in their caches. And OSDs know nothing
+ * about caps, so they can't safely do the remote object copies.
+ */
+ err = get_rd_wr_caps(src_ci, (src_off + len), &src_got,
+ dst_ci, (dst_off + len), &dst_got);
+ if (err < 0) {
+ dout("get_rd_wr_caps returned %d\n", err);
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ ret = is_file_size_ok(src_inode, dst_inode, src_off, dst_off, len);
+ if (ret < 0)
+ goto out_caps;
+
+ size = i_size_read(dst_inode);
+ endoff = dst_off + len;
+
+ /* Drop dst file cached pages */
+ ret = invalidate_inode_pages2_range(dst_inode->i_mapping,
+ dst_off >> PAGE_SHIFT,
+ endoff >> PAGE_SHIFT);
+ if (ret < 0) {
+ dout("Failed to invalidate inode pages (%zd)\n", ret);
+ ret = 0; /* XXX */
+ }
+ src_oloc.pool = src_ci->i_layout.pool_id;
+ src_oloc.pool_ns = ceph_try_get_string(src_ci->i_layout.pool_ns);
+ dst_oloc.pool = dst_ci->i_layout.pool_id;
+ dst_oloc.pool_ns = ceph_try_get_string(dst_ci->i_layout.pool_ns);
+
+ ceph_calc_file_object_mapping(&src_ci->i_layout, src_off,
+ src_ci->i_layout.object_size,
+ &src_objnum, &src_objoff, &src_objlen);
+ ceph_calc_file_object_mapping(&dst_ci->i_layout, dst_off,
+ dst_ci->i_layout.object_size,
+ &dst_objnum, &dst_objoff, &dst_objlen);
+ /* object-level offsets need to the same */
+ if (src_objoff != dst_objoff) {
+ ret = -EOPNOTSUPP;
+ goto out_caps;
+ }
+
+ /*
+ * Do a manual copy if the object offset isn't object aligned.
+ * 'src_objlen' contains the bytes left until the end of the object,
+ * starting at the src_off
+ */
+ if (src_objoff) {
+ /*
+ * we need to temporarily drop all caps as we'll be calling
+ * {read,write}_iter, which will get caps again.
+ */
+ put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got);
+ ret = do_splice_direct(src_file, &src_off, dst_file,
+ &dst_off, src_objlen, flags);
+ if (ret < 0) {
+ dout("do_splice_direct returned %d\n", err);
+ goto out;
+ }
+ len -= ret;
+ err = get_rd_wr_caps(src_ci, (src_off + len),
+ &src_got, dst_ci,
+ (dst_off + len), &dst_got);
+ if (err < 0)
+ goto out;
+ err = is_file_size_ok(src_inode, dst_inode,
+ src_off, dst_off, len);
+ if (err < 0)
+ goto out_caps;
+ }
+ object_size = src_ci->i_layout.object_size;
+ while (len >= object_size) {
+ ceph_calc_file_object_mapping(&src_ci->i_layout, src_off,
+ object_size, &src_objnum,
+ &src_objoff, &src_objlen);
+ ceph_calc_file_object_mapping(&dst_ci->i_layout, dst_off,
+ object_size, &dst_objnum,
+ &dst_objoff, &dst_objlen);
+ ceph_oid_init(&src_oid);
+ ceph_oid_printf(&src_oid, "%llx.%08llx",
+ src_ci->i_vino.ino, src_objnum);
+ ceph_oid_init(&dst_oid);
+ ceph_oid_printf(&dst_oid, "%llx.%08llx",
+ dst_ci->i_vino.ino, dst_objnum);
+ /* Do an object remote copy */
+ err = ceph_osdc_copy_from(
+ &ceph_inode_to_client(src_inode)->client->osdc,
+ src_ci->i_vino.snap, 0,
+ &src_oid, &src_oloc,
+ CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
+ CEPH_OSD_OP_FLAG_FADVISE_NOCACHE,
+ &dst_oid, &dst_oloc,
+ CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
+ CEPH_OSD_OP_FLAG_FADVISE_DONTNEED, 0);
+ if (err) {
+ dout("ceph_osdc_copy_from returned %d\n", err);
+ if (!ret)
+ ret = err;
+ goto out_caps;
+ }
+ len -= object_size;
+ src_off += object_size;
+ dst_off += object_size;
+ ret += object_size;
+ }
+
+ if (len)
+ /* We still need one final local copy */
+ do_final_copy = true;
+
+ file_update_time(dst_file);
+ if (endoff > size) {
+ int caps_flags = 0;
+
+ /* Let the MDS know about dst file size change */
+ if (ceph_quota_is_max_bytes_approaching(dst_inode, endoff))
+ caps_flags |= CHECK_CAPS_NODELAY;
+ if (ceph_inode_set_size(dst_inode, endoff))
+ caps_flags |= CHECK_CAPS_AUTHONLY;
+ if (caps_flags)
+ ceph_check_caps(dst_ci, caps_flags, NULL);
+ }
+ /* Mark Fw dirty */
+ spin_lock(&dst_ci->i_ceph_lock);
+ dst_ci->i_inline_version = CEPH_INLINE_NONE;
+ dirty = __ceph_mark_dirty_caps(dst_ci, CEPH_CAP_FILE_WR, &prealloc_cf);
+ spin_unlock(&dst_ci->i_ceph_lock);
+ if (dirty)
+ __mark_inode_dirty(dst_inode, dirty);
+
+out_caps:
+ put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got);
+
+ if (do_final_copy) {
+ err = do_splice_direct(src_file, &src_off, dst_file,
+ &dst_off, len, flags);
+ if (err < 0) {
+ dout("do_splice_direct returned %d\n", err);
+ goto out;
+ }
+ len -= err;
+ ret += err;
+ }
+
+out:
+ ceph_free_cap_flush(prealloc_cf);
+
+ return ret;
+}
+
const struct file_operations ceph_file_fops = {
.open = ceph_open,
.release = ceph_release,
@@ -1843,5 +2104,5 @@ const struct file_operations ceph_file_fops = {
.unlocked_ioctl = ceph_ioctl,
.compat_ioctl = ceph_ioctl,
.fallocate = ceph_fallocate,
+ .copy_file_range = ceph_copy_file_range,
};
-
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index ebc7bdaed2d0..79dd5e6ed755 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1132,8 +1132,12 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in)
if (IS_ERR(realdn)) {
pr_err("splice_dentry error %ld %p inode %p ino %llx.%llx\n",
PTR_ERR(realdn), dn, in, ceph_vinop(in));
- dput(dn);
- dn = realdn; /* note realdn contains the error */
+ dn = realdn;
+ /*
+ * Caller should release 'dn' in the case of error.
+ * If 'req->r_dentry' is passed to this function,
+ * caller should leave 'req->r_dentry' untouched.
+ */
goto out;
} else if (realdn) {
dout("dn %p (%d) spliced with %p (%d) "
@@ -1196,7 +1200,9 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
WARN_ON_ONCE(1);
}
- if (dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME) {
+ if (dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME &&
+ test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) &&
+ !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
struct qstr dname;
struct dentry *dn, *parent;
@@ -1677,7 +1683,6 @@ retry_lookup:
if (IS_ERR(realdn)) {
err = PTR_ERR(realdn);
d_drop(dn);
- dn = NULL;
goto next_item;
}
dn = realdn;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index bc43c822426a..67a9aeb2f4ec 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2071,7 +2071,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
if (req->r_old_dentry_drop)
len += req->r_old_dentry->d_name.len;
- msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS, false);
+ msg = ceph_msg_new2(CEPH_MSG_CLIENT_REQUEST, len, 1, GFP_NOFS, false);
if (!msg) {
msg = ERR_PTR(-ENOMEM);
goto out_free2;
@@ -2136,7 +2136,6 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
if (req->r_pagelist) {
struct ceph_pagelist *pagelist = req->r_pagelist;
- refcount_inc(&pagelist->refcnt);
ceph_msg_data_add_pagelist(msg, pagelist);
msg->hdr.data_len = cpu_to_le32(pagelist->length);
} else {
@@ -3126,12 +3125,11 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
pr_info("mds%d reconnect start\n", mds);
- pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
+ pagelist = ceph_pagelist_alloc(GFP_NOFS);
if (!pagelist)
goto fail_nopagelist;
- ceph_pagelist_init(pagelist);
- reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS, false);
+ reply = ceph_msg_new2(CEPH_MSG_CLIENT_RECONNECT, 0, 1, GFP_NOFS, false);
if (!reply)
goto fail_nomsg;
@@ -3241,6 +3239,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
mutex_unlock(&mdsc->mutex);
up_read(&mdsc->snap_rwsem);
+ ceph_pagelist_release(pagelist);
return;
fail:
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index eab1359d0553..b5ecd6f50360 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -165,6 +165,8 @@ enum {
Opt_noacl,
Opt_quotadf,
Opt_noquotadf,
+ Opt_copyfrom,
+ Opt_nocopyfrom,
};
static match_table_t fsopt_tokens = {
@@ -203,6 +205,8 @@ static match_table_t fsopt_tokens = {
{Opt_noacl, "noacl"},
{Opt_quotadf, "quotadf"},
{Opt_noquotadf, "noquotadf"},
+ {Opt_copyfrom, "copyfrom"},
+ {Opt_nocopyfrom, "nocopyfrom"},
{-1, NULL}
};
@@ -355,6 +359,12 @@ static int parse_fsopt_token(char *c, void *private)
case Opt_noquotadf:
fsopt->flags |= CEPH_MOUNT_OPT_NOQUOTADF;
break;
+ case Opt_copyfrom:
+ fsopt->flags &= ~CEPH_MOUNT_OPT_NOCOPYFROM;
+ break;
+ case Opt_nocopyfrom:
+ fsopt->flags |= CEPH_MOUNT_OPT_NOCOPYFROM;
+ break;
#ifdef CONFIG_CEPH_FS_POSIX_ACL
case Opt_acl:
fsopt->sb_flags |= SB_POSIXACL;
@@ -553,6 +563,9 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
seq_puts(m, ",noacl");
#endif
+ if (fsopt->flags & CEPH_MOUNT_OPT_NOCOPYFROM)
+ seq_puts(m, ",nocopyfrom");
+
if (fsopt->mds_namespace)
seq_show_option(m, "mds_namespace", fsopt->mds_namespace);
if (fsopt->wsize != CEPH_MAX_WRITE_SIZE)
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 582e28fd1b7b..c005a5400f2e 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -40,6 +40,7 @@
#define CEPH_MOUNT_OPT_NOPOOLPERM (1<<11) /* no pool permission check */
#define CEPH_MOUNT_OPT_MOUNTWAIT (1<<12) /* mount waits if no mds is up */
#define CEPH_MOUNT_OPT_NOQUOTADF (1<<13) /* no root dir quota in statfs */
+#define CEPH_MOUNT_OPT_NOCOPYFROM (1<<14) /* don't use RADOS 'copy-from' op */
#define CEPH_MOUNT_OPT_DEFAULT CEPH_MOUNT_OPT_DCACHE
@@ -1008,7 +1009,7 @@ extern int ceph_encode_dentry_release(void **p, struct dentry *dn,
extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
loff_t endoff, int *got, struct page **pinned_page);
extern int ceph_try_get_caps(struct ceph_inode_info *ci,
- int need, int want, int *got);
+ int need, int want, bool nonblock, int *got);
/* for counting open files by mode */
extern void __ceph_get_fmode(struct ceph_inode_info *ci, int mode);
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 5cc8b94f8206..316f6ad10644 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -951,11 +951,10 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name,
if (size > 0) {
/* copy value into pagelist */
- pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
+ pagelist = ceph_pagelist_alloc(GFP_NOFS);
if (!pagelist)
return -ENOMEM;
- ceph_pagelist_init(pagelist);
err = ceph_pagelist_append(pagelist, value, size);
if (err)
goto out;
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index f1fbea947fef..3e812428ac8d 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -132,7 +132,7 @@ cifs_dump_iface(struct seq_file *m, struct cifs_server_iface *iface)
struct sockaddr_in *ipv4 = (struct sockaddr_in *)&iface->sockaddr;
struct sockaddr_in6 *ipv6 = (struct sockaddr_in6 *)&iface->sockaddr;
- seq_printf(m, "\t\tSpeed: %zu bps\n", iface->speed);
+ seq_printf(m, "\tSpeed: %zu bps\n", iface->speed);
seq_puts(m, "\t\tCapabilities: ");
if (iface->rdma_capable)
seq_puts(m, "rdma ");
@@ -285,7 +285,7 @@ skip_rdma:
if ((ses->serverDomain == NULL) ||
(ses->serverOS == NULL) ||
(ses->serverNOS == NULL)) {
- seq_printf(m, "\n%d) Name: %s Uses: %d Capability: 0x%x\tSession Status: %d\t",
+ seq_printf(m, "\n%d) Name: %s Uses: %d Capability: 0x%x\tSession Status: %d ",
i, ses->serverName, ses->ses_count,
ses->capabilities, ses->status);
if (ses->session_flags & SMB2_SESSION_FLAG_IS_GUEST)
@@ -296,16 +296,18 @@ skip_rdma:
seq_printf(m,
"\n%d) Name: %s Domain: %s Uses: %d OS:"
" %s\n\tNOS: %s\tCapability: 0x%x\n\tSMB"
- " session status: %d\t",
+ " session status: %d ",
i, ses->serverName, ses->serverDomain,
ses->ses_count, ses->serverOS, ses->serverNOS,
ses->capabilities, ses->status);
}
if (server->rdma)
seq_printf(m, "RDMA\n\t");
- seq_printf(m, "TCP status: %d\n\tLocal Users To "
+ seq_printf(m, "TCP status: %d Instance: %d\n\tLocal Users To "
"Server: %d SecMode: 0x%x Req On Wire: %d",
- server->tcpStatus, server->srv_count,
+ server->tcpStatus,
+ server->reconnect_instance,
+ server->srv_count,
server->sec_mode, in_flight(server));
#ifdef CONFIG_CIFS_STATS2
@@ -352,7 +354,7 @@ skip_rdma:
seq_printf(m, "\n\tServer interfaces: %zu\n",
ses->iface_count);
for (j = 0; j < ses->iface_count; j++) {
- seq_printf(m, "\t%d)\n", j);
+ seq_printf(m, "\t%d)", j);
cifs_dump_iface(m, &ses->iface_list[j]);
}
spin_unlock(&ses->iface_lock);
@@ -383,6 +385,9 @@ static ssize_t cifs_stats_proc_write(struct file *file,
atomic_set(&totBufAllocCount, 0);
atomic_set(&totSmBufAllocCount, 0);
#endif /* CONFIG_CIFS_STATS2 */
+ atomic_set(&tcpSesReconnectCount, 0);
+ atomic_set(&tconInfoReconnectCount, 0);
+
spin_lock(&GlobalMid_Lock);
GlobalMaxActiveXid = 0;
GlobalCurrentXid = 0;
diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h
index f4f3f0853c6e..631dc1bb21c1 100644
--- a/fs/cifs/cifs_debug.h
+++ b/fs/cifs/cifs_debug.h
@@ -47,6 +47,29 @@ extern int cifsFYI;
*/
#ifdef CONFIG_CIFS_DEBUG
+
+/*
+ * When adding tracepoints and debug messages we have various choices.
+ * Some considerations:
+ *
+ * Use cifs_dbg(VFS, ...) for things we always want logged, and the user to see
+ * cifs_info(...) slightly less important, admin can filter via loglevel > 6
+ * cifs_dbg(FYI, ...) minor debugging messages, off by default
+ * trace_smb3_* ftrace functions are preferred for complex debug messages
+ * intended for developers or experienced admins, off by default
+ */
+
+/* Information level messages, minor events */
+#define cifs_info_func(ratefunc, fmt, ...) \
+do { \
+ pr_info_ ## ratefunc("CIFS: " fmt, ##__VA_ARGS__); \
+} while (0)
+
+#define cifs_info(fmt, ...) \
+do { \
+ cifs_info_func(ratelimited, fmt, ##__VA_ARGS__); \
+} while (0)
+
/* information message: e.g., configuration, major event */
#define cifs_dbg_func(ratefunc, type, fmt, ...) \
do { \
@@ -81,6 +104,11 @@ do { \
if (0) \
pr_debug(fmt, ##__VA_ARGS__); \
} while (0)
+
+#define cifs_info(fmt, ...) \
+do { \
+ pr_info("CIFS: "fmt, ##__VA_ARGS__); \
+} while (0)
#endif
#endif /* _H_CIFS_DEBUG */
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index 6b61df117fd4..b97c74efd04a 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -304,12 +304,17 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
*/
mnt = ERR_PTR(-ENOMEM);
+ cifs_sb = CIFS_SB(mntpt->d_sb);
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS) {
+ mnt = ERR_PTR(-EREMOTE);
+ goto cdda_exit;
+ }
+
/* always use tree name prefix */
full_path = build_path_from_dentry_optional_prefix(mntpt, true);
if (full_path == NULL)
goto cdda_exit;
- cifs_sb = CIFS_SB(mntpt->d_sb);
tlink = cifs_sb_tlink(cifs_sb);
if (IS_ERR(tlink)) {
mnt = ERR_CAST(tlink);
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 9731d0d891e7..63d7530f2e1d 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -51,6 +51,7 @@
*/
#define CIFS_MOUNT_UID_FROM_ACL 0x2000000 /* try to get UID via special SID */
#define CIFS_MOUNT_NO_HANDLE_CACHE 0x4000000 /* disable caching dir handles */
+#define CIFS_MOUNT_NO_DFS 0x8000000 /* disable DFS resolving */
struct cifs_sb_info {
struct rb_root tlink_tree;
diff --git a/fs/cifs/cifs_ioctl.h b/fs/cifs/cifs_ioctl.h
index 57ff0756e30c..d8bce2f862de 100644
--- a/fs/cifs/cifs_ioctl.h
+++ b/fs/cifs/cifs_ioctl.h
@@ -43,8 +43,19 @@ struct smb_snapshot_array {
/* snapshots[]; */
} __packed;
+struct smb_query_info {
+ __u32 info_type;
+ __u32 file_info_class;
+ __u32 additional_information;
+ __u32 flags;
+ __u32 input_buffer_length;
+ __u32 output_buffer_length;
+ /* char buffer[]; */
+} __packed;
+
#define CIFS_IOCTL_MAGIC 0xCF
#define CIFS_IOC_COPYCHUNK_FILE _IOW(CIFS_IOCTL_MAGIC, 3, int)
#define CIFS_IOC_SET_INTEGRITY _IO(CIFS_IOCTL_MAGIC, 4)
#define CIFS_IOC_GET_MNT_INFO _IOR(CIFS_IOCTL_MAGIC, 5, struct smb_mnt_fs_info)
#define CIFS_ENUMERATE_SNAPSHOTS _IOR(CIFS_IOCTL_MAGIC, 6, struct smb_snapshot_array)
+#define CIFS_QUERY_INFO _IOWR(CIFS_IOCTL_MAGIC, 7, struct smb_query_info)
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 7065426b3280..7de9603c54f1 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -81,6 +81,14 @@ module_param(cifs_max_pending, uint, 0444);
MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server for "
"CIFS/SMB1 dialect (N/A for SMB3) "
"Default: 32767 Range: 2 to 32767.");
+#ifdef CONFIG_CIFS_STATS2
+unsigned int slow_rsp_threshold = 1;
+module_param(slow_rsp_threshold, uint, 0644);
+MODULE_PARM_DESC(slow_rsp_threshold, "Amount of time (in seconds) to wait "
+ "before logging that a response is delayed. "
+ "Default: 1 (if set to 0 disables msg).");
+#endif /* STATS2 */
+
module_param(enable_oplocks, bool, 0644);
MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks. Default: y/Y/1");
@@ -492,6 +500,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
seq_puts(s, ",unix");
else
seq_puts(s, ",nounix");
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS)
+ seq_puts(s, ",nodfs");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)
seq_puts(s, ",posixpaths");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)
@@ -707,7 +717,14 @@ cifs_smb3_do_mount(struct file_system_type *fs_type,
struct cifs_mnt_data mnt_data;
struct dentry *root;
- cifs_dbg(FYI, "Devname: %s flags: %d\n", dev_name, flags);
+ /*
+ * Prints in Kernel / CIFS log the attempted mount operation
+ * If CIFS_DEBUG && cifs_FYI
+ */
+ if (cifsFYI)
+ cifs_dbg(FYI, "Devname: %s flags: %d\n", dev_name, flags);
+ else
+ cifs_info("Attempting to mount %s\n", dev_name);
volume_info = cifs_get_volume_info((char *)data, dev_name, is_smb3);
if (IS_ERR(volume_info))
@@ -1418,6 +1435,11 @@ init_cifs(void)
#ifdef CONFIG_CIFS_STATS2
atomic_set(&totBufAllocCount, 0);
atomic_set(&totSmBufAllocCount, 0);
+ if (slow_rsp_threshold < 1)
+ cifs_dbg(FYI, "slow_response_threshold msgs disabled\n");
+ else if (slow_rsp_threshold > 32767)
+ cifs_dbg(VFS,
+ "slow response threshold set higher than recommended (0 to 32767)\n");
#endif /* CONFIG_CIFS_STATS2 */
atomic_set(&midCount, 0);
@@ -1538,11 +1560,11 @@ exit_cifs(void)
cifs_proc_clean();
}
-MODULE_AUTHOR("Steve French <sfrench@us.ibm.com>");
+MODULE_AUTHOR("Steve French");
MODULE_LICENSE("GPL"); /* combination of LGPL + GPL source behaves as GPL */
MODULE_DESCRIPTION
- ("VFS to access servers complying with the SNIA CIFS Specification "
- "e.g. Samba and Windows");
+ ("VFS to access SMB3 servers e.g. Samba, Macs, Azure and Windows (and "
+ "also older servers complying with the SNIA CIFS Specification)");
MODULE_VERSION(CIFS_VERSION);
MODULE_SOFTDEP("pre: arc4");
MODULE_SOFTDEP("pre: des");
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index f047e87871a1..24e265a51874 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -148,5 +148,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
-#define CIFS_VERSION "2.13"
+#define CIFS_VERSION "2.14"
#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 9dcaed031843..ed1e0fcb69e3 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -33,6 +33,7 @@
#define CIFS_MAGIC_NUMBER 0xFF534D42 /* the first four bytes of SMB PDUs */
+#define SMB_PATH_MAX 260
#define CIFS_PORT 445
#define RFC1001_PORT 139
@@ -465,6 +466,11 @@ struct smb_version_operations {
enum securityEnum (*select_sectype)(struct TCP_Server_Info *,
enum securityEnum);
int (*next_header)(char *);
+ /* ioctl passthrough for query_info */
+ int (*ioctl_query_info)(const unsigned int xid,
+ struct cifs_tcon *tcon,
+ __le16 *path, int is_dir,
+ unsigned long p);
};
struct smb_version_values {
@@ -654,6 +660,7 @@ struct TCP_Server_Info {
/* 16th byte of RFC1001 workstation name is always null */
char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL];
__u32 sequence_number; /* for signing, protected by srv_mutex */
+ __u32 reconnect_instance; /* incremented on each reconnect */
struct session_key session_key;
unsigned long lstrp; /* when we got last response from this server */
struct cifs_secmech secmech; /* crypto sec mech functs, descriptors */
@@ -798,6 +805,7 @@ compare_mid(__u16 mid, const struct smb_hdr *smb)
* a single wsize request with a single call.
*/
#define CIFS_DEFAULT_IOSIZE (1024 * 1024)
+#define SMB3_DEFAULT_IOSIZE (4 * 1024 * 1024)
/*
* Windows only supports a max of 60kb reads and 65535 byte writes. Default to
@@ -924,6 +932,8 @@ struct cifs_tcon {
struct list_head tcon_list;
int tc_count;
struct list_head rlist; /* reconnect list */
+ atomic_t num_local_opens; /* num of all opens including disconnected */
+ atomic_t num_remote_opens; /* num of all network opens on server */
struct list_head openFileList;
spinlock_t open_file_lock; /* protects list above */
struct cifs_ses *ses; /* pointer to session associated with */
@@ -1072,7 +1082,8 @@ struct cifsLockInfo {
__u64 offset;
__u64 length;
__u32 pid;
- __u32 type;
+ __u16 type;
+ __u16 flags;
};
/*
@@ -1715,6 +1726,7 @@ GLOBAL_EXTERN atomic_t bufAllocCount; /* current number allocated */
#ifdef CONFIG_CIFS_STATS2
GLOBAL_EXTERN atomic_t totBufAllocCount; /* total allocated over all time */
GLOBAL_EXTERN atomic_t totSmBufAllocCount;
+extern unsigned int slow_rsp_threshold; /* number of secs before logging */
#endif
GLOBAL_EXTERN atomic_t smBufAllocCount;
GLOBAL_EXTERN atomic_t midCount;
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 20adda4de83b..fa361bc00602 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -219,7 +219,7 @@ extern void cifs_mark_open_files_invalid(struct cifs_tcon *tcon);
extern void cifs_reopen_persistent_handles(struct cifs_tcon *tcon);
extern bool cifs_find_lock_conflict(struct cifsFileInfo *cfile, __u64 offset,
- __u64 length, __u8 type,
+ __u64 length, __u8 type, __u16 flags,
struct cifsLockInfo **conf_lock,
int rw_check);
extern void cifs_add_pending_open(struct cifs_fid *fid,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 5657b79dbc99..f82fd342bca5 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -1607,6 +1607,7 @@ cifs_readv_callback(struct mid_q_entry *mid)
struct smb_rqst rqst = { .rq_iov = rdata->iov,
.rq_nvec = 2,
.rq_pages = rdata->pages,
+ .rq_offset = rdata->page_offset,
.rq_npages = rdata->nr_pages,
.rq_pagesz = rdata->pagesz,
.rq_tailsz = rdata->tailsz };
@@ -2210,6 +2211,7 @@ cifs_async_writev(struct cifs_writedata *wdata,
rqst.rq_iov = iov;
rqst.rq_nvec = 2;
rqst.rq_pages = wdata->pages;
+ rqst.rq_offset = wdata->page_offset;
rqst.rq_npages = wdata->nr_pages;
rqst.rq_pagesz = wdata->pagesz;
rqst.rq_tailsz = wdata->tailsz;
@@ -5027,6 +5029,13 @@ oldQFSInfoRetry:
le16_to_cpu(response_data->BytesPerSector) *
le32_to_cpu(response_data->
SectorsPerAllocationUnit);
+ /*
+ * much prefer larger but if server doesn't report
+ * a valid size than 4K is a reasonable minimum
+ */
+ if (FSData->f_bsize < 512)
+ FSData->f_bsize = 4096;
+
FSData->f_blocks =
le32_to_cpu(response_data->TotalAllocationUnits);
FSData->f_bfree = FSData->f_bavail =
@@ -5107,6 +5116,13 @@ QFSInfoRetry:
le32_to_cpu(response_data->BytesPerSector) *
le32_to_cpu(response_data->
SectorsPerAllocationUnit);
+ /*
+ * much prefer larger but if server doesn't report
+ * a valid size than 4K is a reasonable minimum
+ */
+ if (FSData->f_bsize < 512)
+ FSData->f_bsize = 4096;
+
FSData->f_blocks =
le64_to_cpu(response_data->TotalAllocationUnits);
FSData->f_bfree = FSData->f_bavail =
@@ -5470,6 +5486,13 @@ QFSPosixRetry:
data_offset);
FSData->f_bsize =
le32_to_cpu(response_data->BlockSize);
+ /*
+ * much prefer larger but if server doesn't report
+ * a valid size than 4K is a reasonable minimum
+ */
+ if (FSData->f_bsize < 512)
+ FSData->f_bsize = 4096;
+
FSData->f_blocks =
le64_to_cpu(response_data->TotalBlocks);
FSData->f_bfree =
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 11bcd2fb90b1..6f24f129a751 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -250,6 +250,7 @@ static const match_table_t cifs_mount_option_tokens = {
{ Opt_ignore, "dev" },
{ Opt_ignore, "mand" },
{ Opt_ignore, "nomand" },
+ { Opt_ignore, "relatime" },
{ Opt_ignore, "_netdev" },
{ Opt_err, NULL }
@@ -347,7 +348,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
server->maxBuf = 0;
server->max_read = 0;
- cifs_dbg(FYI, "Reconnecting tcp session\n");
+ cifs_dbg(FYI, "Mark tcp session as need reconnect\n");
trace_smb3_reconnect(server->CurrentMid, server->hostname);
/* before reconnecting the tcp session, mark the smb session (uid)
@@ -2396,6 +2397,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
volume_info->target_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL);
tcp_ses->session_estab = false;
tcp_ses->sequence_number = 0;
+ tcp_ses->reconnect_instance = 0;
tcp_ses->lstrp = jiffies;
spin_lock_init(&tcp_ses->req_lock);
INIT_LIST_HEAD(&tcp_ses->tcp_ses_list);
@@ -3085,10 +3087,6 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info)
if (rc)
goto out_fail;
- if (volume_info->nodfs) {
- tcon->Flags &= ~SMB_SHARE_IS_IN_DFS;
- cifs_dbg(FYI, "DFS disabled (%d)\n", tcon->Flags);
- }
tcon->use_persistent = false;
/* check if SMB2 or later, CIFS does not support persistent handles */
if (volume_info->persistent) {
@@ -3663,6 +3661,8 @@ int cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
cifs_sb->actimeo = pvolume_info->actimeo;
cifs_sb->local_nls = pvolume_info->local_nls;
+ if (pvolume_info->nodfs)
+ cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_DFS;
if (pvolume_info->noperm)
cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM;
if (pvolume_info->setuids)
@@ -3819,6 +3819,9 @@ expand_dfs_referral(const unsigned int xid, struct cifs_ses *ses,
struct dfs_info3_param *referrals = NULL;
char *full_path = NULL, *ref_path = NULL, *mdata = NULL;
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS)
+ return -EREMOTE;
+
full_path = build_unc_path_to_root(volume_info, cifs_sb);
if (IS_ERR(full_path))
return PTR_ERR(full_path);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index dcdbcb6f09f8..e262a05a98bf 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -334,6 +334,7 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file,
server->ops->set_fid(cfile, fid, oplock);
list_add(&cfile->tlist, &tcon->openFileList);
+ atomic_inc(&tcon->num_local_opens);
/* if readable file instance put first in list*/
if (file->f_mode & FMODE_READ)
@@ -395,6 +396,7 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
/* remove it from the lists */
list_del(&cifs_file->flist);
list_del(&cifs_file->tlist);
+ atomic_dec(&tcon->num_local_opens);
if (list_empty(&cifsi->openFileList)) {
cifs_dbg(FYI, "closing last open instance for inode %p\n",
@@ -864,7 +866,7 @@ int cifs_closedir(struct inode *inode, struct file *file)
}
static struct cifsLockInfo *
-cifs_lock_init(__u64 offset, __u64 length, __u8 type)
+cifs_lock_init(__u64 offset, __u64 length, __u8 type, __u16 flags)
{
struct cifsLockInfo *lock =
kmalloc(sizeof(struct cifsLockInfo), GFP_KERNEL);
@@ -874,6 +876,7 @@ cifs_lock_init(__u64 offset, __u64 length, __u8 type)
lock->length = length;
lock->type = type;
lock->pid = current->tgid;
+ lock->flags = flags;
INIT_LIST_HEAD(&lock->blist);
init_waitqueue_head(&lock->block_q);
return lock;
@@ -896,7 +899,8 @@ cifs_del_lock_waiters(struct cifsLockInfo *lock)
/* @rw_check : 0 - no op, 1 - read, 2 - write */
static bool
cifs_find_fid_lock_conflict(struct cifs_fid_locks *fdlocks, __u64 offset,
- __u64 length, __u8 type, struct cifsFileInfo *cfile,
+ __u64 length, __u8 type, __u16 flags,
+ struct cifsFileInfo *cfile,
struct cifsLockInfo **conf_lock, int rw_check)
{
struct cifsLockInfo *li;
@@ -918,6 +922,10 @@ cifs_find_fid_lock_conflict(struct cifs_fid_locks *fdlocks, __u64 offset,
((server->ops->compare_fids(cfile, cur_cfile) &&
current->tgid == li->pid) || type == li->type))
continue;
+ if (rw_check == CIFS_LOCK_OP &&
+ (flags & FL_OFDLCK) && (li->flags & FL_OFDLCK) &&
+ server->ops->compare_fids(cfile, cur_cfile))
+ continue;
if (conf_lock)
*conf_lock = li;
return true;
@@ -927,8 +935,8 @@ cifs_find_fid_lock_conflict(struct cifs_fid_locks *fdlocks, __u64 offset,
bool
cifs_find_lock_conflict(struct cifsFileInfo *cfile, __u64 offset, __u64 length,
- __u8 type, struct cifsLockInfo **conf_lock,
- int rw_check)
+ __u8 type, __u16 flags,
+ struct cifsLockInfo **conf_lock, int rw_check)
{
bool rc = false;
struct cifs_fid_locks *cur;
@@ -936,7 +944,8 @@ cifs_find_lock_conflict(struct cifsFileInfo *cfile, __u64 offset, __u64 length,
list_for_each_entry(cur, &cinode->llist, llist) {
rc = cifs_find_fid_lock_conflict(cur, offset, length, type,
- cfile, conf_lock, rw_check);
+ flags, cfile, conf_lock,
+ rw_check);
if (rc)
break;
}
@@ -964,7 +973,8 @@ cifs_lock_test(struct cifsFileInfo *cfile, __u64 offset, __u64 length,
down_read(&cinode->lock_sem);
exist = cifs_find_lock_conflict(cfile, offset, length, type,
- &conf_lock, CIFS_LOCK_OP);
+ flock->fl_flags, &conf_lock,
+ CIFS_LOCK_OP);
if (exist) {
flock->fl_start = conf_lock->offset;
flock->fl_end = conf_lock->offset + conf_lock->length - 1;
@@ -1011,7 +1021,8 @@ try_again:
down_write(&cinode->lock_sem);
exist = cifs_find_lock_conflict(cfile, lock->offset, lock->length,
- lock->type, &conf_lock, CIFS_LOCK_OP);
+ lock->type, lock->flags, &conf_lock,
+ CIFS_LOCK_OP);
if (!exist && cinode->can_cache_brlcks) {
list_add_tail(&lock->llist, &cfile->llist->locks);
up_write(&cinode->lock_sem);
@@ -1321,7 +1332,7 @@ cifs_read_flock(struct file_lock *flock, __u32 *type, int *lock, int *unlock,
cifs_dbg(FYI, "Lease on file - not implemented yet\n");
if (flock->fl_flags &
(~(FL_POSIX | FL_FLOCK | FL_SLEEP |
- FL_ACCESS | FL_LEASE | FL_CLOSE)))
+ FL_ACCESS | FL_LEASE | FL_CLOSE | FL_OFDLCK)))
cifs_dbg(FYI, "Unknown lock flags 0x%x\n", flock->fl_flags);
*type = server->vals->large_lock_type;
@@ -1584,7 +1595,8 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type,
if (lock) {
struct cifsLockInfo *lock;
- lock = cifs_lock_init(flock->fl_start, length, type);
+ lock = cifs_lock_init(flock->fl_start, length, type,
+ flock->fl_flags);
if (!lock)
return -ENOMEM;
@@ -1653,7 +1665,6 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *flock)
cifs_read_flock(flock, &type, &lock, &unlock, &wait_flag,
tcon->ses->server);
-
cifs_sb = CIFS_FILE_SB(file);
netfid = cfile->fid.netfid;
cinode = CIFS_I(file_inode(file));
@@ -2098,6 +2109,7 @@ static int cifs_writepages(struct address_space *mapping,
pgoff_t end, index;
struct cifs_writedata *wdata;
int rc = 0;
+ unsigned int xid;
/*
* If wsize is smaller than the page cache size, default to writing
@@ -2106,6 +2118,7 @@ static int cifs_writepages(struct address_space *mapping,
if (cifs_sb->wsize < PAGE_SIZE)
return generic_writepages(mapping, wbc);
+ xid = get_xid();
if (wbc->range_cyclic) {
index = mapping->writeback_index; /* Start from prev offset */
end = -1;
@@ -2199,6 +2212,7 @@ retry:
if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
mapping->writeback_index = index;
+ free_xid(xid);
return rc;
}
@@ -2817,8 +2831,8 @@ cifs_writev(struct kiocb *iocb, struct iov_iter *from)
goto out;
if (!cifs_find_lock_conflict(cfile, iocb->ki_pos, iov_iter_count(from),
- server->vals->exclusive_lock_type, NULL,
- CIFS_WRITE_OP))
+ server->vals->exclusive_lock_type, 0,
+ NULL, CIFS_WRITE_OP))
rc = __generic_file_write_iter(iocb, from);
else
rc = -EACCES;
@@ -3388,7 +3402,7 @@ cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to)
down_read(&cinode->lock_sem);
if (!cifs_find_lock_conflict(cfile, iocb->ki_pos, iov_iter_count(to),
tcon->ses->server->vals->shared_lock_type,
- NULL, CIFS_READ_OP))
+ 0, NULL, CIFS_READ_OP))
rc = generic_file_read_iter(iocb, to);
up_read(&cinode->lock_sem);
return rc;
@@ -3743,7 +3757,9 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(file);
struct TCP_Server_Info *server;
pid_t pid;
+ unsigned int xid;
+ xid = get_xid();
/*
* Reads as many pages as possible from fscache. Returns -ENOBUFS
* immediately if the cookie is negative
@@ -3753,8 +3769,10 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
*/
rc = cifs_readpages_from_fscache(mapping->host, mapping, page_list,
&num_pages);
- if (rc == 0)
+ if (rc == 0) {
+ free_xid(xid);
return rc;
+ }
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
pid = open_file->pid;
@@ -3798,6 +3816,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
*/
if (unlikely(rsize < PAGE_SIZE)) {
add_credits_and_wake_if(server, credits, 0);
+ free_xid(xid);
return 0;
}
@@ -3862,6 +3881,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
* allocator.
*/
cifs_fscache_readpages_cancel(mapping->host, page_list);
+ free_xid(xid);
return rc;
}
@@ -3889,8 +3909,12 @@ static int cifs_readpage_worker(struct file *file, struct page *page,
else
cifs_dbg(FYI, "Bytes read %d\n", rc);
- file_inode(file)->i_atime =
- current_time(file_inode(file));
+ /* we do not want atime to be less than mtime, it broke some apps */
+ file_inode(file)->i_atime = current_time(file_inode(file));
+ if (timespec64_compare(&(file_inode(file)->i_atime), &(file_inode(file)->i_mtime)))
+ file_inode(file)->i_atime = file_inode(file)->i_mtime;
+ else
+ file_inode(file)->i_atime = current_time(file_inode(file));
if (PAGE_SIZE > rc)
memset(read_data + rc, 0, PAGE_SIZE - rc);
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 6e8765f44508..1023d78673fb 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -162,7 +162,11 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
cifs_revalidate_cache(inode, fattr);
spin_lock(&inode->i_lock);
- inode->i_atime = fattr->cf_atime;
+ /* we do not want atime to be less than mtime, it broke some apps */
+ if (timespec64_compare(&fattr->cf_atime, &fattr->cf_mtime))
+ inode->i_atime = fattr->cf_mtime;
+ else
+ inode->i_atime = fattr->cf_atime;
inode->i_mtime = fattr->cf_mtime;
inode->i_ctime = fattr->cf_ctime;
inode->i_rdev = fattr->cf_rdev;
@@ -777,38 +781,53 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
} else if (rc == -EREMOTE) {
cifs_create_dfs_fattr(&fattr, sb);
rc = 0;
- } else if (rc == -EACCES && backup_cred(cifs_sb)) {
- srchinf = kzalloc(sizeof(struct cifs_search_info),
- GFP_KERNEL);
- if (srchinf == NULL) {
- rc = -ENOMEM;
- goto cgii_exit;
- }
+ } else if ((rc == -EACCES) && backup_cred(cifs_sb) &&
+ (strcmp(server->vals->version_string, SMB1_VERSION_STRING)
+ == 0)) {
+ /*
+ * For SMB2 and later the backup intent flag is already
+ * sent if needed on open and there is no path based
+ * FindFirst operation to use to retry with
+ */
- srchinf->endOfSearch = false;
+ srchinf = kzalloc(sizeof(struct cifs_search_info),
+ GFP_KERNEL);
+ if (srchinf == NULL) {
+ rc = -ENOMEM;
+ goto cgii_exit;
+ }
+
+ srchinf->endOfSearch = false;
+ if (tcon->unix_ext)
+ srchinf->info_level = SMB_FIND_FILE_UNIX;
+ else if ((tcon->ses->capabilities &
+ tcon->ses->server->vals->cap_nt_find) == 0)
+ srchinf->info_level = SMB_FIND_FILE_INFO_STANDARD;
+ else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)
srchinf->info_level = SMB_FIND_FILE_ID_FULL_DIR_INFO;
+ else /* no srvino useful for fallback to some netapp */
+ srchinf->info_level = SMB_FIND_FILE_DIRECTORY_INFO;
- srchflgs = CIFS_SEARCH_CLOSE_ALWAYS |
- CIFS_SEARCH_CLOSE_AT_END |
- CIFS_SEARCH_BACKUP_SEARCH;
+ srchflgs = CIFS_SEARCH_CLOSE_ALWAYS |
+ CIFS_SEARCH_CLOSE_AT_END |
+ CIFS_SEARCH_BACKUP_SEARCH;
- rc = CIFSFindFirst(xid, tcon, full_path,
- cifs_sb, NULL, srchflgs, srchinf, false);
- if (!rc) {
- data =
- (FILE_ALL_INFO *)srchinf->srch_entries_start;
+ rc = CIFSFindFirst(xid, tcon, full_path,
+ cifs_sb, NULL, srchflgs, srchinf, false);
+ if (!rc) {
+ data = (FILE_ALL_INFO *)srchinf->srch_entries_start;
- cifs_dir_info_to_fattr(&fattr,
- (FILE_DIRECTORY_INFO *)data, cifs_sb);
- fattr.cf_uniqueid = le64_to_cpu(
- ((SEARCH_ID_FULL_DIR_INFO *)data)->UniqueId);
- validinum = true;
+ cifs_dir_info_to_fattr(&fattr,
+ (FILE_DIRECTORY_INFO *)data, cifs_sb);
+ fattr.cf_uniqueid = le64_to_cpu(
+ ((SEARCH_ID_FULL_DIR_INFO *)data)->UniqueId);
+ validinum = true;
- cifs_buf_release(srchinf->ntwrk_buf_start);
- }
- kfree(srchinf);
- if (rc)
- goto cgii_exit;
+ cifs_buf_release(srchinf->ntwrk_buf_start);
+ }
+ kfree(srchinf);
+ if (rc)
+ goto cgii_exit;
} else
goto cgii_exit;
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 54f32f9143a9..76ddd98b6298 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -32,8 +32,51 @@
#include "cifs_debug.h"
#include "cifsfs.h"
#include "cifs_ioctl.h"
+#include "smb2proto.h"
#include <linux/btrfs.h>
+static long cifs_ioctl_query_info(unsigned int xid, struct file *filep,
+ unsigned long p)
+{
+ struct inode *inode = file_inode(filep);
+ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+ struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
+ struct dentry *dentry = filep->f_path.dentry;
+ unsigned char *path;
+ __le16 *utf16_path = NULL, root_path;
+ int rc = 0;
+
+ path = build_path_from_dentry(dentry);
+ if (path == NULL)
+ return -ENOMEM;
+
+ cifs_dbg(FYI, "%s %s\n", __func__, path);
+
+ if (!path[0]) {
+ root_path = 0;
+ utf16_path = &root_path;
+ } else {
+ utf16_path = cifs_convert_path_to_utf16(path + 1, cifs_sb);
+ if (!utf16_path) {
+ rc = -ENOMEM;
+ goto ici_exit;
+ }
+ }
+
+ if (tcon->ses->server->ops->ioctl_query_info)
+ rc = tcon->ses->server->ops->ioctl_query_info(
+ xid, tcon, utf16_path,
+ filep->private_data ? 0 : 1, p);
+ else
+ rc = -EOPNOTSUPP;
+
+ ici_exit:
+ if (utf16_path != &root_path)
+ kfree(utf16_path);
+ kfree(path);
+ return rc;
+}
+
static long cifs_ioctl_copychunk(unsigned int xid, struct file *dst_file,
unsigned long srcfd)
{
@@ -123,7 +166,6 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
struct inode *inode = file_inode(filep);
int rc = -ENOTTY; /* strange error - but the precedent */
unsigned int xid;
- struct cifs_sb_info *cifs_sb;
struct cifsFileInfo *pSMBFile = filep->private_data;
struct cifs_tcon *tcon;
__u64 ExtAttrBits = 0;
@@ -131,7 +173,6 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
xid = get_xid();
- cifs_sb = CIFS_SB(inode->i_sb);
cifs_dbg(FYI, "cifs ioctl 0x%x\n", command);
switch (command) {
case FS_IOC_GETFLAGS:
@@ -196,6 +237,9 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
case CIFS_IOC_COPYCHUNK_FILE:
rc = cifs_ioctl_copychunk(xid, filep, arg);
break;
+ case CIFS_QUERY_INFO:
+ rc = cifs_ioctl_query_info(xid, filep, arg);
+ break;
case CIFS_IOC_SET_INTEGRITY:
if (pSMBFile == NULL)
break;
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 10cff44832d8..8a41f4eba726 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -123,6 +123,8 @@ tconInfoAlloc(void)
ret_buf->crfid.fid = kzalloc(sizeof(struct cifs_fid),
GFP_KERNEL);
spin_lock_init(&ret_buf->stat_lock);
+ atomic_set(&ret_buf->num_local_opens, 0);
+ atomic_set(&ret_buf->num_remote_opens, 0);
}
return ret_buf;
}
diff --git a/fs/cifs/smb2glob.h b/fs/cifs/smb2glob.h
index 0ffa18094335..dd10f0ce4cd5 100644
--- a/fs/cifs/smb2glob.h
+++ b/fs/cifs/smb2glob.h
@@ -33,7 +33,7 @@
/*
* Identifiers for functions that use the open, operation, close pattern
- * in smb2inode.c:smb2_open_op_close()
+ * in smb2inode.c:smb2_compound_op()
*/
#define SMB2_OP_SET_DELETE 1
#define SMB2_OP_SET_INFO 2
diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
index 1eef1791d0c4..9e7ef7ec2d70 100644
--- a/fs/cifs/smb2inode.c
+++ b/fs/cifs/smb2inode.c
@@ -38,54 +38,83 @@
#include "smb2proto.h"
static int
-smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon,
- struct cifs_sb_info *cifs_sb, const char *full_path,
- __u32 desired_access, __u32 create_disposition,
- __u32 create_options, void *data, int command)
+smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb, const char *full_path,
+ __u32 desired_access, __u32 create_disposition,
+ __u32 create_options, void *ptr, int command)
{
- int rc, tmprc = 0;
+ int rc;
__le16 *utf16_path = NULL;
__u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
struct cifs_open_parms oparms;
struct cifs_fid fid;
- bool use_cached_root_handle = false;
-
- if ((strcmp(full_path, "") == 0) && (create_options == 0) &&
- (desired_access == FILE_READ_ATTRIBUTES) &&
- (create_disposition == FILE_OPEN) &&
- (tcon->nohandlecache == false)) {
- rc = open_shroot(xid, tcon, &fid);
- if (rc == 0)
- use_cached_root_handle = true;
- }
+ struct cifs_ses *ses = tcon->ses;
+ struct TCP_Server_Info *server = ses->server;
+ int num_rqst = 0;
+ struct smb_rqst rqst[3];
+ int resp_buftype[3];
+ struct kvec rsp_iov[3];
+ struct kvec open_iov[SMB2_CREATE_IOV_SIZE];
+ struct kvec qi_iov[1];
+ struct kvec si_iov[SMB2_SET_INFO_IOV_SIZE];
+ struct kvec close_iov[1];
+ struct smb2_query_info_rsp *qi_rsp = NULL;
+ int flags = 0;
+ __u8 delete_pending[8] = {1, 0, 0, 0, 0, 0, 0, 0};
+ unsigned int size[2];
+ void *data[2];
+ struct smb2_file_rename_info rename_info;
+ struct smb2_file_link_info link_info;
+ int len;
- if (use_cached_root_handle == false) {
- utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb);
- if (!utf16_path)
- return -ENOMEM;
-
- oparms.tcon = tcon;
- oparms.desired_access = desired_access;
- oparms.disposition = create_disposition;
- oparms.create_options = create_options;
- oparms.fid = &fid;
- oparms.reconnect = false;
-
- rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL,
- NULL);
- if (rc) {
- kfree(utf16_path);
- return rc;
- }
- }
+ if (smb3_encryption_required(tcon))
+ flags |= CIFS_TRANSFORM_REQ;
+
+ memset(rqst, 0, sizeof(rqst));
+ resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER;
+ memset(rsp_iov, 0, sizeof(rsp_iov));
+
+ /* Open */
+ utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb);
+ if (!utf16_path)
+ return -ENOMEM;
+
+ oparms.tcon = tcon;
+ oparms.desired_access = desired_access;
+ oparms.disposition = create_disposition;
+ oparms.create_options = create_options;
+ if (backup_cred(cifs_sb))
+ oparms.create_options |= CREATE_OPEN_BACKUP_INTENT;
+ oparms.fid = &fid;
+ oparms.reconnect = false;
+ memset(&open_iov, 0, sizeof(open_iov));
+ rqst[num_rqst].rq_iov = open_iov;
+ rqst[num_rqst].rq_nvec = SMB2_CREATE_IOV_SIZE;
+ rc = SMB2_open_init(tcon, &rqst[num_rqst], &oplock, &oparms,
+ utf16_path);
+ kfree(utf16_path);
+ if (rc)
+ goto finished;
+
+ smb2_set_next_command(server, &rqst[num_rqst++]);
+
+ /* Operation */
switch (command) {
- case SMB2_OP_DELETE:
- break;
case SMB2_OP_QUERY_INFO:
- tmprc = SMB2_query_info(xid, tcon, fid.persistent_fid,
- fid.volatile_fid,
- (struct smb2_file_all_info *)data);
+ memset(&qi_iov, 0, sizeof(qi_iov));
+ rqst[num_rqst].rq_iov = qi_iov;
+ rqst[num_rqst].rq_nvec = 1;
+
+ rc = SMB2_query_info_init(tcon, &rqst[num_rqst], COMPOUND_FID,
+ COMPOUND_FID, FILE_ALL_INFORMATION,
+ SMB2_O_INFO_FILE, 0,
+ sizeof(struct smb2_file_all_info) +
+ PATH_MAX * 2, 0, NULL);
+ smb2_set_next_command(server, &rqst[num_rqst]);
+ smb2_set_related(&rqst[num_rqst++]);
+ break;
+ case SMB2_OP_DELETE:
break;
case SMB2_OP_MKDIR:
/*
@@ -94,39 +123,156 @@ smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon,
*/
break;
case SMB2_OP_RMDIR:
- tmprc = SMB2_rmdir(xid, tcon, fid.persistent_fid,
- fid.volatile_fid);
- break;
- case SMB2_OP_RENAME:
- tmprc = SMB2_rename(xid, tcon, fid.persistent_fid,
- fid.volatile_fid, (__le16 *)data);
- break;
- case SMB2_OP_HARDLINK:
- tmprc = SMB2_set_hardlink(xid, tcon, fid.persistent_fid,
- fid.volatile_fid, (__le16 *)data);
+ memset(&si_iov, 0, sizeof(si_iov));
+ rqst[num_rqst].rq_iov = si_iov;
+ rqst[num_rqst].rq_nvec = 1;
+
+ size[0] = 8;
+ data[0] = &delete_pending[0];
+
+ rc = SMB2_set_info_init(tcon, &rqst[num_rqst], COMPOUND_FID,
+ COMPOUND_FID, current->tgid,
+ FILE_DISPOSITION_INFORMATION,
+ SMB2_O_INFO_FILE, 0, data, size);
+ smb2_set_next_command(server, &rqst[num_rqst]);
+ smb2_set_related(&rqst[num_rqst++]);
break;
case SMB2_OP_SET_EOF:
- tmprc = SMB2_set_eof(xid, tcon, fid.persistent_fid,
- fid.volatile_fid, current->tgid,
- (__le64 *)data, false);
+ memset(&si_iov, 0, sizeof(si_iov));
+ rqst[num_rqst].rq_iov = si_iov;
+ rqst[num_rqst].rq_nvec = 1;
+
+ size[0] = 8; /* sizeof __le64 */
+ data[0] = ptr;
+
+ rc = SMB2_set_info_init(tcon, &rqst[num_rqst], COMPOUND_FID,
+ COMPOUND_FID, current->tgid,
+ FILE_END_OF_FILE_INFORMATION,
+ SMB2_O_INFO_FILE, 0, data, size);
+ smb2_set_next_command(server, &rqst[num_rqst]);
+ smb2_set_related(&rqst[num_rqst++]);
break;
case SMB2_OP_SET_INFO:
- tmprc = SMB2_set_info(xid, tcon, fid.persistent_fid,
- fid.volatile_fid,
- (FILE_BASIC_INFO *)data);
+ memset(&si_iov, 0, sizeof(si_iov));
+ rqst[num_rqst].rq_iov = si_iov;
+ rqst[num_rqst].rq_nvec = 1;
+
+
+ size[0] = sizeof(FILE_BASIC_INFO);
+ data[0] = ptr;
+
+ rc = SMB2_set_info_init(tcon, &rqst[num_rqst], COMPOUND_FID,
+ COMPOUND_FID, current->tgid,
+ FILE_BASIC_INFORMATION,
+ SMB2_O_INFO_FILE, 0, data, size);
+ smb2_set_next_command(server, &rqst[num_rqst]);
+ smb2_set_related(&rqst[num_rqst++]);
+ break;
+ case SMB2_OP_RENAME:
+ memset(&si_iov, 0, sizeof(si_iov));
+ rqst[num_rqst].rq_iov = si_iov;
+ rqst[num_rqst].rq_nvec = 2;
+
+ len = (2 * UniStrnlen((wchar_t *)ptr, PATH_MAX));
+
+ rename_info.ReplaceIfExists = 1;
+ rename_info.RootDirectory = 0;
+ rename_info.FileNameLength = cpu_to_le32(len);
+
+ size[0] = sizeof(struct smb2_file_rename_info);
+ data[0] = &rename_info;
+
+ size[1] = len + 2 /* null */;
+ data[1] = (__le16 *)ptr;
+
+ rc = SMB2_set_info_init(tcon, &rqst[num_rqst], COMPOUND_FID,
+ COMPOUND_FID, current->tgid,
+ FILE_RENAME_INFORMATION,
+ SMB2_O_INFO_FILE, 0, data, size);
+ smb2_set_next_command(server, &rqst[num_rqst]);
+ smb2_set_related(&rqst[num_rqst++]);
+ break;
+ case SMB2_OP_HARDLINK:
+ memset(&si_iov, 0, sizeof(si_iov));
+ rqst[num_rqst].rq_iov = si_iov;
+ rqst[num_rqst].rq_nvec = 2;
+
+ len = (2 * UniStrnlen((wchar_t *)ptr, PATH_MAX));
+
+ link_info.ReplaceIfExists = 0;
+ link_info.RootDirectory = 0;
+ link_info.FileNameLength = cpu_to_le32(len);
+
+ size[0] = sizeof(struct smb2_file_link_info);
+ data[0] = &link_info;
+
+ size[1] = len + 2 /* null */;
+ data[1] = (__le16 *)ptr;
+
+ rc = SMB2_set_info_init(tcon, &rqst[num_rqst], COMPOUND_FID,
+ COMPOUND_FID, current->tgid,
+ FILE_LINK_INFORMATION,
+ SMB2_O_INFO_FILE, 0, data, size);
+ smb2_set_next_command(server, &rqst[num_rqst]);
+ smb2_set_related(&rqst[num_rqst++]);
break;
default:
cifs_dbg(VFS, "Invalid command\n");
- break;
+ rc = -EINVAL;
}
+ if (rc)
+ goto finished;
- if (use_cached_root_handle)
- close_shroot(&tcon->crfid);
- else
- rc = SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
- if (tmprc)
- rc = tmprc;
- kfree(utf16_path);
+ /* Close */
+ memset(&close_iov, 0, sizeof(close_iov));
+ rqst[num_rqst].rq_iov = close_iov;
+ rqst[num_rqst].rq_nvec = 1;
+ rc = SMB2_close_init(tcon, &rqst[num_rqst], COMPOUND_FID,
+ COMPOUND_FID);
+ smb2_set_related(&rqst[num_rqst++]);
+ if (rc)
+ goto finished;
+
+ rc = compound_send_recv(xid, ses, flags, num_rqst, rqst,
+ resp_buftype, rsp_iov);
+
+ finished:
+ SMB2_open_free(&rqst[0]);
+ switch (command) {
+ case SMB2_OP_QUERY_INFO:
+ if (rc == 0) {
+ qi_rsp = (struct smb2_query_info_rsp *)
+ rsp_iov[1].iov_base;
+ rc = smb2_validate_and_copy_iov(
+ le16_to_cpu(qi_rsp->OutputBufferOffset),
+ le32_to_cpu(qi_rsp->OutputBufferLength),
+ &rsp_iov[1], sizeof(struct smb2_file_all_info),
+ ptr);
+ }
+ if (rqst[1].rq_iov)
+ SMB2_query_info_free(&rqst[1]);
+ if (rqst[2].rq_iov)
+ SMB2_close_free(&rqst[2]);
+ break;
+ case SMB2_OP_DELETE:
+ case SMB2_OP_MKDIR:
+ if (rqst[1].rq_iov)
+ SMB2_close_free(&rqst[1]);
+ break;
+ case SMB2_OP_HARDLINK:
+ case SMB2_OP_RENAME:
+ case SMB2_OP_RMDIR:
+ case SMB2_OP_SET_EOF:
+ case SMB2_OP_SET_INFO:
+ if (rqst[1].rq_iov)
+ SMB2_set_info_free(&rqst[1]);
+ if (rqst[2].rq_iov)
+ SMB2_close_free(&rqst[2]);
+ break;
+ }
+ free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
+ free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
+ free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base);
return rc;
}
@@ -147,6 +293,7 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
{
int rc;
struct smb2_file_all_info *smb2_data;
+ __u32 create_options = 0;
*adjust_tz = false;
*symlink = false;
@@ -155,17 +302,21 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
GFP_KERNEL);
if (smb2_data == NULL)
return -ENOMEM;
+ if (backup_cred(cifs_sb))
+ create_options |= CREATE_OPEN_BACKUP_INTENT;
- rc = smb2_open_op_close(xid, tcon, cifs_sb, full_path,
- FILE_READ_ATTRIBUTES, FILE_OPEN, 0,
- smb2_data, SMB2_OP_QUERY_INFO);
+ rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
+ FILE_READ_ATTRIBUTES, FILE_OPEN, create_options,
+ smb2_data, SMB2_OP_QUERY_INFO);
if (rc == -EOPNOTSUPP) {
*symlink = true;
+ create_options |= OPEN_REPARSE_POINT;
+
/* Failed on a symbolic link - query a reparse point info */
- rc = smb2_open_op_close(xid, tcon, cifs_sb, full_path,
- FILE_READ_ATTRIBUTES, FILE_OPEN,
- OPEN_REPARSE_POINT, smb2_data,
- SMB2_OP_QUERY_INFO);
+ rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
+ FILE_READ_ATTRIBUTES, FILE_OPEN,
+ create_options, smb2_data,
+ SMB2_OP_QUERY_INFO);
}
if (rc)
goto out;
@@ -180,9 +331,9 @@ int
smb2_mkdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
struct cifs_sb_info *cifs_sb)
{
- return smb2_open_op_close(xid, tcon, cifs_sb, name,
- FILE_WRITE_ATTRIBUTES, FILE_CREATE,
- CREATE_NOT_FILE, NULL, SMB2_OP_MKDIR);
+ return smb2_compound_op(xid, tcon, cifs_sb, name,
+ FILE_WRITE_ATTRIBUTES, FILE_CREATE,
+ CREATE_NOT_FILE, NULL, SMB2_OP_MKDIR);
}
void
@@ -199,9 +350,9 @@ smb2_mkdir_setinfo(struct inode *inode, const char *name,
cifs_i = CIFS_I(inode);
dosattrs = cifs_i->cifsAttrs | ATTR_READONLY;
data.Attributes = cpu_to_le32(dosattrs);
- tmprc = smb2_open_op_close(xid, tcon, cifs_sb, name,
- FILE_WRITE_ATTRIBUTES, FILE_CREATE,
- CREATE_NOT_FILE, &data, SMB2_OP_SET_INFO);
+ tmprc = smb2_compound_op(xid, tcon, cifs_sb, name,
+ FILE_WRITE_ATTRIBUTES, FILE_CREATE,
+ CREATE_NOT_FILE, &data, SMB2_OP_SET_INFO);
if (tmprc == 0)
cifs_i->cifsAttrs = dosattrs;
}
@@ -210,18 +361,18 @@ int
smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
struct cifs_sb_info *cifs_sb)
{
- return smb2_open_op_close(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN,
- CREATE_NOT_FILE,
- NULL, SMB2_OP_RMDIR);
+ return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN,
+ CREATE_NOT_FILE,
+ NULL, SMB2_OP_RMDIR);
}
int
smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
struct cifs_sb_info *cifs_sb)
{
- return smb2_open_op_close(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN,
- CREATE_DELETE_ON_CLOSE | OPEN_REPARSE_POINT,
- NULL, SMB2_OP_DELETE);
+ return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN,
+ CREATE_DELETE_ON_CLOSE | OPEN_REPARSE_POINT,
+ NULL, SMB2_OP_DELETE);
}
static int
@@ -238,8 +389,8 @@ smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon,
goto smb2_rename_path;
}
- rc = smb2_open_op_close(xid, tcon, cifs_sb, from_name, access,
- FILE_OPEN, 0, smb2_to_name, command);
+ rc = smb2_compound_op(xid, tcon, cifs_sb, from_name, access,
+ FILE_OPEN, 0, smb2_to_name, command);
smb2_rename_path:
kfree(smb2_to_name);
return rc;
@@ -269,9 +420,10 @@ smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb, bool set_alloc)
{
__le64 eof = cpu_to_le64(size);
- return smb2_open_op_close(xid, tcon, cifs_sb, full_path,
- FILE_WRITE_DATA, FILE_OPEN, 0, &eof,
- SMB2_OP_SET_EOF);
+
+ return smb2_compound_op(xid, tcon, cifs_sb, full_path,
+ FILE_WRITE_DATA, FILE_OPEN, 0, &eof,
+ SMB2_OP_SET_EOF);
}
int
@@ -291,9 +443,9 @@ smb2_set_file_info(struct inode *inode, const char *full_path,
if (IS_ERR(tlink))
return PTR_ERR(tlink);
- rc = smb2_open_op_close(xid, tlink_tcon(tlink), cifs_sb, full_path,
- FILE_WRITE_ATTRIBUTES, FILE_OPEN, 0, buf,
- SMB2_OP_SET_INFO);
+ rc = smb2_compound_op(xid, tlink_tcon(tlink), cifs_sb, full_path,
+ FILE_WRITE_ATTRIBUTES, FILE_OPEN, 0, buf,
+ SMB2_OP_SET_INFO);
cifs_put_tlink(tlink);
return rc;
}
diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c
index 20a2d304c603..d47b7f5dfa6c 100644
--- a/fs/cifs/smb2maperror.c
+++ b/fs/cifs/smb2maperror.c
@@ -288,7 +288,7 @@ static const struct status_to_posix_error smb2_error_map_table[] = {
{STATUS_FLT_BUFFER_TOO_SMALL, -ENOBUFS, "STATUS_FLT_BUFFER_TOO_SMALL"},
{STATUS_FVE_PARTIAL_METADATA, -EIO, "STATUS_FVE_PARTIAL_METADATA"},
{STATUS_UNSUCCESSFUL, -EIO, "STATUS_UNSUCCESSFUL"},
- {STATUS_NOT_IMPLEMENTED, -ENOSYS, "STATUS_NOT_IMPLEMENTED"},
+ {STATUS_NOT_IMPLEMENTED, -EOPNOTSUPP, "STATUS_NOT_IMPLEMENTED"},
{STATUS_INVALID_INFO_CLASS, -EIO, "STATUS_INVALID_INFO_CLASS"},
{STATUS_INFO_LENGTH_MISMATCH, -EIO, "STATUS_INFO_LENGTH_MISMATCH"},
{STATUS_ACCESS_VIOLATION, -EACCES, "STATUS_ACCESS_VIOLATION"},
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 26f8a65b8722..23c0a21a66f8 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -74,6 +74,12 @@ smb2_add_credits(struct TCP_Server_Info *server, const unsigned int add,
int *val, rc = 0;
spin_lock(&server->req_lock);
val = server->ops->get_credits_field(server, optype);
+
+ /* eg found case where write overlapping reconnect messed up credits */
+ if (((optype & CIFS_OP_MASK) == CIFS_NEG_OP) && (*val != 0))
+ trace_smb3_reconnect_with_invalid_credits(server->CurrentMid,
+ server->hostname, *val);
+
*val += add;
if (*val > 65000) {
*val = 65000; /* Don't get near 64K credits, avoid srv bugs */
@@ -104,7 +110,12 @@ smb2_set_credits(struct TCP_Server_Info *server, const int val)
{
spin_lock(&server->req_lock);
server->credits = val;
+ if (val == 1)
+ server->reconnect_instance++;
spin_unlock(&server->req_lock);
+ /* don't log while holding the lock */
+ if (val == 1)
+ cifs_dbg(FYI, "set credits to 1 due to smb2 reconnect\n");
}
static int *
@@ -270,6 +281,31 @@ smb2_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *volume_info)
}
static unsigned int
+smb3_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *volume_info)
+{
+ struct TCP_Server_Info *server = tcon->ses->server;
+ unsigned int wsize;
+
+ /* start with specified wsize, or default */
+ wsize = volume_info->wsize ? volume_info->wsize : SMB3_DEFAULT_IOSIZE;
+ wsize = min_t(unsigned int, wsize, server->max_write);
+#ifdef CONFIG_CIFS_SMB_DIRECT
+ if (server->rdma) {
+ if (server->sign)
+ wsize = min_t(unsigned int,
+ wsize, server->smbd_conn->max_fragmented_send_size);
+ else
+ wsize = min_t(unsigned int,
+ wsize, server->smbd_conn->max_readwrite_size);
+ }
+#endif
+ if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU))
+ wsize = min_t(unsigned int, wsize, SMB2_MAX_BUFFER_SIZE);
+
+ return wsize;
+}
+
+static unsigned int
smb2_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *volume_info)
{
struct TCP_Server_Info *server = tcon->ses->server;
@@ -295,6 +331,31 @@ smb2_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *volume_info)
return rsize;
}
+static unsigned int
+smb3_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *volume_info)
+{
+ struct TCP_Server_Info *server = tcon->ses->server;
+ unsigned int rsize;
+
+ /* start with specified rsize, or default */
+ rsize = volume_info->rsize ? volume_info->rsize : SMB3_DEFAULT_IOSIZE;
+ rsize = min_t(unsigned int, rsize, server->max_read);
+#ifdef CONFIG_CIFS_SMB_DIRECT
+ if (server->rdma) {
+ if (server->sign)
+ rsize = min_t(unsigned int,
+ rsize, server->smbd_conn->max_fragmented_recv_size);
+ else
+ rsize = min_t(unsigned int,
+ rsize, server->smbd_conn->max_readwrite_size);
+ }
+#endif
+
+ if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU))
+ rsize = min_t(unsigned int, rsize, SMB2_MAX_BUFFER_SIZE);
+
+ return rsize;
+}
static int
parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf,
@@ -962,6 +1023,9 @@ smb2_print_stats(struct seq_file *m, struct cifs_tcon *tcon)
seq_printf(m, "\nBytes read: %llu Bytes written: %llu",
(long long)(tcon->bytes_read),
(long long)(tcon->bytes_written));
+ seq_printf(m, "\nOpen files: %d total (local), %d open on server",
+ atomic_read(&tcon->num_local_opens),
+ atomic_read(&tcon->num_remote_opens));
seq_printf(m, "\nTreeConnects: %d total %d failed",
atomic_read(&sent[SMB2_TREE_CONNECT_HE]),
atomic_read(&failed[SMB2_TREE_CONNECT_HE]));
@@ -1057,6 +1121,131 @@ req_res_key_exit:
return rc;
}
+static int
+smb2_ioctl_query_info(const unsigned int xid,
+ struct cifs_tcon *tcon,
+ __le16 *path, int is_dir,
+ unsigned long p)
+{
+ struct cifs_ses *ses = tcon->ses;
+ char __user *arg = (char __user *)p;
+ struct smb_query_info qi;
+ struct smb_query_info __user *pqi;
+ int rc = 0;
+ int flags = 0;
+ struct smb2_query_info_rsp *rsp = NULL;
+ void *buffer = NULL;
+ struct smb_rqst rqst[3];
+ int resp_buftype[3];
+ struct kvec rsp_iov[3];
+ struct kvec open_iov[SMB2_CREATE_IOV_SIZE];
+ struct cifs_open_parms oparms;
+ u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
+ struct cifs_fid fid;
+ struct kvec qi_iov[1];
+ struct kvec close_iov[1];
+
+ memset(rqst, 0, sizeof(rqst));
+ resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER;
+ memset(rsp_iov, 0, sizeof(rsp_iov));
+
+ if (copy_from_user(&qi, arg, sizeof(struct smb_query_info)))
+ return -EFAULT;
+
+ if (qi.output_buffer_length > 1024)
+ return -EINVAL;
+
+ if (!ses || !(ses->server))
+ return -EIO;
+
+ if (smb3_encryption_required(tcon))
+ flags |= CIFS_TRANSFORM_REQ;
+
+ buffer = kmalloc(qi.output_buffer_length, GFP_KERNEL);
+ if (buffer == NULL)
+ return -ENOMEM;
+
+ if (copy_from_user(buffer, arg + sizeof(struct smb_query_info),
+ qi.output_buffer_length)) {
+ rc = -EFAULT;
+ goto iqinf_exit;
+ }
+
+ /* Open */
+ memset(&open_iov, 0, sizeof(open_iov));
+ rqst[0].rq_iov = open_iov;
+ rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE;
+
+ memset(&oparms, 0, sizeof(oparms));
+ oparms.tcon = tcon;
+ oparms.desired_access = FILE_READ_ATTRIBUTES | READ_CONTROL;
+ oparms.disposition = FILE_OPEN;
+ if (is_dir)
+ oparms.create_options = CREATE_NOT_FILE;
+ else
+ oparms.create_options = CREATE_NOT_DIR;
+ oparms.fid = &fid;
+ oparms.reconnect = false;
+
+ rc = SMB2_open_init(tcon, &rqst[0], &oplock, &oparms, path);
+ if (rc)
+ goto iqinf_exit;
+ smb2_set_next_command(ses->server, &rqst[0]);
+
+ /* Query */
+ memset(&qi_iov, 0, sizeof(qi_iov));
+ rqst[1].rq_iov = qi_iov;
+ rqst[1].rq_nvec = 1;
+
+ rc = SMB2_query_info_init(tcon, &rqst[1], COMPOUND_FID, COMPOUND_FID,
+ qi.file_info_class, qi.info_type,
+ qi.additional_information,
+ qi.input_buffer_length,
+ qi.output_buffer_length, buffer);
+ if (rc)
+ goto iqinf_exit;
+ smb2_set_next_command(ses->server, &rqst[1]);
+ smb2_set_related(&rqst[1]);
+
+ /* Close */
+ memset(&close_iov, 0, sizeof(close_iov));
+ rqst[2].rq_iov = close_iov;
+ rqst[2].rq_nvec = 1;
+
+ rc = SMB2_close_init(tcon, &rqst[2], COMPOUND_FID, COMPOUND_FID);
+ if (rc)
+ goto iqinf_exit;
+ smb2_set_related(&rqst[2]);
+
+ rc = compound_send_recv(xid, ses, flags, 3, rqst,
+ resp_buftype, rsp_iov);
+ if (rc)
+ goto iqinf_exit;
+ pqi = (struct smb_query_info __user *)arg;
+ rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base;
+ if (le32_to_cpu(rsp->OutputBufferLength) < qi.input_buffer_length)
+ qi.input_buffer_length = le32_to_cpu(rsp->OutputBufferLength);
+ if (copy_to_user(&pqi->input_buffer_length, &qi.input_buffer_length,
+ sizeof(qi.input_buffer_length))) {
+ rc = -EFAULT;
+ goto iqinf_exit;
+ }
+ if (copy_to_user(pqi + 1, rsp->Buffer, qi.input_buffer_length)) {
+ rc = -EFAULT;
+ goto iqinf_exit;
+ }
+
+ iqinf_exit:
+ kfree(buffer);
+ SMB2_open_free(&rqst[0]);
+ SMB2_query_info_free(&rqst[1]);
+ SMB2_close_free(&rqst[2]);
+ free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
+ free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
+ free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base);
+ return rc;
+}
+
static ssize_t
smb2_copychunk_range(const unsigned int xid,
struct cifsFileInfo *srcfile,
@@ -1301,7 +1490,7 @@ smb2_set_file_size(const unsigned int xid, struct cifs_tcon *tcon,
}
return SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid,
- cfile->fid.volatile_fid, cfile->pid, &eof, false);
+ cfile->fid.volatile_fid, cfile->pid, &eof);
}
static int
@@ -1556,7 +1745,7 @@ smb2_oplock_response(struct cifs_tcon *tcon, struct cifs_fid *fid,
CIFS_CACHE_READ(cinode) ? 1 : 0);
}
-static void
+void
smb2_set_related(struct smb_rqst *rqst)
{
struct smb2_sync_hdr *shdr;
@@ -1567,7 +1756,7 @@ smb2_set_related(struct smb_rqst *rqst)
char smb2_padding[7] = {0, 0, 0, 0, 0, 0, 0};
-static void
+void
smb2_set_next_command(struct TCP_Server_Info *server, struct smb_rqst *rqst)
{
struct smb2_sync_hdr *shdr;
@@ -1610,7 +1799,7 @@ smb2_queryfs(const unsigned int xid, struct cifs_tcon *tcon,
flags |= CIFS_TRANSFORM_REQ;
memset(rqst, 0, sizeof(rqst));
- memset(resp_buftype, 0, sizeof(resp_buftype));
+ resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER;
memset(rsp_iov, 0, sizeof(rsp_iov));
memset(&open_iov, 0, sizeof(open_iov));
@@ -1636,7 +1825,8 @@ smb2_queryfs(const unsigned int xid, struct cifs_tcon *tcon,
rc = SMB2_query_info_init(tcon, &rqst[1], COMPOUND_FID, COMPOUND_FID,
FS_FULL_SIZE_INFORMATION,
SMB2_O_INFO_FILESYSTEM, 0,
- sizeof(struct smb2_fs_full_size_info));
+ sizeof(struct smb2_fs_full_size_info), 0,
+ NULL);
if (rc)
goto qfs_exit;
smb2_set_next_command(server, &rqst[1]);
@@ -3303,6 +3493,7 @@ struct smb_version_operations smb20_operations = {
.set_acl = set_smb2_acl,
#endif /* CIFS_ACL */
.next_header = smb2_next_header,
+ .ioctl_query_info = smb2_ioctl_query_info,
};
struct smb_version_operations smb21_operations = {
@@ -3398,6 +3589,7 @@ struct smb_version_operations smb21_operations = {
.set_acl = set_smb2_acl,
#endif /* CIFS_ACL */
.next_header = smb2_next_header,
+ .ioctl_query_info = smb2_ioctl_query_info,
};
struct smb_version_operations smb30_operations = {
@@ -3425,8 +3617,8 @@ struct smb_version_operations smb30_operations = {
.downgrade_oplock = smb2_downgrade_oplock,
.need_neg = smb2_need_neg,
.negotiate = smb2_negotiate,
- .negotiate_wsize = smb2_negotiate_wsize,
- .negotiate_rsize = smb2_negotiate_rsize,
+ .negotiate_wsize = smb3_negotiate_wsize,
+ .negotiate_rsize = smb3_negotiate_rsize,
.sess_setup = SMB2_sess_setup,
.logoff = SMB2_logoff,
.tree_connect = SMB2_tcon,
@@ -3502,6 +3694,7 @@ struct smb_version_operations smb30_operations = {
.set_acl = set_smb2_acl,
#endif /* CIFS_ACL */
.next_header = smb2_next_header,
+ .ioctl_query_info = smb2_ioctl_query_info,
};
struct smb_version_operations smb311_operations = {
@@ -3529,8 +3722,8 @@ struct smb_version_operations smb311_operations = {
.downgrade_oplock = smb2_downgrade_oplock,
.need_neg = smb2_need_neg,
.negotiate = smb2_negotiate,
- .negotiate_wsize = smb2_negotiate_wsize,
- .negotiate_rsize = smb2_negotiate_rsize,
+ .negotiate_wsize = smb3_negotiate_wsize,
+ .negotiate_rsize = smb3_negotiate_rsize,
.sess_setup = SMB2_sess_setup,
.logoff = SMB2_logoff,
.tree_connect = SMB2_tcon,
@@ -3607,6 +3800,7 @@ struct smb_version_operations smb311_operations = {
.set_acl = set_smb2_acl,
#endif /* CIFS_ACL */
.next_header = smb2_next_header,
+ .ioctl_query_info = smb2_ioctl_query_info,
};
struct smb_version_values smb20_values = {
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index f54d07bda067..7d7b016fe8bb 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -1478,7 +1478,7 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
/* SMB2 TREE_CONNECT request must be called with TreeId == 0 */
tcon->tid = 0;
-
+ atomic_set(&tcon->num_remote_opens, 0);
rc = smb2_plain_req_init(SMB2_TREE_CONNECT, tcon, (void **) &req,
&total_len);
if (rc) {
@@ -2243,10 +2243,12 @@ SMB2_open_free(struct smb_rqst *rqst)
{
int i;
- cifs_small_buf_release(rqst->rq_iov[0].iov_base);
- for (i = 1; i < rqst->rq_nvec; i++)
- if (rqst->rq_iov[i].iov_base != smb2_padding)
- kfree(rqst->rq_iov[i].iov_base);
+ if (rqst && rqst->rq_iov) {
+ cifs_small_buf_release(rqst->rq_iov[0].iov_base);
+ for (i = 1; i < rqst->rq_nvec; i++)
+ if (rqst->rq_iov[i].iov_base != smb2_padding)
+ kfree(rqst->rq_iov[i].iov_base);
+ }
}
int
@@ -2261,7 +2263,7 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
struct cifs_ses *ses = tcon->ses;
struct kvec iov[SMB2_CREATE_IOV_SIZE];
struct kvec rsp_iov = {NULL, 0};
- int resp_buftype;
+ int resp_buftype = CIFS_NO_BUFFER;
int rc = 0;
int flags = 0;
@@ -2303,6 +2305,7 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
ses->Suid, oparms->create_options,
oparms->desired_access);
+ atomic_inc(&tcon->num_remote_opens);
oparms->fid->persistent_fid = rsp->PersistentFileId;
oparms->fid->volatile_fid = rsp->VolatileFileId;
@@ -2474,13 +2477,13 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
goto ioctl_exit;
}
- *out_data = kmalloc(*plen, GFP_KERNEL);
+ *out_data = kmemdup((char *)rsp + le32_to_cpu(rsp->OutputOffset),
+ *plen, GFP_KERNEL);
if (*out_data == NULL) {
rc = -ENOMEM;
goto ioctl_exit;
}
- memcpy(*out_data, (char *)rsp + le32_to_cpu(rsp->OutputOffset), *plen);
ioctl_exit:
free_rsp_buf(resp_buftype, rsp);
return rc;
@@ -2535,7 +2538,8 @@ SMB2_close_init(struct cifs_tcon *tcon, struct smb_rqst *rqst,
void
SMB2_close_free(struct smb_rqst *rqst)
{
- cifs_small_buf_release(rqst->rq_iov[0].iov_base); /* request */
+ if (rqst && rqst->rq_iov)
+ cifs_small_buf_release(rqst->rq_iov[0].iov_base); /* request */
}
int
@@ -2547,7 +2551,7 @@ SMB2_close_flags(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_ses *ses = tcon->ses;
struct kvec iov[1];
struct kvec rsp_iov;
- int resp_buftype;
+ int resp_buftype = CIFS_NO_BUFFER;
int rc = 0;
cifs_dbg(FYI, "Close\n");
@@ -2577,6 +2581,8 @@ SMB2_close_flags(const unsigned int xid, struct cifs_tcon *tcon,
goto close_exit;
}
+ atomic_dec(&tcon->num_remote_opens);
+
/* BB FIXME - decode close response, update inode for caching */
close_exit:
@@ -2627,10 +2633,10 @@ smb2_validate_iov(unsigned int offset, unsigned int buffer_length,
* If SMB buffer fields are valid, copy into temporary buffer to hold result.
* Caller must free buffer.
*/
-static int
-validate_and_copy_iov(unsigned int offset, unsigned int buffer_length,
- struct kvec *iov, unsigned int minbufsize,
- char *data)
+int
+smb2_validate_and_copy_iov(unsigned int offset, unsigned int buffer_length,
+ struct kvec *iov, unsigned int minbufsize,
+ char *data)
{
char *begin_of_buf = offset + (char *)iov->iov_base;
int rc;
@@ -2651,7 +2657,7 @@ int
SMB2_query_info_init(struct cifs_tcon *tcon, struct smb_rqst *rqst,
u64 persistent_fid, u64 volatile_fid,
u8 info_class, u8 info_type, u32 additional_info,
- size_t output_len)
+ size_t output_len, size_t input_len, void *input)
{
struct smb2_query_info_req *req;
struct kvec *iov = rqst->rq_iov;
@@ -2669,23 +2675,25 @@ SMB2_query_info_init(struct cifs_tcon *tcon, struct smb_rqst *rqst,
req->VolatileFileId = volatile_fid;
req->AdditionalInformation = cpu_to_le32(additional_info);
- /*
- * We do not use the input buffer (do not send extra byte)
- */
- req->InputBufferOffset = 0;
-
req->OutputBufferLength = cpu_to_le32(output_len);
+ if (input_len) {
+ req->InputBufferLength = cpu_to_le32(input_len);
+ /* total_len for smb query request never close to le16 max */
+ req->InputBufferOffset = cpu_to_le16(total_len - 1);
+ memcpy(req->Buffer, input, input_len);
+ }
iov[0].iov_base = (char *)req;
/* 1 for Buffer */
- iov[0].iov_len = total_len - 1;
+ iov[0].iov_len = total_len - 1 + input_len;
return 0;
}
void
SMB2_query_info_free(struct smb_rqst *rqst)
{
- cifs_small_buf_release(rqst->rq_iov[0].iov_base); /* request */
+ if (rqst && rqst->rq_iov)
+ cifs_small_buf_release(rqst->rq_iov[0].iov_base); /* request */
}
static int
@@ -2699,7 +2707,7 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon,
struct kvec iov[1];
struct kvec rsp_iov;
int rc = 0;
- int resp_buftype;
+ int resp_buftype = CIFS_NO_BUFFER;
struct cifs_ses *ses = tcon->ses;
int flags = 0;
@@ -2718,7 +2726,7 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon,
rc = SMB2_query_info_init(tcon, &rqst, persistent_fid, volatile_fid,
info_class, info_type, additional_info,
- output_len);
+ output_len, 0, NULL);
if (rc)
goto qinf_exit;
@@ -2746,9 +2754,9 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon,
}
}
- rc = validate_and_copy_iov(le16_to_cpu(rsp->OutputBufferOffset),
- le32_to_cpu(rsp->OutputBufferLength),
- &rsp_iov, min_len, *data);
+ rc = smb2_validate_and_copy_iov(le16_to_cpu(rsp->OutputBufferOffset),
+ le32_to_cpu(rsp->OutputBufferLength),
+ &rsp_iov, min_len, *data);
qinf_exit:
SMB2_query_info_free(&rqst);
@@ -3754,45 +3762,22 @@ qdir_exit:
return rc;
}
-static int
-send_set_info(const unsigned int xid, struct cifs_tcon *tcon,
+int
+SMB2_set_info_init(struct cifs_tcon *tcon, struct smb_rqst *rqst,
u64 persistent_fid, u64 volatile_fid, u32 pid, u8 info_class,
- u8 info_type, u32 additional_info, unsigned int num,
+ u8 info_type, u32 additional_info,
void **data, unsigned int *size)
{
- struct smb_rqst rqst;
struct smb2_set_info_req *req;
- struct smb2_set_info_rsp *rsp = NULL;
- struct kvec *iov;
- struct kvec rsp_iov;
- int rc = 0;
- int resp_buftype;
- unsigned int i;
- struct cifs_ses *ses = tcon->ses;
- int flags = 0;
- unsigned int total_len;
-
- if (!ses || !(ses->server))
- return -EIO;
-
- if (!num)
- return -EINVAL;
-
- iov = kmalloc_array(num, sizeof(struct kvec), GFP_KERNEL);
- if (!iov)
- return -ENOMEM;
+ struct kvec *iov = rqst->rq_iov;
+ unsigned int i, total_len;
+ int rc;
rc = smb2_plain_req_init(SMB2_SET_INFO, tcon, (void **) &req, &total_len);
- if (rc) {
- kfree(iov);
+ if (rc)
return rc;
- }
-
- if (smb3_encryption_required(tcon))
- flags |= CIFS_TRANSFORM_REQ;
req->sync_hdr.ProcessId = cpu_to_le32(pid);
-
req->InfoType = info_type;
req->FileInfoClass = info_class;
req->PersistentFileId = persistent_fid;
@@ -3810,19 +3795,66 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon,
/* 1 for Buffer */
iov[0].iov_len = total_len - 1;
- for (i = 1; i < num; i++) {
+ for (i = 1; i < rqst->rq_nvec; i++) {
le32_add_cpu(&req->BufferLength, size[i]);
iov[i].iov_base = (char *)data[i];
iov[i].iov_len = size[i];
}
+ return 0;
+}
+
+void
+SMB2_set_info_free(struct smb_rqst *rqst)
+{
+ if (rqst && rqst->rq_iov)
+ cifs_buf_release(rqst->rq_iov[0].iov_base); /* request */
+}
+
+static int
+send_set_info(const unsigned int xid, struct cifs_tcon *tcon,
+ u64 persistent_fid, u64 volatile_fid, u32 pid, u8 info_class,
+ u8 info_type, u32 additional_info, unsigned int num,
+ void **data, unsigned int *size)
+{
+ struct smb_rqst rqst;
+ struct smb2_set_info_rsp *rsp = NULL;
+ struct kvec *iov;
+ struct kvec rsp_iov;
+ int rc = 0;
+ int resp_buftype;
+ struct cifs_ses *ses = tcon->ses;
+ int flags = 0;
+
+ if (!ses || !(ses->server))
+ return -EIO;
+
+ if (!num)
+ return -EINVAL;
+
+ if (smb3_encryption_required(tcon))
+ flags |= CIFS_TRANSFORM_REQ;
+
+ iov = kmalloc_array(num, sizeof(struct kvec), GFP_KERNEL);
+ if (!iov)
+ return -ENOMEM;
+
memset(&rqst, 0, sizeof(struct smb_rqst));
rqst.rq_iov = iov;
rqst.rq_nvec = num;
+ rc = SMB2_set_info_init(tcon, &rqst, persistent_fid, volatile_fid, pid,
+ info_class, info_type, additional_info,
+ data, size);
+ if (rc) {
+ kfree(iov);
+ return rc;
+ }
+
+
rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags,
&rsp_iov);
- cifs_buf_release(req);
+ SMB2_set_info_free(&rqst);
rsp = (struct smb2_set_info_rsp *)rsp_iov.iov_base;
if (rc != 0) {
@@ -3837,88 +3869,8 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon,
}
int
-SMB2_rename(const unsigned int xid, struct cifs_tcon *tcon,
- u64 persistent_fid, u64 volatile_fid, __le16 *target_file)
-{
- struct smb2_file_rename_info info;
- void **data;
- unsigned int size[2];
- int rc;
- int len = (2 * UniStrnlen((wchar_t *)target_file, PATH_MAX));
-
- data = kmalloc_array(2, sizeof(void *), GFP_KERNEL);
- if (!data)
- return -ENOMEM;
-
- info.ReplaceIfExists = 1; /* 1 = replace existing target with new */
- /* 0 = fail if target already exists */
- info.RootDirectory = 0; /* MBZ for network ops (why does spec say?) */
- info.FileNameLength = cpu_to_le32(len);
-
- data[0] = &info;
- size[0] = sizeof(struct smb2_file_rename_info);
-
- data[1] = target_file;
- size[1] = len + 2 /* null */;
-
- rc = send_set_info(xid, tcon, persistent_fid, volatile_fid,
- current->tgid, FILE_RENAME_INFORMATION, SMB2_O_INFO_FILE,
- 0, 2, data, size);
- kfree(data);
- return rc;
-}
-
-int
-SMB2_rmdir(const unsigned int xid, struct cifs_tcon *tcon,
- u64 persistent_fid, u64 volatile_fid)
-{
- __u8 delete_pending = 1;
- void *data;
- unsigned int size;
-
- data = &delete_pending;
- size = 1; /* sizeof __u8 */
-
- return send_set_info(xid, tcon, persistent_fid, volatile_fid,
- current->tgid, FILE_DISPOSITION_INFORMATION, SMB2_O_INFO_FILE,
- 0, 1, &data, &size);
-}
-
-int
-SMB2_set_hardlink(const unsigned int xid, struct cifs_tcon *tcon,
- u64 persistent_fid, u64 volatile_fid, __le16 *target_file)
-{
- struct smb2_file_link_info info;
- void **data;
- unsigned int size[2];
- int rc;
- int len = (2 * UniStrnlen((wchar_t *)target_file, PATH_MAX));
-
- data = kmalloc_array(2, sizeof(void *), GFP_KERNEL);
- if (!data)
- return -ENOMEM;
-
- info.ReplaceIfExists = 0; /* 1 = replace existing link with new */
- /* 0 = fail if link already exists */
- info.RootDirectory = 0; /* MBZ for network ops (why does spec say?) */
- info.FileNameLength = cpu_to_le32(len);
-
- data[0] = &info;
- size[0] = sizeof(struct smb2_file_link_info);
-
- data[1] = target_file;
- size[1] = len + 2 /* null */;
-
- rc = send_set_info(xid, tcon, persistent_fid, volatile_fid,
- current->tgid, FILE_LINK_INFORMATION, SMB2_O_INFO_FILE,
- 0, 2, data, size);
- kfree(data);
- return rc;
-}
-
-int
SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
- u64 volatile_fid, u32 pid, __le64 *eof, bool is_falloc)
+ u64 volatile_fid, u32 pid, __le64 *eof)
{
struct smb2_file_eof_info info;
void *data;
@@ -3929,28 +3881,12 @@ SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
data = &info;
size = sizeof(struct smb2_file_eof_info);
- if (is_falloc)
- return send_set_info(xid, tcon, persistent_fid, volatile_fid,
- pid, FILE_ALLOCATION_INFORMATION, SMB2_O_INFO_FILE,
- 0, 1, &data, &size);
- else
- return send_set_info(xid, tcon, persistent_fid, volatile_fid,
+ return send_set_info(xid, tcon, persistent_fid, volatile_fid,
pid, FILE_END_OF_FILE_INFORMATION, SMB2_O_INFO_FILE,
0, 1, &data, &size);
}
int
-SMB2_set_info(const unsigned int xid, struct cifs_tcon *tcon,
- u64 persistent_fid, u64 volatile_fid, FILE_BASIC_INFO *buf)
-{
- unsigned int size;
- size = sizeof(FILE_BASIC_INFO);
- return send_set_info(xid, tcon, persistent_fid, volatile_fid,
- current->tgid, FILE_BASIC_INFORMATION, SMB2_O_INFO_FILE,
- 0, 1, (void **)&buf, &size);
-}
-
-int
SMB2_set_acl(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid,
struct cifs_ntsd *pnntsd, int pacllen, int aclflag)
@@ -4350,6 +4286,8 @@ SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon,
struct kvec iov[1];
struct kvec rsp_iov;
int resp_buf_type;
+ __u64 *please_key_high;
+ __u64 *please_key_low;
cifs_dbg(FYI, "SMB2_lease_break\n");
rc = smb2_plain_req_init(SMB2_OPLOCK_BREAK, tcon, (void **) &req,
@@ -4379,10 +4317,16 @@ SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon,
rc = cifs_send_recv(xid, ses, &rqst, &resp_buf_type, flags, &rsp_iov);
cifs_small_buf_release(req);
+ please_key_low = (__u64 *)req->LeaseKey;
+ please_key_high = (__u64 *)(req->LeaseKey+8);
if (rc) {
cifs_stats_fail_inc(tcon, SMB2_OPLOCK_BREAK_HE);
+ trace_smb3_lease_err(le32_to_cpu(lease_state), tcon->tid,
+ ses->Suid, *please_key_low, *please_key_high, rc);
cifs_dbg(FYI, "Send error in Lease Break = %d\n", rc);
- }
+ } else
+ trace_smb3_lease_done(le32_to_cpu(lease_state), tcon->tid,
+ ses->Suid, *please_key_low, *please_key_high);
return rc;
}
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index 8fb7887f2b3d..f753f424d7f1 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -613,6 +613,8 @@ struct smb2_tree_disconnect_rsp {
#define SVHDX_OPEN_DEVICE_CONTEX 0x9CCBCF9E04C1E643980E158DA1F6EC83
#define SMB2_CREATE_TAG_POSIX 0x93AD25509CB411E7B42383DE968BCD7C
+/* Flag (SMB3 open response) values */
+#define SMB2_CREATE_FLAG_REPARSEPOINT 0x01
/*
* Maximum number of iovs we need for an open/create request.
@@ -650,7 +652,7 @@ struct smb2_create_rsp {
struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* Must be 89 */
__u8 OplockLevel;
- __u8 Reserved;
+ __u8 Flag; /* 0x01 if reparse point */
__le32 CreateAction;
__le64 CreationTime;
__le64 LastAccessTime;
@@ -1174,6 +1176,15 @@ struct smb2_query_info_rsp {
__u8 Buffer[1];
} __packed;
+/*
+ * Maximum number of iovs we need for a set-info request.
+ * The largest one is rename/hardlink
+ * [0] : struct smb2_set_info_req + smb2_file_[rename|link]_info
+ * [1] : path
+ * [2] : compound padding
+ */
+#define SMB2_SET_INFO_IOV_SIZE 3
+
struct smb2_set_info_req {
struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* Must be 33 */
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index b4076577eeb7..9f4e9ed9ce53 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -116,6 +116,9 @@ extern void smb2_reconnect_server(struct work_struct *work);
extern int smb3_crypto_aead_allocate(struct TCP_Server_Info *server);
extern unsigned long smb_rqst_len(struct TCP_Server_Info *server,
struct smb_rqst *rqst);
+extern void smb2_set_next_command(struct TCP_Server_Info *server,
+ struct smb_rqst *rqst);
+extern void smb2_set_related(struct smb_rqst *rqst);
/*
* SMB2 Worker functions - most of protocol specific implementation details
@@ -160,7 +163,8 @@ extern int SMB2_query_info(const unsigned int xid, struct cifs_tcon *tcon,
extern int SMB2_query_info_init(struct cifs_tcon *tcon, struct smb_rqst *rqst,
u64 persistent_fid, u64 volatile_fid,
u8 info_class, u8 info_type,
- u32 additional_info, size_t output_len);
+ u32 additional_info, size_t output_len,
+ size_t input_len, void *input);
extern void SMB2_query_info_free(struct smb_rqst *rqst);
extern int SMB2_query_acl(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_file_id, u64 volatile_file_id,
@@ -179,20 +183,14 @@ extern int SMB2_echo(struct TCP_Server_Info *server);
extern int SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid, int index,
struct cifs_search_info *srch_inf);
-extern int SMB2_rename(const unsigned int xid, struct cifs_tcon *tcon,
- u64 persistent_fid, u64 volatile_fid,
- __le16 *target_file);
-extern int SMB2_rmdir(const unsigned int xid, struct cifs_tcon *tcon,
- u64 persistent_fid, u64 volatile_fid);
-extern int SMB2_set_hardlink(const unsigned int xid, struct cifs_tcon *tcon,
- u64 persistent_fid, u64 volatile_fid,
- __le16 *target_file);
extern int SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid, u32 pid,
- __le64 *eof, bool is_fallocate);
-extern int SMB2_set_info(const unsigned int xid, struct cifs_tcon *tcon,
- u64 persistent_fid, u64 volatile_fid,
- FILE_BASIC_INFO *buf);
+ __le64 *eof);
+extern int SMB2_set_info_init(struct cifs_tcon *tcon, struct smb_rqst *rqst,
+ u64 persistent_fid, u64 volatile_fid, u32 pid,
+ u8 info_class, u8 info_type, u32 additional_info,
+ void **data, unsigned int *size);
+extern void SMB2_set_info_free(struct smb_rqst *rqst);
extern int SMB2_set_acl(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid,
struct cifs_ntsd *pnntsd, int pacllen, int aclflag);
@@ -232,6 +230,10 @@ extern enum securityEnum smb2_select_sectype(struct TCP_Server_Info *,
extern int smb3_encryption_required(const struct cifs_tcon *tcon);
extern int smb2_validate_iov(unsigned int offset, unsigned int buffer_length,
struct kvec *iov, unsigned int min_buf_size);
+extern int smb2_validate_and_copy_iov(unsigned int offset,
+ unsigned int buffer_length,
+ struct kvec *iov,
+ unsigned int minbufsize, char *data);
extern void smb2_copy_fs_info_to_kstatfs(
struct smb2_fs_full_size_info *pfs_inf,
struct kstatfs *kst);
diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c
index 5b05bf255268..e94a8d1d08a3 100644
--- a/fs/cifs/smbdirect.c
+++ b/fs/cifs/smbdirect.c
@@ -2304,8 +2304,12 @@ static void smbd_mr_recovery_work(struct work_struct *work)
int rc;
list_for_each_entry(smbdirect_mr, &info->mr_list, list) {
- if (smbdirect_mr->state == MR_INVALIDATED ||
- smbdirect_mr->state == MR_ERROR) {
+ if (smbdirect_mr->state == MR_INVALIDATED)
+ ib_dma_unmap_sg(
+ info->id->device, smbdirect_mr->sgl,
+ smbdirect_mr->sgl_count,
+ smbdirect_mr->dir);
+ else if (smbdirect_mr->state == MR_ERROR) {
/* recover this MR entry */
rc = ib_dereg_mr(smbdirect_mr->mr);
@@ -2329,25 +2333,21 @@ static void smbd_mr_recovery_work(struct work_struct *work)
smbd_disconnect_rdma_connection(info);
continue;
}
+ } else
+ /* This MR is being used, don't recover it */
+ continue;
- if (smbdirect_mr->state == MR_INVALIDATED)
- ib_dma_unmap_sg(
- info->id->device, smbdirect_mr->sgl,
- smbdirect_mr->sgl_count,
- smbdirect_mr->dir);
-
- smbdirect_mr->state = MR_READY;
+ smbdirect_mr->state = MR_READY;
- /* smbdirect_mr->state is updated by this function
- * and is read and updated by I/O issuing CPUs trying
- * to get a MR, the call to atomic_inc_return
- * implicates a memory barrier and guarantees this
- * value is updated before waking up any calls to
- * get_mr() from the I/O issuing CPUs
- */
- if (atomic_inc_return(&info->mr_ready_count) == 1)
- wake_up_interruptible(&info->wait_mr);
- }
+ /* smbdirect_mr->state is updated by this function
+ * and is read and updated by I/O issuing CPUs trying
+ * to get a MR, the call to atomic_inc_return
+ * implicates a memory barrier and guarantees this
+ * value is updated before waking up any calls to
+ * get_mr() from the I/O issuing CPUs
+ */
+ if (atomic_inc_return(&info->mr_ready_count) == 1)
+ wake_up_interruptible(&info->wait_mr);
}
}
diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h
index d4aed5217a56..cce8414fe7ec 100644
--- a/fs/cifs/trace.h
+++ b/fs/cifs/trace.h
@@ -460,6 +460,85 @@ DEFINE_EVENT(smb3_open_done_class, smb3_##name, \
DEFINE_SMB3_OPEN_DONE_EVENT(open_done);
DEFINE_SMB3_OPEN_DONE_EVENT(posix_mkdir_done);
+
+DECLARE_EVENT_CLASS(smb3_lease_done_class,
+ TP_PROTO(__u32 lease_state,
+ __u32 tid,
+ __u64 sesid,
+ __u64 lease_key_low,
+ __u64 lease_key_high),
+ TP_ARGS(lease_state, tid, sesid, lease_key_low, lease_key_high),
+ TP_STRUCT__entry(
+ __field(__u32, lease_state)
+ __field(__u32, tid)
+ __field(__u64, sesid)
+ __field(__u64, lease_key_low)
+ __field(__u64, lease_key_high)
+ ),
+ TP_fast_assign(
+ __entry->lease_state = lease_state;
+ __entry->tid = tid;
+ __entry->sesid = sesid;
+ __entry->lease_key_low = lease_key_low;
+ __entry->lease_key_high = lease_key_high;
+ ),
+ TP_printk("sid=0x%llx tid=0x%x lease_key=0x%llx%llx lease_state=0x%x",
+ __entry->sesid, __entry->tid, __entry->lease_key_high,
+ __entry->lease_key_low, __entry->lease_state)
+)
+
+#define DEFINE_SMB3_LEASE_DONE_EVENT(name) \
+DEFINE_EVENT(smb3_lease_done_class, smb3_##name, \
+ TP_PROTO(__u32 lease_state, \
+ __u32 tid, \
+ __u64 sesid, \
+ __u64 lease_key_low, \
+ __u64 lease_key_high), \
+ TP_ARGS(lease_state, tid, sesid, lease_key_low, lease_key_high))
+
+DEFINE_SMB3_LEASE_DONE_EVENT(lease_done);
+
+DECLARE_EVENT_CLASS(smb3_lease_err_class,
+ TP_PROTO(__u32 lease_state,
+ __u32 tid,
+ __u64 sesid,
+ __u64 lease_key_low,
+ __u64 lease_key_high,
+ int rc),
+ TP_ARGS(lease_state, tid, sesid, lease_key_low, lease_key_high, rc),
+ TP_STRUCT__entry(
+ __field(__u32, lease_state)
+ __field(__u32, tid)
+ __field(__u64, sesid)
+ __field(__u64, lease_key_low)
+ __field(__u64, lease_key_high)
+ __field(int, rc)
+ ),
+ TP_fast_assign(
+ __entry->lease_state = lease_state;
+ __entry->tid = tid;
+ __entry->sesid = sesid;
+ __entry->lease_key_low = lease_key_low;
+ __entry->lease_key_high = lease_key_high;
+ __entry->rc = rc;
+ ),
+ TP_printk("sid=0x%llx tid=0x%x lease_key=0x%llx%llx lease_state=0x%x rc=%d",
+ __entry->sesid, __entry->tid, __entry->lease_key_high,
+ __entry->lease_key_low, __entry->lease_state, __entry->rc)
+)
+
+#define DEFINE_SMB3_LEASE_ERR_EVENT(name) \
+DEFINE_EVENT(smb3_lease_err_class, smb3_##name, \
+ TP_PROTO(__u32 lease_state, \
+ __u32 tid, \
+ __u64 sesid, \
+ __u64 lease_key_low, \
+ __u64 lease_key_high, \
+ int rc), \
+ TP_ARGS(lease_state, tid, sesid, lease_key_low, lease_key_high, rc))
+
+DEFINE_SMB3_LEASE_ERR_EVENT(lease_err);
+
DECLARE_EVENT_CLASS(smb3_reconnect_class,
TP_PROTO(__u64 currmid,
char *hostname),
@@ -486,6 +565,36 @@ DEFINE_EVENT(smb3_reconnect_class, smb3_##name, \
DEFINE_SMB3_RECONNECT_EVENT(reconnect);
DEFINE_SMB3_RECONNECT_EVENT(partial_send_reconnect);
+DECLARE_EVENT_CLASS(smb3_credit_class,
+ TP_PROTO(__u64 currmid,
+ char *hostname,
+ int credits),
+ TP_ARGS(currmid, hostname, credits),
+ TP_STRUCT__entry(
+ __field(__u64, currmid)
+ __field(char *, hostname)
+ __field(int, credits)
+ ),
+ TP_fast_assign(
+ __entry->currmid = currmid;
+ __entry->hostname = hostname;
+ __entry->credits = credits;
+ ),
+ TP_printk("server=%s current_mid=0x%llx credits=%d",
+ __entry->hostname,
+ __entry->currmid,
+ __entry->credits)
+)
+
+#define DEFINE_SMB3_CREDIT_EVENT(name) \
+DEFINE_EVENT(smb3_credit_class, smb3_##name, \
+ TP_PROTO(__u64 currmid, \
+ char *hostname, \
+ int credits), \
+ TP_ARGS(currmid, hostname, credits))
+
+DEFINE_SMB3_CREDIT_EVENT(reconnect_with_invalid_credits);
+
#endif /* _CIFS_TRACE_H */
#undef TRACE_INCLUDE_PATH
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 146b618802a5..83ff0c25710d 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -113,9 +113,18 @@ DeleteMidQEntry(struct mid_q_entry *midEntry)
cifs_small_buf_release(midEntry->resp_buf);
#ifdef CONFIG_CIFS_STATS2
now = jiffies;
- /* commands taking longer than one second are indications that
- something is wrong, unless it is quite a slow link or server */
- if (time_after(now, midEntry->when_alloc + HZ) &&
+ /*
+ * commands taking longer than one second (default) can be indications
+ * that something is wrong, unless it is quite a slow link or a very
+ * busy server. Note that this calc is unlikely or impossible to wrap
+ * as long as slow_rsp_threshold is not set way above recommended max
+ * value (32767 ie 9 hours) and is generally harmless even if wrong
+ * since only affects debug counters - so leaving the calc as simple
+ * comparison rather than doing multiple conversions and overflow
+ * checks
+ */
+ if ((slow_rsp_threshold != 0) &&
+ time_after(now, midEntry->when_alloc + (slow_rsp_threshold * HZ)) &&
(midEntry->command != command)) {
/* smb2slowcmd[NUMBER_OF_SMB2_COMMANDS] counts by command */
if ((le16_to_cpu(midEntry->command) < NUMBER_OF_SMB2_COMMANDS) &&
@@ -128,7 +137,7 @@ DeleteMidQEntry(struct mid_q_entry *midEntry)
if (cifsFYI & CIFS_TIMER) {
pr_debug(" CIFS slow rsp: cmd %d mid %llu",
midEntry->command, midEntry->mid);
- pr_info(" A: 0x%lx S: 0x%lx R: 0x%lx\n",
+ cifs_info(" A: 0x%lx S: 0x%lx R: 0x%lx\n",
now - midEntry->when_alloc,
now - midEntry->when_sent,
now - midEntry->when_received);
@@ -784,7 +793,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
int i, j, rc = 0;
int timeout, optype;
struct mid_q_entry *midQ[MAX_COMPOUND];
- unsigned int credits = 1;
+ unsigned int credits = 0;
char *buf;
timeout = flags & CIFS_TIMEOUT_MASK;
@@ -849,21 +858,24 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
mutex_unlock(&ses->server->srv_mutex);
- for (i = 0; i < num_rqst; i++) {
- if (rc < 0)
- goto out;
+ if (rc < 0)
+ goto out;
- if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP))
- smb311_update_preauth_hash(ses, rqst[i].rq_iov,
- rqst[i].rq_nvec);
+ /*
+ * Compounding is never used during session establish.
+ */
+ if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP))
+ smb311_update_preauth_hash(ses, rqst[0].rq_iov,
+ rqst[0].rq_nvec);
- if (timeout == CIFS_ASYNC_OP)
- goto out;
+ if (timeout == CIFS_ASYNC_OP)
+ goto out;
+ for (i = 0; i < num_rqst; i++) {
rc = wait_for_response(ses->server, midQ[i]);
if (rc != 0) {
- cifs_dbg(FYI, "Cancelling wait for mid %llu\n",
- midQ[i]->mid);
+ cifs_dbg(VFS, "Cancelling wait for mid %llu cmd: %d\n",
+ midQ[i]->mid, le16_to_cpu(midQ[i]->command));
send_cancel(ses->server, &rqst[i], midQ[i]);
spin_lock(&GlobalMid_Lock);
if (midQ[i]->mid_state == MID_REQUEST_SUBMITTED) {
@@ -875,10 +887,21 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
}
spin_unlock(&GlobalMid_Lock);
}
+ }
+
+ for (i = 0; i < num_rqst; i++)
+ if (midQ[i]->resp_buf)
+ credits += ses->server->ops->get_credits(midQ[i]);
+ if (!credits)
+ credits = 1;
+
+ for (i = 0; i < num_rqst; i++) {
+ if (rc < 0)
+ goto out;
rc = cifs_sync_mid_result(midQ[i], ses->server);
if (rc != 0) {
- add_credits(ses->server, 1, optype);
+ add_credits(ses->server, credits, optype);
return rc;
}
@@ -899,23 +922,26 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
else
resp_buf_type[i] = CIFS_SMALL_BUFFER;
- if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP)) {
- struct kvec iov = {
- .iov_base = resp_iov[i].iov_base,
- .iov_len = resp_iov[i].iov_len
- };
- smb311_update_preauth_hash(ses, &iov, 1);
- }
-
- credits = ses->server->ops->get_credits(midQ[i]);
-
rc = ses->server->ops->check_receive(midQ[i], ses->server,
flags & CIFS_LOG_ERROR);
/* mark it so buf will not be freed by cifs_delete_mid */
if ((flags & CIFS_NO_RESP) == 0)
midQ[i]->resp_buf = NULL;
+
+ }
+
+ /*
+ * Compounding is never used during session establish.
+ */
+ if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP)) {
+ struct kvec iov = {
+ .iov_base = resp_iov[0].iov_base,
+ .iov_len = resp_iov[0].iov_len
+ };
+ smb311_update_preauth_hash(ses, &iov, 1);
}
+
out:
/*
* This will dequeue all mids. After this it is important that the
diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c
index 504b3c3539dc..15f6e96b3bd9 100644
--- a/fs/compat_binfmt_elf.c
+++ b/fs/compat_binfmt_elf.c
@@ -52,7 +52,7 @@
#define elf_prpsinfo compat_elf_prpsinfo
#undef ns_to_timeval
-#define ns_to_timeval ns_to_compat_timeval
+#define ns_to_timeval ns_to_old_timeval32
/*
* To use this file, asm/elf.h must define compat_elf_check_arch.
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index a9b00942e87d..6e30949d9f77 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -22,37 +22,21 @@
#include <linux/smp.h>
#include <linux/ioctl.h>
#include <linux/if.h>
-#include <linux/if_bridge.h>
#include <linux/raid/md_u.h>
-#include <linux/kd.h>
-#include <linux/route.h>
-#include <linux/in6.h>
-#include <linux/ipv6_route.h>
-#include <linux/skbuff.h>
-#include <linux/netlink.h>
-#include <linux/vt.h>
#include <linux/falloc.h>
-#include <linux/fs.h>
#include <linux/file.h>
-#include <linux/ppp_defs.h>
#include <linux/ppp-ioctl.h>
#include <linux/if_pppox.h>
#include <linux/mtio.h>
#include <linux/tty.h>
#include <linux/vt_kern.h>
-#include <linux/fb.h>
-#include <linux/videodev2.h>
-#include <linux/netdevice.h>
#include <linux/raw.h>
#include <linux/blkdev.h>
-#include <linux/elevator.h>
#include <linux/rtc.h>
#include <linux/pci.h>
#include <linux/serial.h>
-#include <linux/if_tun.h>
#include <linux/ctype.h>
#include <linux/syscalls.h>
-#include <linux/atalk.h>
#include <linux/gfp.h>
#include <linux/cec.h>
@@ -74,44 +58,17 @@
#endif
#include <linux/uaccess.h>
-#include <linux/ethtool.h>
-#include <linux/mii.h>
-#include <linux/if_bonding.h>
#include <linux/watchdog.h>
#include <linux/soundcard.h>
-#include <linux/lp.h>
-#include <linux/ppdev.h>
-
-#include <linux/atm.h>
-#include <linux/atmarp.h>
-#include <linux/atmclip.h>
-#include <linux/atmdev.h>
-#include <linux/atmioc.h>
-#include <linux/atmlec.h>
-#include <linux/atmmpc.h>
-#include <linux/atmsvc.h>
-#include <linux/atm_tcp.h>
-#include <linux/sonet.h>
-#include <linux/atm_suni.h>
-
-#include <linux/usb.h>
-#include <linux/usbdevice_fs.h>
-#include <linux/nbd.h>
-#include <linux/random.h>
-#include <linux/filter.h>
#include <linux/hiddev.h>
-#define __DVB_CORE__
-#include <linux/dvb/audio.h>
-#include <linux/dvb/dmx.h>
-#include <linux/dvb/frontend.h>
-#include <linux/dvb/video.h>
#include <linux/sort.h>
#ifdef CONFIG_SPARC
+#include <linux/fb.h>
#include <asm/fbio.h>
#endif
@@ -133,71 +90,6 @@ static int do_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
return vfs_ioctl(file, cmd, arg);
}
-struct compat_video_event {
- int32_t type;
- compat_time_t timestamp;
- union {
- video_size_t size;
- unsigned int frame_rate;
- } u;
-};
-
-static int do_video_get_event(struct file *file,
- unsigned int cmd, struct compat_video_event __user *up)
-{
- struct video_event __user *kevent =
- compat_alloc_user_space(sizeof(*kevent));
- int err;
-
- if (kevent == NULL)
- return -EFAULT;
-
- err = do_ioctl(file, cmd, (unsigned long)kevent);
- if (!err) {
- err = convert_in_user(&kevent->type, &up->type);
- err |= convert_in_user(&kevent->timestamp, &up->timestamp);
- err |= convert_in_user(&kevent->u.size.w, &up->u.size.w);
- err |= convert_in_user(&kevent->u.size.h, &up->u.size.h);
- err |= convert_in_user(&kevent->u.size.aspect_ratio,
- &up->u.size.aspect_ratio);
- if (err)
- err = -EFAULT;
- }
-
- return err;
-}
-
-struct compat_video_still_picture {
- compat_uptr_t iFrame;
- int32_t size;
-};
-
-static int do_video_stillpicture(struct file *file,
- unsigned int cmd, struct compat_video_still_picture __user *up)
-{
- struct video_still_picture __user *up_native;
- compat_uptr_t fp;
- int32_t size;
- int err;
-
- err = get_user(fp, &up->iFrame);
- err |= get_user(size, &up->size);
- if (err)
- return -EFAULT;
-
- up_native =
- compat_alloc_user_space(sizeof(struct video_still_picture));
-
- err = put_user(compat_ptr(fp), &up_native->iFrame);
- err |= put_user(size, &up_native->size);
- if (err)
- return -EFAULT;
-
- err = do_ioctl(file, cmd, (unsigned long) up_native);
-
- return err;
-}
-
#ifdef CONFIG_BLOCK
typedef struct sg_io_hdr32 {
compat_int_t interface_id; /* [i] 'S' for SCSI generic (required) */
@@ -544,85 +436,6 @@ static int mt_ioctl_trans(struct file *file,
#define HCIUARTSETFLAGS _IOW('U', 203, int)
#define HCIUARTGETFLAGS _IOR('U', 204, int)
-#define BNEPCONNADD _IOW('B', 200, int)
-#define BNEPCONNDEL _IOW('B', 201, int)
-#define BNEPGETCONNLIST _IOR('B', 210, int)
-#define BNEPGETCONNINFO _IOR('B', 211, int)
-#define BNEPGETSUPPFEAT _IOR('B', 212, int)
-
-#define CMTPCONNADD _IOW('C', 200, int)
-#define CMTPCONNDEL _IOW('C', 201, int)
-#define CMTPGETCONNLIST _IOR('C', 210, int)
-#define CMTPGETCONNINFO _IOR('C', 211, int)
-
-#define HIDPCONNADD _IOW('H', 200, int)
-#define HIDPCONNDEL _IOW('H', 201, int)
-#define HIDPGETCONNLIST _IOR('H', 210, int)
-#define HIDPGETCONNINFO _IOR('H', 211, int)
-
-
-struct serial_struct32 {
- compat_int_t type;
- compat_int_t line;
- compat_uint_t port;
- compat_int_t irq;
- compat_int_t flags;
- compat_int_t xmit_fifo_size;
- compat_int_t custom_divisor;
- compat_int_t baud_base;
- unsigned short close_delay;
- char io_type;
- char reserved_char[1];
- compat_int_t hub6;
- unsigned short closing_wait; /* time to wait before closing */
- unsigned short closing_wait2; /* no longer used... */
- compat_uint_t iomem_base;
- unsigned short iomem_reg_shift;
- unsigned int port_high;
- /* compat_ulong_t iomap_base FIXME */
- compat_int_t reserved[1];
-};
-
-static int serial_struct_ioctl(struct file *file,
- unsigned cmd, struct serial_struct32 __user *ss32)
-{
- typedef struct serial_struct32 SS32;
- int err;
- struct serial_struct __user *ss = compat_alloc_user_space(sizeof(*ss));
- __u32 udata;
- unsigned int base;
- unsigned char *iomem_base;
-
- if (ss == NULL)
- return -EFAULT;
- if (cmd == TIOCSSERIAL) {
- if (copy_in_user(ss, ss32, offsetof(SS32, iomem_base)) ||
- get_user(udata, &ss32->iomem_base))
- return -EFAULT;
- iomem_base = compat_ptr(udata);
- if (put_user(iomem_base, &ss->iomem_base) ||
- convert_in_user(&ss32->iomem_reg_shift,
- &ss->iomem_reg_shift) ||
- convert_in_user(&ss32->port_high, &ss->port_high) ||
- put_user(0UL, &ss->iomap_base))
- return -EFAULT;
- }
- err = do_ioctl(file, cmd, (unsigned long)ss);
- if (cmd == TIOCGSERIAL && err >= 0) {
- if (copy_in_user(ss32, ss, offsetof(SS32, iomem_base)) ||
- get_user(iomem_base, &ss->iomem_base))
- return -EFAULT;
- base = (unsigned long)iomem_base >> 32 ?
- 0xffffffff : (unsigned)(unsigned long)iomem_base;
- if (put_user(base, &ss32->iomem_base) ||
- convert_in_user(&ss->iomem_reg_shift,
- &ss32->iomem_reg_shift) ||
- convert_in_user(&ss->port_high, &ss32->port_high))
- return -EFAULT;
- }
- return err;
-}
-
#define RTC_IRQP_READ32 _IOR('p', 0x0b, compat_ulong_t)
#define RTC_IRQP_SET32 _IOW('p', 0x0c, compat_ulong_t)
#define RTC_EPOCH_READ32 _IOR('p', 0x0d, compat_ulong_t)
@@ -707,60 +520,8 @@ static int compat_ioctl_preallocate(struct file *file,
static unsigned int ioctl_pointer[] = {
/* compatible ioctls first */
-COMPATIBLE_IOCTL(0x4B50) /* KDGHWCLK - not in the kernel, but don't complain */
-COMPATIBLE_IOCTL(0x4B51) /* KDSHWCLK - not in the kernel, but don't complain */
-
-/* Big T */
-COMPATIBLE_IOCTL(TCGETA)
-COMPATIBLE_IOCTL(TCSETA)
-COMPATIBLE_IOCTL(TCSETAW)
-COMPATIBLE_IOCTL(TCSETAF)
-COMPATIBLE_IOCTL(TCSBRK)
-COMPATIBLE_IOCTL(TCXONC)
-COMPATIBLE_IOCTL(TCFLSH)
-COMPATIBLE_IOCTL(TCGETS)
-COMPATIBLE_IOCTL(TCSETS)
-COMPATIBLE_IOCTL(TCSETSW)
-COMPATIBLE_IOCTL(TCSETSF)
-COMPATIBLE_IOCTL(TIOCLINUX)
-COMPATIBLE_IOCTL(TIOCSBRK)
-COMPATIBLE_IOCTL(TIOCGDEV)
-COMPATIBLE_IOCTL(TIOCCBRK)
-COMPATIBLE_IOCTL(TIOCGSID)
-COMPATIBLE_IOCTL(TIOCGICOUNT)
-COMPATIBLE_IOCTL(TIOCGEXCL)
/* Little t */
-COMPATIBLE_IOCTL(TIOCGETD)
-COMPATIBLE_IOCTL(TIOCSETD)
-COMPATIBLE_IOCTL(TIOCEXCL)
-COMPATIBLE_IOCTL(TIOCNXCL)
-COMPATIBLE_IOCTL(TIOCCONS)
-COMPATIBLE_IOCTL(TIOCGSOFTCAR)
-COMPATIBLE_IOCTL(TIOCSSOFTCAR)
-COMPATIBLE_IOCTL(TIOCSWINSZ)
-COMPATIBLE_IOCTL(TIOCGWINSZ)
-COMPATIBLE_IOCTL(TIOCMGET)
-COMPATIBLE_IOCTL(TIOCMBIC)
-COMPATIBLE_IOCTL(TIOCMBIS)
-COMPATIBLE_IOCTL(TIOCMSET)
-COMPATIBLE_IOCTL(TIOCNOTTY)
-COMPATIBLE_IOCTL(TIOCSTI)
COMPATIBLE_IOCTL(TIOCOUTQ)
-COMPATIBLE_IOCTL(TIOCSPGRP)
-COMPATIBLE_IOCTL(TIOCGPGRP)
-COMPATIBLE_IOCTL(TIOCSERGETLSR)
-#ifdef TIOCSRS485
-COMPATIBLE_IOCTL(TIOCSRS485)
-#endif
-#ifdef TIOCGRS485
-COMPATIBLE_IOCTL(TIOCGRS485)
-#endif
-#ifdef TCGETS2
-COMPATIBLE_IOCTL(TCGETS2)
-COMPATIBLE_IOCTL(TCSETS2)
-COMPATIBLE_IOCTL(TCSETSW2)
-COMPATIBLE_IOCTL(TCSETSF2)
-#endif
/* Little f */
COMPATIBLE_IOCTL(FIOCLEX)
COMPATIBLE_IOCTL(FIONCLEX)
@@ -775,23 +536,6 @@ COMPATIBLE_IOCTL(FIGETBSZ)
COMPATIBLE_IOCTL(FIFREEZE)
COMPATIBLE_IOCTL(FITHAW)
COMPATIBLE_IOCTL(FITRIM)
-COMPATIBLE_IOCTL(KDGETKEYCODE)
-COMPATIBLE_IOCTL(KDSETKEYCODE)
-COMPATIBLE_IOCTL(KDGKBTYPE)
-COMPATIBLE_IOCTL(KDGETMODE)
-COMPATIBLE_IOCTL(KDGKBMODE)
-COMPATIBLE_IOCTL(KDGKBMETA)
-COMPATIBLE_IOCTL(KDGKBENT)
-COMPATIBLE_IOCTL(KDSKBENT)
-COMPATIBLE_IOCTL(KDGKBSENT)
-COMPATIBLE_IOCTL(KDSKBSENT)
-COMPATIBLE_IOCTL(KDGKBDIACR)
-COMPATIBLE_IOCTL(KDSKBDIACR)
-COMPATIBLE_IOCTL(KDGKBDIACRUC)
-COMPATIBLE_IOCTL(KDSKBDIACRUC)
-COMPATIBLE_IOCTL(KDKBDREP)
-COMPATIBLE_IOCTL(KDGKBLED)
-COMPATIBLE_IOCTL(KDGETLED)
#ifdef CONFIG_BLOCK
/* Big S */
COMPATIBLE_IOCTL(SCSI_IOCTL_GET_IDLUN)
@@ -1106,19 +850,6 @@ COMPATIBLE_IOCTL(RFCOMMRELEASEDEV)
COMPATIBLE_IOCTL(RFCOMMGETDEVLIST)
COMPATIBLE_IOCTL(RFCOMMGETDEVINFO)
COMPATIBLE_IOCTL(RFCOMMSTEALDLC)
-COMPATIBLE_IOCTL(BNEPCONNADD)
-COMPATIBLE_IOCTL(BNEPCONNDEL)
-COMPATIBLE_IOCTL(BNEPGETCONNLIST)
-COMPATIBLE_IOCTL(BNEPGETCONNINFO)
-COMPATIBLE_IOCTL(BNEPGETSUPPFEAT)
-COMPATIBLE_IOCTL(CMTPCONNADD)
-COMPATIBLE_IOCTL(CMTPCONNDEL)
-COMPATIBLE_IOCTL(CMTPGETCONNLIST)
-COMPATIBLE_IOCTL(CMTPGETCONNINFO)
-COMPATIBLE_IOCTL(HIDPCONNADD)
-COMPATIBLE_IOCTL(HIDPCONNDEL)
-COMPATIBLE_IOCTL(HIDPGETCONNLIST)
-COMPATIBLE_IOCTL(HIDPGETCONNINFO)
/* CAPI */
COMPATIBLE_IOCTL(CAPI_REGISTER)
COMPATIBLE_IOCTL(CAPI_GET_MANUFACTURER)
@@ -1133,11 +864,6 @@ COMPATIBLE_IOCTL(CAPI_SET_FLAGS)
COMPATIBLE_IOCTL(CAPI_CLR_FLAGS)
COMPATIBLE_IOCTL(CAPI_NCCI_OPENCOUNT)
COMPATIBLE_IOCTL(CAPI_NCCI_GETUNIT)
-/* Siemens Gigaset */
-COMPATIBLE_IOCTL(GIGASET_REDIR)
-COMPATIBLE_IOCTL(GIGASET_CONFIG)
-COMPATIBLE_IOCTL(GIGASET_BRKCHARS)
-COMPATIBLE_IOCTL(GIGASET_VERSION)
/* Misc. */
COMPATIBLE_IOCTL(0x41545900) /* ATYIO_CLKR */
COMPATIBLE_IOCTL(0x41545901) /* ATYIO_CLKW */
@@ -1162,82 +888,12 @@ COMPATIBLE_IOCTL(HIDIOCGFLAG)
COMPATIBLE_IOCTL(HIDIOCSFLAG)
COMPATIBLE_IOCTL(HIDIOCGCOLLECTIONINDEX)
COMPATIBLE_IOCTL(HIDIOCGCOLLECTIONINFO)
-/* dvb */
-COMPATIBLE_IOCTL(AUDIO_STOP)
-COMPATIBLE_IOCTL(AUDIO_PLAY)
-COMPATIBLE_IOCTL(AUDIO_PAUSE)
-COMPATIBLE_IOCTL(AUDIO_CONTINUE)
-COMPATIBLE_IOCTL(AUDIO_SELECT_SOURCE)
-COMPATIBLE_IOCTL(AUDIO_SET_MUTE)
-COMPATIBLE_IOCTL(AUDIO_SET_AV_SYNC)
-COMPATIBLE_IOCTL(AUDIO_SET_BYPASS_MODE)
-COMPATIBLE_IOCTL(AUDIO_CHANNEL_SELECT)
-COMPATIBLE_IOCTL(AUDIO_GET_STATUS)
-COMPATIBLE_IOCTL(AUDIO_GET_CAPABILITIES)
-COMPATIBLE_IOCTL(AUDIO_CLEAR_BUFFER)
-COMPATIBLE_IOCTL(AUDIO_SET_ID)
-COMPATIBLE_IOCTL(AUDIO_SET_MIXER)
-COMPATIBLE_IOCTL(AUDIO_SET_STREAMTYPE)
-COMPATIBLE_IOCTL(DMX_START)
-COMPATIBLE_IOCTL(DMX_STOP)
-COMPATIBLE_IOCTL(DMX_SET_FILTER)
-COMPATIBLE_IOCTL(DMX_SET_PES_FILTER)
-COMPATIBLE_IOCTL(DMX_SET_BUFFER_SIZE)
-COMPATIBLE_IOCTL(DMX_GET_PES_PIDS)
-COMPATIBLE_IOCTL(DMX_GET_STC)
-COMPATIBLE_IOCTL(DMX_REQBUFS)
-COMPATIBLE_IOCTL(DMX_QUERYBUF)
-COMPATIBLE_IOCTL(DMX_EXPBUF)
-COMPATIBLE_IOCTL(DMX_QBUF)
-COMPATIBLE_IOCTL(DMX_DQBUF)
-COMPATIBLE_IOCTL(VIDEO_STOP)
-COMPATIBLE_IOCTL(VIDEO_PLAY)
-COMPATIBLE_IOCTL(VIDEO_FREEZE)
-COMPATIBLE_IOCTL(VIDEO_CONTINUE)
-COMPATIBLE_IOCTL(VIDEO_SELECT_SOURCE)
-COMPATIBLE_IOCTL(VIDEO_SET_BLANK)
-COMPATIBLE_IOCTL(VIDEO_GET_STATUS)
-COMPATIBLE_IOCTL(VIDEO_SET_DISPLAY_FORMAT)
-COMPATIBLE_IOCTL(VIDEO_FAST_FORWARD)
-COMPATIBLE_IOCTL(VIDEO_SLOWMOTION)
-COMPATIBLE_IOCTL(VIDEO_GET_CAPABILITIES)
-COMPATIBLE_IOCTL(VIDEO_CLEAR_BUFFER)
-COMPATIBLE_IOCTL(VIDEO_SET_STREAMTYPE)
-COMPATIBLE_IOCTL(VIDEO_SET_FORMAT)
-COMPATIBLE_IOCTL(VIDEO_GET_SIZE)
-/* cec */
-COMPATIBLE_IOCTL(CEC_ADAP_G_CAPS)
-COMPATIBLE_IOCTL(CEC_ADAP_G_LOG_ADDRS)
-COMPATIBLE_IOCTL(CEC_ADAP_S_LOG_ADDRS)
-COMPATIBLE_IOCTL(CEC_ADAP_G_PHYS_ADDR)
-COMPATIBLE_IOCTL(CEC_ADAP_S_PHYS_ADDR)
-COMPATIBLE_IOCTL(CEC_G_MODE)
-COMPATIBLE_IOCTL(CEC_S_MODE)
-COMPATIBLE_IOCTL(CEC_TRANSMIT)
-COMPATIBLE_IOCTL(CEC_RECEIVE)
-COMPATIBLE_IOCTL(CEC_DQEVENT)
-
/* joystick */
COMPATIBLE_IOCTL(JSIOCGVERSION)
COMPATIBLE_IOCTL(JSIOCGAXES)
COMPATIBLE_IOCTL(JSIOCGBUTTONS)
COMPATIBLE_IOCTL(JSIOCGNAME(0))
-#ifdef TIOCGLTC
-COMPATIBLE_IOCTL(TIOCGLTC)
-COMPATIBLE_IOCTL(TIOCSLTC)
-#endif
-#ifdef TIOCSTART
-/*
- * For these two we have definitions in ioctls.h and/or termios.h on
- * some architectures but no actual implemention. Some applications
- * like bash call them if they are defined in the headers, so we provide
- * entries here to avoid syslog message spew.
- */
-COMPATIBLE_IOCTL(TIOCSTART)
-COMPATIBLE_IOCTL(TIOCSTOP)
-#endif
-
/* fat 'r' ioctls. These are handled by fat with ->compat_ioctl,
but we don't want warnings on other file systems. So declare
them as compatible here. */
@@ -1293,22 +949,12 @@ static long do_ioctl_trans(unsigned int cmd,
case MTIOCPOS32:
return mt_ioctl_trans(file, cmd, argp);
#endif
- /* Serial */
- case TIOCGSERIAL:
- case TIOCSSERIAL:
- return serial_struct_ioctl(file, cmd, argp);
/* Not implemented in the native kernel */
case RTC_IRQP_READ32:
case RTC_IRQP_SET32:
case RTC_EPOCH_READ32:
case RTC_EPOCH_SET32:
return rtc_ioctl(file, cmd, argp);
-
- /* dvb */
- case VIDEO_GET_EVENT:
- return do_video_get_event(file, cmd, argp);
- case VIDEO_STILLPICTURE:
- return do_video_stillpicture(file, cmd, argp);
}
/*
@@ -1316,24 +962,11 @@ static long do_ioctl_trans(unsigned int cmd,
* so we must not do a compat_ptr() translation.
*/
switch (cmd) {
- /* Big T */
- case TCSBRKP:
- case TIOCMIWAIT:
- case TIOCSCTTY:
/* RAID */
case HOT_REMOVE_DISK:
case HOT_ADD_DISK:
case SET_DISK_FAULTY:
case SET_BITMAP_FILE:
- /* Big K */
- case KDSIGACCEPT:
- case KIOCSOUND:
- case KDMKTONE:
- case KDSETMODE:
- case KDSKBMODE:
- case KDSKBMETA:
- case KDSKBLED:
- case KDSETLED:
return vfs_ioctl(file, cmd, arg);
}
diff --git a/fs/coredump.c b/fs/coredump.c
index 1e2c87acac9b..e42e17e55bfd 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -536,7 +536,7 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
return err;
}
-void do_coredump(const siginfo_t *siginfo)
+void do_coredump(const kernel_siginfo_t *siginfo)
{
struct core_state core_state;
struct core_name cn;
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index f408994fc632..9352487bd0fc 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -202,7 +202,8 @@ static void *cramfs_blkdev_read(struct super_block *sb, unsigned int offset,
continue;
blk_offset = (blocknr - buffer_blocknr[i]) << PAGE_SHIFT;
blk_offset += offset;
- if (blk_offset + len > BUFFER_SIZE)
+ if (blk_offset > BUFFER_SIZE ||
+ blk_offset + len > BUFFER_SIZE)
continue;
return read_buffers[i] + blk_offset;
}
@@ -418,9 +419,12 @@ static int cramfs_physmem_mmap(struct file *file, struct vm_area_struct *vma)
int i;
vma->vm_flags |= VM_MIXEDMAP;
for (i = 0; i < pages && !ret; i++) {
+ vm_fault_t vmf;
unsigned long off = i * PAGE_SIZE;
pfn_t pfn = phys_to_pfn_t(address + off, PFN_DEV);
- ret = vm_insert_mixed(vma, vma->vm_start + off, pfn);
+ vmf = vmf_insert_mixed(vma, vma->vm_start + off, pfn);
+ if (vmf & VM_FAULT_ERROR)
+ ret = vm_fault_to_errno(vmf, 0);
}
}
@@ -869,8 +873,8 @@ static int cramfs_readpage(struct file *file, struct page *page)
if (unlikely(block_start & CRAMFS_BLK_FLAG_DIRECT_PTR)) {
/* See comments on earlier code. */
u32 prev_start = block_start;
- block_start = prev_start & ~CRAMFS_BLK_FLAGS;
- block_start <<= CRAMFS_BLK_DIRECT_PTR_SHIFT;
+ block_start = prev_start & ~CRAMFS_BLK_FLAGS;
+ block_start <<= CRAMFS_BLK_DIRECT_PTR_SHIFT;
if (prev_start & CRAMFS_BLK_FLAG_UNCOMPRESSED) {
block_start += PAGE_SIZE;
} else {
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index 39c20ef26db4..79debfc9cef9 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -83,10 +83,6 @@ static inline bool fscrypt_valid_enc_modes(u32 contents_mode,
filenames_mode == FS_ENCRYPTION_MODE_AES_256_CTS)
return true;
- if (contents_mode == FS_ENCRYPTION_MODE_SPECK128_256_XTS &&
- filenames_mode == FS_ENCRYPTION_MODE_SPECK128_256_CTS)
- return true;
-
return false;
}
diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c
index e997ca51192f..7874c9bb2fc5 100644
--- a/fs/crypto/keyinfo.c
+++ b/fs/crypto/keyinfo.c
@@ -174,16 +174,6 @@ static struct fscrypt_mode {
.cipher_str = "cts(cbc(aes))",
.keysize = 16,
},
- [FS_ENCRYPTION_MODE_SPECK128_256_XTS] = {
- .friendly_name = "Speck128/256-XTS",
- .cipher_str = "xts(speck128)",
- .keysize = 64,
- },
- [FS_ENCRYPTION_MODE_SPECK128_256_CTS] = {
- .friendly_name = "Speck128/256-CTS-CBC",
- .cipher_str = "cts(cbc(speck128))",
- .keysize = 32,
- },
};
static struct fscrypt_mode *
diff --git a/fs/dax.c b/fs/dax.c
index 0fb270f0a0ef..616e36ea6aaa 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -38,6 +38,17 @@
#define CREATE_TRACE_POINTS
#include <trace/events/fs_dax.h>
+static inline unsigned int pe_order(enum page_entry_size pe_size)
+{
+ if (pe_size == PE_SIZE_PTE)
+ return PAGE_SHIFT - PAGE_SHIFT;
+ if (pe_size == PE_SIZE_PMD)
+ return PMD_SHIFT - PAGE_SHIFT;
+ if (pe_size == PE_SIZE_PUD)
+ return PUD_SHIFT - PAGE_SHIFT;
+ return ~0;
+}
+
/* We choose 4096 entries - same as per-zone page wait tables */
#define DAX_WAIT_TABLE_BITS 12
#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)
@@ -46,6 +57,9 @@
#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
#define PG_PMD_NR (PMD_SIZE >> PAGE_SHIFT)
+/* The order of a PMD entry */
+#define PMD_ORDER (PMD_SHIFT - PAGE_SHIFT)
+
static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
static int __init init_dax_wait_table(void)
@@ -59,63 +73,74 @@ static int __init init_dax_wait_table(void)
fs_initcall(init_dax_wait_table);
/*
- * We use lowest available bit in exceptional entry for locking, one bit for
- * the entry size (PMD) and two more to tell us if the entry is a zero page or
- * an empty entry that is just used for locking. In total four special bits.
+ * DAX pagecache entries use XArray value entries so they can't be mistaken
+ * for pages. We use one bit for locking, one bit for the entry size (PMD)
+ * and two more to tell us if the entry is a zero page or an empty entry that
+ * is just used for locking. In total four special bits.
*
* If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE
* and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
* block allocation.
*/
-#define RADIX_DAX_SHIFT (RADIX_TREE_EXCEPTIONAL_SHIFT + 4)
-#define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT)
-#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
-#define RADIX_DAX_ZERO_PAGE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
-#define RADIX_DAX_EMPTY (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3))
+#define DAX_SHIFT (4)
+#define DAX_LOCKED (1UL << 0)
+#define DAX_PMD (1UL << 1)
+#define DAX_ZERO_PAGE (1UL << 2)
+#define DAX_EMPTY (1UL << 3)
-static unsigned long dax_radix_pfn(void *entry)
+static unsigned long dax_to_pfn(void *entry)
{
- return (unsigned long)entry >> RADIX_DAX_SHIFT;
+ return xa_to_value(entry) >> DAX_SHIFT;
}
-static void *dax_radix_locked_entry(unsigned long pfn, unsigned long flags)
+static void *dax_make_entry(pfn_t pfn, unsigned long flags)
{
- return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags |
- (pfn << RADIX_DAX_SHIFT) | RADIX_DAX_ENTRY_LOCK);
+ return xa_mk_value(flags | (pfn_t_to_pfn(pfn) << DAX_SHIFT));
}
-static unsigned int dax_radix_order(void *entry)
+static void *dax_make_page_entry(struct page *page)
{
- if ((unsigned long)entry & RADIX_DAX_PMD)
- return PMD_SHIFT - PAGE_SHIFT;
+ pfn_t pfn = page_to_pfn_t(page);
+ return dax_make_entry(pfn, PageHead(page) ? DAX_PMD : 0);
+}
+
+static bool dax_is_locked(void *entry)
+{
+ return xa_to_value(entry) & DAX_LOCKED;
+}
+
+static unsigned int dax_entry_order(void *entry)
+{
+ if (xa_to_value(entry) & DAX_PMD)
+ return PMD_ORDER;
return 0;
}
static int dax_is_pmd_entry(void *entry)
{
- return (unsigned long)entry & RADIX_DAX_PMD;
+ return xa_to_value(entry) & DAX_PMD;
}
static int dax_is_pte_entry(void *entry)
{
- return !((unsigned long)entry & RADIX_DAX_PMD);
+ return !(xa_to_value(entry) & DAX_PMD);
}
static int dax_is_zero_entry(void *entry)
{
- return (unsigned long)entry & RADIX_DAX_ZERO_PAGE;
+ return xa_to_value(entry) & DAX_ZERO_PAGE;
}
static int dax_is_empty_entry(void *entry)
{
- return (unsigned long)entry & RADIX_DAX_EMPTY;
+ return xa_to_value(entry) & DAX_EMPTY;
}
/*
- * DAX radix tree locking
+ * DAX page cache entry locking
*/
struct exceptional_entry_key {
- struct address_space *mapping;
+ struct xarray *xa;
pgoff_t entry_start;
};
@@ -124,10 +149,11 @@ struct wait_exceptional_entry_queue {
struct exceptional_entry_key key;
};
-static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
- pgoff_t index, void *entry, struct exceptional_entry_key *key)
+static wait_queue_head_t *dax_entry_waitqueue(struct xa_state *xas,
+ void *entry, struct exceptional_entry_key *key)
{
unsigned long hash;
+ unsigned long index = xas->xa_index;
/*
* If 'entry' is a PMD, align the 'index' that we use for the wait
@@ -136,22 +162,21 @@ static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
*/
if (dax_is_pmd_entry(entry))
index &= ~PG_PMD_COLOUR;
-
- key->mapping = mapping;
+ key->xa = xas->xa;
key->entry_start = index;
- hash = hash_long((unsigned long)mapping ^ index, DAX_WAIT_TABLE_BITS);
+ hash = hash_long((unsigned long)xas->xa ^ index, DAX_WAIT_TABLE_BITS);
return wait_table + hash;
}
-static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mode,
- int sync, void *keyp)
+static int wake_exceptional_entry_func(wait_queue_entry_t *wait,
+ unsigned int mode, int sync, void *keyp)
{
struct exceptional_entry_key *key = keyp;
struct wait_exceptional_entry_queue *ewait =
container_of(wait, struct wait_exceptional_entry_queue, wait);
- if (key->mapping != ewait->key.mapping ||
+ if (key->xa != ewait->key.xa ||
key->entry_start != ewait->key.entry_start)
return 0;
return autoremove_wake_function(wait, mode, sync, NULL);
@@ -162,13 +187,12 @@ static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mo
* The important information it's conveying is whether the entry at
* this index used to be a PMD entry.
*/
-static void dax_wake_mapping_entry_waiter(struct address_space *mapping,
- pgoff_t index, void *entry, bool wake_all)
+static void dax_wake_entry(struct xa_state *xas, void *entry, bool wake_all)
{
struct exceptional_entry_key key;
wait_queue_head_t *wq;
- wq = dax_entry_waitqueue(mapping, index, entry, &key);
+ wq = dax_entry_waitqueue(xas, entry, &key);
/*
* Checking for locked entry and prepare_to_wait_exclusive() happens
@@ -181,55 +205,16 @@ static void dax_wake_mapping_entry_waiter(struct address_space *mapping,
}
/*
- * Check whether the given slot is locked. Must be called with the i_pages
- * lock held.
- */
-static inline int slot_locked(struct address_space *mapping, void **slot)
-{
- unsigned long entry = (unsigned long)
- radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock);
- return entry & RADIX_DAX_ENTRY_LOCK;
-}
-
-/*
- * Mark the given slot as locked. Must be called with the i_pages lock held.
- */
-static inline void *lock_slot(struct address_space *mapping, void **slot)
-{
- unsigned long entry = (unsigned long)
- radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock);
-
- entry |= RADIX_DAX_ENTRY_LOCK;
- radix_tree_replace_slot(&mapping->i_pages, slot, (void *)entry);
- return (void *)entry;
-}
-
-/*
- * Mark the given slot as unlocked. Must be called with the i_pages lock held.
- */
-static inline void *unlock_slot(struct address_space *mapping, void **slot)
-{
- unsigned long entry = (unsigned long)
- radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock);
-
- entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK;
- radix_tree_replace_slot(&mapping->i_pages, slot, (void *)entry);
- return (void *)entry;
-}
-
-/*
- * Lookup entry in radix tree, wait for it to become unlocked if it is
- * exceptional entry and return it. The caller must call
- * put_unlocked_mapping_entry() when he decided not to lock the entry or
- * put_locked_mapping_entry() when he locked the entry and now wants to
- * unlock it.
+ * Look up entry in page cache, wait for it to become unlocked if it
+ * is a DAX entry and return it. The caller must subsequently call
+ * put_unlocked_entry() if it did not lock the entry or dax_unlock_entry()
+ * if it did.
*
* Must be called with the i_pages lock held.
*/
-static void *__get_unlocked_mapping_entry(struct address_space *mapping,
- pgoff_t index, void ***slotp, bool (*wait_fn)(void))
+static void *get_unlocked_entry(struct xa_state *xas)
{
- void *entry, **slot;
+ void *entry;
struct wait_exceptional_entry_queue ewait;
wait_queue_head_t *wq;
@@ -237,80 +222,54 @@ static void *__get_unlocked_mapping_entry(struct address_space *mapping,
ewait.wait.func = wake_exceptional_entry_func;
for (;;) {
- bool revalidate;
-
- entry = __radix_tree_lookup(&mapping->i_pages, index, NULL,
- &slot);
- if (!entry ||
- WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)) ||
- !slot_locked(mapping, slot)) {
- if (slotp)
- *slotp = slot;
+ entry = xas_load(xas);
+ if (!entry || xa_is_internal(entry) ||
+ WARN_ON_ONCE(!xa_is_value(entry)) ||
+ !dax_is_locked(entry))
return entry;
- }
- wq = dax_entry_waitqueue(mapping, index, entry, &ewait.key);
+ wq = dax_entry_waitqueue(xas, entry, &ewait.key);
prepare_to_wait_exclusive(wq, &ewait.wait,
TASK_UNINTERRUPTIBLE);
- xa_unlock_irq(&mapping->i_pages);
- revalidate = wait_fn();
+ xas_unlock_irq(xas);
+ xas_reset(xas);
+ schedule();
finish_wait(wq, &ewait.wait);
- xa_lock_irq(&mapping->i_pages);
- if (revalidate)
- return ERR_PTR(-EAGAIN);
+ xas_lock_irq(xas);
}
}
-static bool entry_wait(void)
-{
- schedule();
- /*
- * Never return an ERR_PTR() from
- * __get_unlocked_mapping_entry(), just keep looping.
- */
- return false;
-}
-
-static void *get_unlocked_mapping_entry(struct address_space *mapping,
- pgoff_t index, void ***slotp)
+static void put_unlocked_entry(struct xa_state *xas, void *entry)
{
- return __get_unlocked_mapping_entry(mapping, index, slotp, entry_wait);
-}
-
-static void unlock_mapping_entry(struct address_space *mapping, pgoff_t index)
-{
- void *entry, **slot;
-
- xa_lock_irq(&mapping->i_pages);
- entry = __radix_tree_lookup(&mapping->i_pages, index, NULL, &slot);
- if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) ||
- !slot_locked(mapping, slot))) {
- xa_unlock_irq(&mapping->i_pages);
- return;
- }
- unlock_slot(mapping, slot);
- xa_unlock_irq(&mapping->i_pages);
- dax_wake_mapping_entry_waiter(mapping, index, entry, false);
+ /* If we were the only waiter woken, wake the next one */
+ if (entry)
+ dax_wake_entry(xas, entry, false);
}
-static void put_locked_mapping_entry(struct address_space *mapping,
- pgoff_t index)
+/*
+ * We used the xa_state to get the entry, but then we locked the entry and
+ * dropped the xa_lock, so we know the xa_state is stale and must be reset
+ * before use.
+ */
+static void dax_unlock_entry(struct xa_state *xas, void *entry)
{
- unlock_mapping_entry(mapping, index);
+ void *old;
+
+ xas_reset(xas);
+ xas_lock_irq(xas);
+ old = xas_store(xas, entry);
+ xas_unlock_irq(xas);
+ BUG_ON(!dax_is_locked(old));
+ dax_wake_entry(xas, entry, false);
}
/*
- * Called when we are done with radix tree entry we looked up via
- * get_unlocked_mapping_entry() and which we didn't lock in the end.
+ * Return: The entry stored at this location before it was locked.
*/
-static void put_unlocked_mapping_entry(struct address_space *mapping,
- pgoff_t index, void *entry)
+static void *dax_lock_entry(struct xa_state *xas, void *entry)
{
- if (!entry)
- return;
-
- /* We have to wake up next waiter for the radix tree entry lock */
- dax_wake_mapping_entry_waiter(mapping, index, entry, false);
+ unsigned long v = xa_to_value(entry);
+ return xas_store(xas, xa_mk_value(v | DAX_LOCKED));
}
static unsigned long dax_entry_size(void *entry)
@@ -325,9 +284,9 @@ static unsigned long dax_entry_size(void *entry)
return PAGE_SIZE;
}
-static unsigned long dax_radix_end_pfn(void *entry)
+static unsigned long dax_end_pfn(void *entry)
{
- return dax_radix_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE;
+ return dax_to_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE;
}
/*
@@ -335,8 +294,8 @@ static unsigned long dax_radix_end_pfn(void *entry)
* 'empty' and 'zero' entries.
*/
#define for_each_mapped_pfn(entry, pfn) \
- for (pfn = dax_radix_pfn(entry); \
- pfn < dax_radix_end_pfn(entry); pfn++)
+ for (pfn = dax_to_pfn(entry); \
+ pfn < dax_end_pfn(entry); pfn++)
/*
* TODO: for reflink+dax we need a way to associate a single page with
@@ -393,33 +352,16 @@ static struct page *dax_busy_page(void *entry)
return NULL;
}
-static bool entry_wait_revalidate(void)
-{
- rcu_read_unlock();
- schedule();
- rcu_read_lock();
-
- /*
- * Tell __get_unlocked_mapping_entry() to take a break, we need
- * to revalidate page->mapping after dropping locks
- */
- return true;
-}
-
bool dax_lock_mapping_entry(struct page *page)
{
- pgoff_t index;
- struct inode *inode;
- bool did_lock = false;
- void *entry = NULL, **slot;
- struct address_space *mapping;
+ XA_STATE(xas, NULL, 0);
+ void *entry;
- rcu_read_lock();
for (;;) {
- mapping = READ_ONCE(page->mapping);
+ struct address_space *mapping = READ_ONCE(page->mapping);
if (!dax_mapping(mapping))
- break;
+ return false;
/*
* In the device-dax case there's no need to lock, a
@@ -428,98 +370,94 @@ bool dax_lock_mapping_entry(struct page *page)
* otherwise we would not have a valid pfn_to_page()
* translation.
*/
- inode = mapping->host;
- if (S_ISCHR(inode->i_mode)) {
- did_lock = true;
- break;
- }
+ if (S_ISCHR(mapping->host->i_mode))
+ return true;
- xa_lock_irq(&mapping->i_pages);
+ xas.xa = &mapping->i_pages;
+ xas_lock_irq(&xas);
if (mapping != page->mapping) {
- xa_unlock_irq(&mapping->i_pages);
+ xas_unlock_irq(&xas);
continue;
}
- index = page->index;
-
- entry = __get_unlocked_mapping_entry(mapping, index, &slot,
- entry_wait_revalidate);
- if (!entry) {
- xa_unlock_irq(&mapping->i_pages);
- break;
- } else if (IS_ERR(entry)) {
- xa_unlock_irq(&mapping->i_pages);
- WARN_ON_ONCE(PTR_ERR(entry) != -EAGAIN);
- continue;
+ xas_set(&xas, page->index);
+ entry = xas_load(&xas);
+ if (dax_is_locked(entry)) {
+ entry = get_unlocked_entry(&xas);
+ /* Did the page move while we slept? */
+ if (dax_to_pfn(entry) != page_to_pfn(page)) {
+ xas_unlock_irq(&xas);
+ continue;
+ }
}
- lock_slot(mapping, slot);
- did_lock = true;
- xa_unlock_irq(&mapping->i_pages);
- break;
+ dax_lock_entry(&xas, entry);
+ xas_unlock_irq(&xas);
+ return true;
}
- rcu_read_unlock();
-
- return did_lock;
}
void dax_unlock_mapping_entry(struct page *page)
{
struct address_space *mapping = page->mapping;
- struct inode *inode = mapping->host;
+ XA_STATE(xas, &mapping->i_pages, page->index);
- if (S_ISCHR(inode->i_mode))
+ if (S_ISCHR(mapping->host->i_mode))
return;
- unlock_mapping_entry(mapping, page->index);
+ dax_unlock_entry(&xas, dax_make_page_entry(page));
}
/*
- * Find radix tree entry at given index. If it points to an exceptional entry,
- * return it with the radix tree entry locked. If the radix tree doesn't
- * contain given index, create an empty exceptional entry for the index and
- * return with it locked.
+ * Find page cache entry at given index. If it is a DAX entry, return it
+ * with the entry locked. If the page cache doesn't contain an entry at
+ * that index, add a locked empty entry.
*
- * When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will
- * either return that locked entry or will return an error. This error will
- * happen if there are any 4k entries within the 2MiB range that we are
- * requesting.
+ * When requesting an entry with size DAX_PMD, grab_mapping_entry() will
+ * either return that locked entry or will return VM_FAULT_FALLBACK.
+ * This will happen if there are any PTE entries within the PMD range
+ * that we are requesting.
*
- * We always favor 4k entries over 2MiB entries. There isn't a flow where we
- * evict 4k entries in order to 'upgrade' them to a 2MiB entry. A 2MiB
- * insertion will fail if it finds any 4k entries already in the tree, and a
- * 4k insertion will cause an existing 2MiB entry to be unmapped and
- * downgraded to 4k entries. This happens for both 2MiB huge zero pages as
- * well as 2MiB empty entries.
+ * We always favor PTE entries over PMD entries. There isn't a flow where we
+ * evict PTE entries in order to 'upgrade' them to a PMD entry. A PMD
+ * insertion will fail if it finds any PTE entries already in the tree, and a
+ * PTE insertion will cause an existing PMD entry to be unmapped and
+ * downgraded to PTE entries. This happens for both PMD zero pages as
+ * well as PMD empty entries.
*
- * The exception to this downgrade path is for 2MiB DAX PMD entries that have
- * real storage backing them. We will leave these real 2MiB DAX entries in
- * the tree, and PTE writes will simply dirty the entire 2MiB DAX entry.
+ * The exception to this downgrade path is for PMD entries that have
+ * real storage backing them. We will leave these real PMD entries in
+ * the tree, and PTE writes will simply dirty the entire PMD entry.
*
* Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
* persistent memory the benefit is doubtful. We can add that later if we can
* show it helps.
+ *
+ * On error, this function does not return an ERR_PTR. Instead it returns
+ * a VM_FAULT code, encoded as an xarray internal entry. The ERR_PTR values
+ * overlap with xarray value entries.
*/
-static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index,
- unsigned long size_flag)
+static void *grab_mapping_entry(struct xa_state *xas,
+ struct address_space *mapping, unsigned long size_flag)
{
- bool pmd_downgrade = false; /* splitting 2MiB entry into 4k entries? */
- void *entry, **slot;
-
-restart:
- xa_lock_irq(&mapping->i_pages);
- entry = get_unlocked_mapping_entry(mapping, index, &slot);
+ unsigned long index = xas->xa_index;
+ bool pmd_downgrade = false; /* splitting PMD entry into PTE entries? */
+ void *entry;
- if (WARN_ON_ONCE(entry && !radix_tree_exceptional_entry(entry))) {
- entry = ERR_PTR(-EIO);
- goto out_unlock;
- }
+retry:
+ xas_lock_irq(xas);
+ entry = get_unlocked_entry(xas);
+ if (xa_is_internal(entry))
+ goto fallback;
if (entry) {
- if (size_flag & RADIX_DAX_PMD) {
+ if (WARN_ON_ONCE(!xa_is_value(entry))) {
+ xas_set_err(xas, EIO);
+ goto out_unlock;
+ }
+
+ if (size_flag & DAX_PMD) {
if (dax_is_pte_entry(entry)) {
- put_unlocked_mapping_entry(mapping, index,
- entry);
- entry = ERR_PTR(-EEXIST);
- goto out_unlock;
+ put_unlocked_entry(xas, entry);
+ goto fallback;
}
} else { /* trying to grab a PTE entry */
if (dax_is_pmd_entry(entry) &&
@@ -530,87 +468,57 @@ restart:
}
}
- /* No entry for given index? Make sure radix tree is big enough. */
- if (!entry || pmd_downgrade) {
- int err;
-
- if (pmd_downgrade) {
- /*
- * Make sure 'entry' remains valid while we drop
- * the i_pages lock.
- */
- entry = lock_slot(mapping, slot);
- }
+ if (pmd_downgrade) {
+ /*
+ * Make sure 'entry' remains valid while we drop
+ * the i_pages lock.
+ */
+ dax_lock_entry(xas, entry);
- xa_unlock_irq(&mapping->i_pages);
/*
* Besides huge zero pages the only other thing that gets
* downgraded are empty entries which don't need to be
* unmapped.
*/
- if (pmd_downgrade && dax_is_zero_entry(entry))
- unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR,
- PG_PMD_NR, false);
-
- err = radix_tree_preload(
- mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
- if (err) {
- if (pmd_downgrade)
- put_locked_mapping_entry(mapping, index);
- return ERR_PTR(err);
- }
- xa_lock_irq(&mapping->i_pages);
-
- if (!entry) {
- /*
- * We needed to drop the i_pages lock while calling
- * radix_tree_preload() and we didn't have an entry to
- * lock. See if another thread inserted an entry at
- * our index during this time.
- */
- entry = __radix_tree_lookup(&mapping->i_pages, index,
- NULL, &slot);
- if (entry) {
- radix_tree_preload_end();
- xa_unlock_irq(&mapping->i_pages);
- goto restart;
- }
+ if (dax_is_zero_entry(entry)) {
+ xas_unlock_irq(xas);
+ unmap_mapping_pages(mapping,
+ xas->xa_index & ~PG_PMD_COLOUR,
+ PG_PMD_NR, false);
+ xas_reset(xas);
+ xas_lock_irq(xas);
}
- if (pmd_downgrade) {
- dax_disassociate_entry(entry, mapping, false);
- radix_tree_delete(&mapping->i_pages, index);
- mapping->nrexceptional--;
- dax_wake_mapping_entry_waiter(mapping, index, entry,
- true);
- }
+ dax_disassociate_entry(entry, mapping, false);
+ xas_store(xas, NULL); /* undo the PMD join */
+ dax_wake_entry(xas, entry, true);
+ mapping->nrexceptional--;
+ entry = NULL;
+ xas_set(xas, index);
+ }
- entry = dax_radix_locked_entry(0, size_flag | RADIX_DAX_EMPTY);
-
- err = __radix_tree_insert(&mapping->i_pages, index,
- dax_radix_order(entry), entry);
- radix_tree_preload_end();
- if (err) {
- xa_unlock_irq(&mapping->i_pages);
- /*
- * Our insertion of a DAX entry failed, most likely
- * because we were inserting a PMD entry and it
- * collided with a PTE sized entry at a different
- * index in the PMD range. We haven't inserted
- * anything into the radix tree and have no waiters to
- * wake.
- */
- return ERR_PTR(err);
- }
- /* Good, we have inserted empty locked entry into the tree. */
+ if (entry) {
+ dax_lock_entry(xas, entry);
+ } else {
+ entry = dax_make_entry(pfn_to_pfn_t(0), size_flag | DAX_EMPTY);
+ dax_lock_entry(xas, entry);
+ if (xas_error(xas))
+ goto out_unlock;
mapping->nrexceptional++;
- xa_unlock_irq(&mapping->i_pages);
- return entry;
}
- entry = lock_slot(mapping, slot);
- out_unlock:
- xa_unlock_irq(&mapping->i_pages);
+
+out_unlock:
+ xas_unlock_irq(xas);
+ if (xas_nomem(xas, mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM))
+ goto retry;
+ if (xas->xa_node == XA_ERROR(-ENOMEM))
+ return xa_mk_internal(VM_FAULT_OOM);
+ if (xas_error(xas))
+ return xa_mk_internal(VM_FAULT_SIGBUS);
return entry;
+fallback:
+ xas_unlock_irq(xas);
+ return xa_mk_internal(VM_FAULT_FALLBACK);
}
/**
@@ -630,11 +538,10 @@ restart:
*/
struct page *dax_layout_busy_page(struct address_space *mapping)
{
- pgoff_t indices[PAGEVEC_SIZE];
+ XA_STATE(xas, &mapping->i_pages, 0);
+ void *entry;
+ unsigned int scanned = 0;
struct page *page = NULL;
- struct pagevec pvec;
- pgoff_t index, end;
- unsigned i;
/*
* In the 'limited' case get_user_pages() for dax is disabled.
@@ -645,13 +552,9 @@ struct page *dax_layout_busy_page(struct address_space *mapping)
if (!dax_mapping(mapping) || !mapping_mapped(mapping))
return NULL;
- pagevec_init(&pvec);
- index = 0;
- end = -1;
-
/*
* If we race get_user_pages_fast() here either we'll see the
- * elevated page count in the pagevec_lookup and wait, or
+ * elevated page count in the iteration and wait, or
* get_user_pages_fast() will see that the page it took a reference
* against is no longer mapped in the page tables and bail to the
* get_user_pages() slow path. The slow path is protected by
@@ -663,94 +566,68 @@ struct page *dax_layout_busy_page(struct address_space *mapping)
*/
unmap_mapping_range(mapping, 0, 0, 1);
- while (index < end && pagevec_lookup_entries(&pvec, mapping, index,
- min(end - index, (pgoff_t)PAGEVEC_SIZE),
- indices)) {
- pgoff_t nr_pages = 1;
-
- for (i = 0; i < pagevec_count(&pvec); i++) {
- struct page *pvec_ent = pvec.pages[i];
- void *entry;
-
- index = indices[i];
- if (index >= end)
- break;
-
- if (WARN_ON_ONCE(
- !radix_tree_exceptional_entry(pvec_ent)))
- continue;
-
- xa_lock_irq(&mapping->i_pages);
- entry = get_unlocked_mapping_entry(mapping, index, NULL);
- if (entry) {
- page = dax_busy_page(entry);
- /*
- * Account for multi-order entries at
- * the end of the pagevec.
- */
- if (i + 1 >= pagevec_count(&pvec))
- nr_pages = 1UL << dax_radix_order(entry);
- }
- put_unlocked_mapping_entry(mapping, index, entry);
- xa_unlock_irq(&mapping->i_pages);
- if (page)
- break;
- }
-
- /*
- * We don't expect normal struct page entries to exist in our
- * tree, but we keep these pagevec calls so that this code is
- * consistent with the common pattern for handling pagevecs
- * throughout the kernel.
- */
- pagevec_remove_exceptionals(&pvec);
- pagevec_release(&pvec);
- index += nr_pages;
-
+ xas_lock_irq(&xas);
+ xas_for_each(&xas, entry, ULONG_MAX) {
+ if (WARN_ON_ONCE(!xa_is_value(entry)))
+ continue;
+ if (unlikely(dax_is_locked(entry)))
+ entry = get_unlocked_entry(&xas);
+ if (entry)
+ page = dax_busy_page(entry);
+ put_unlocked_entry(&xas, entry);
if (page)
break;
+ if (++scanned % XA_CHECK_SCHED)
+ continue;
+
+ xas_pause(&xas);
+ xas_unlock_irq(&xas);
+ cond_resched();
+ xas_lock_irq(&xas);
}
+ xas_unlock_irq(&xas);
return page;
}
EXPORT_SYMBOL_GPL(dax_layout_busy_page);
-static int __dax_invalidate_mapping_entry(struct address_space *mapping,
+static int __dax_invalidate_entry(struct address_space *mapping,
pgoff_t index, bool trunc)
{
+ XA_STATE(xas, &mapping->i_pages, index);
int ret = 0;
void *entry;
- struct radix_tree_root *pages = &mapping->i_pages;
- xa_lock_irq(pages);
- entry = get_unlocked_mapping_entry(mapping, index, NULL);
- if (!entry || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)))
+ xas_lock_irq(&xas);
+ entry = get_unlocked_entry(&xas);
+ if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
goto out;
if (!trunc &&
- (radix_tree_tag_get(pages, index, PAGECACHE_TAG_DIRTY) ||
- radix_tree_tag_get(pages, index, PAGECACHE_TAG_TOWRITE)))
+ (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY) ||
+ xas_get_mark(&xas, PAGECACHE_TAG_TOWRITE)))
goto out;
dax_disassociate_entry(entry, mapping, trunc);
- radix_tree_delete(pages, index);
+ xas_store(&xas, NULL);
mapping->nrexceptional--;
ret = 1;
out:
- put_unlocked_mapping_entry(mapping, index, entry);
- xa_unlock_irq(pages);
+ put_unlocked_entry(&xas, entry);
+ xas_unlock_irq(&xas);
return ret;
}
+
/*
- * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree
- * entry to get unlocked before deleting it.
+ * Delete DAX entry at @index from @mapping. Wait for it
+ * to be unlocked before deleting it.
*/
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
{
- int ret = __dax_invalidate_mapping_entry(mapping, index, true);
+ int ret = __dax_invalidate_entry(mapping, index, true);
/*
* This gets called from truncate / punch_hole path. As such, the caller
* must hold locks protecting against concurrent modifications of the
- * radix tree (usually fs-private i_mmap_sem for writing). Since the
- * caller has seen exceptional entry for this index, we better find it
+ * page cache (usually fs-private i_mmap_sem for writing). Since the
+ * caller has seen a DAX entry for this index, we better find it
* at that index as well...
*/
WARN_ON_ONCE(!ret);
@@ -758,12 +635,12 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
}
/*
- * Invalidate exceptional DAX entry if it is clean.
+ * Invalidate DAX entry if it is clean.
*/
int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
pgoff_t index)
{
- return __dax_invalidate_mapping_entry(mapping, index, false);
+ return __dax_invalidate_entry(mapping, index, false);
}
static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,
@@ -799,30 +676,27 @@ static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,
* already in the tree, we will skip the insertion and just dirty the PMD as
* appropriate.
*/
-static void *dax_insert_mapping_entry(struct address_space *mapping,
- struct vm_fault *vmf,
- void *entry, pfn_t pfn_t,
- unsigned long flags, bool dirty)
+static void *dax_insert_entry(struct xa_state *xas,
+ struct address_space *mapping, struct vm_fault *vmf,
+ void *entry, pfn_t pfn, unsigned long flags, bool dirty)
{
- struct radix_tree_root *pages = &mapping->i_pages;
- unsigned long pfn = pfn_t_to_pfn(pfn_t);
- pgoff_t index = vmf->pgoff;
- void *new_entry;
+ void *new_entry = dax_make_entry(pfn, flags);
if (dirty)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
- if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) {
+ if (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE)) {
+ unsigned long index = xas->xa_index;
/* we are replacing a zero page with block mapping */
if (dax_is_pmd_entry(entry))
unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR,
- PG_PMD_NR, false);
+ PG_PMD_NR, false);
else /* pte entry */
- unmap_mapping_pages(mapping, vmf->pgoff, 1, false);
+ unmap_mapping_pages(mapping, index, 1, false);
}
- xa_lock_irq(pages);
- new_entry = dax_radix_locked_entry(pfn, flags);
+ xas_reset(xas);
+ xas_lock_irq(xas);
if (dax_entry_size(entry) != dax_entry_size(new_entry)) {
dax_disassociate_entry(entry, mapping, false);
dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address);
@@ -830,33 +704,30 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
/*
- * Only swap our new entry into the radix tree if the current
+ * Only swap our new entry into the page cache if the current
* entry is a zero page or an empty entry. If a normal PTE or
- * PMD entry is already in the tree, we leave it alone. This
+ * PMD entry is already in the cache, we leave it alone. This
* means that if we are trying to insert a PTE and the
* existing entry is a PMD, we will just leave the PMD in the
* tree and dirty it if necessary.
*/
- struct radix_tree_node *node;
- void **slot;
- void *ret;
-
- ret = __radix_tree_lookup(pages, index, &node, &slot);
- WARN_ON_ONCE(ret != entry);
- __radix_tree_replace(pages, node, slot,
- new_entry, NULL);
+ void *old = dax_lock_entry(xas, new_entry);
+ WARN_ON_ONCE(old != xa_mk_value(xa_to_value(entry) |
+ DAX_LOCKED));
entry = new_entry;
+ } else {
+ xas_load(xas); /* Walk the xa_state */
}
if (dirty)
- radix_tree_tag_set(pages, index, PAGECACHE_TAG_DIRTY);
+ xas_set_mark(xas, PAGECACHE_TAG_DIRTY);
- xa_unlock_irq(pages);
+ xas_unlock_irq(xas);
return entry;
}
-static inline unsigned long
-pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma)
+static inline
+unsigned long pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma)
{
unsigned long address;
@@ -866,8 +737,8 @@ pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma)
}
/* Walk all mappings of a given index of a file and writeprotect them */
-static void dax_mapping_entry_mkclean(struct address_space *mapping,
- pgoff_t index, unsigned long pfn)
+static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index,
+ unsigned long pfn)
{
struct vm_area_struct *vma;
pte_t pte, *ptep = NULL;
@@ -937,11 +808,9 @@ unlock_pte:
i_mmap_unlock_read(mapping);
}
-static int dax_writeback_one(struct dax_device *dax_dev,
- struct address_space *mapping, pgoff_t index, void *entry)
+static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
+ struct address_space *mapping, void *entry)
{
- struct radix_tree_root *pages = &mapping->i_pages;
- void *entry2, **slot;
unsigned long pfn;
long ret = 0;
size_t size;
@@ -950,32 +819,38 @@ static int dax_writeback_one(struct dax_device *dax_dev,
* A page got tagged dirty in DAX mapping? Something is seriously
* wrong.
*/
- if (WARN_ON(!radix_tree_exceptional_entry(entry)))
+ if (WARN_ON(!xa_is_value(entry)))
return -EIO;
- xa_lock_irq(pages);
- entry2 = get_unlocked_mapping_entry(mapping, index, &slot);
- /* Entry got punched out / reallocated? */
- if (!entry2 || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry2)))
- goto put_unlocked;
- /*
- * Entry got reallocated elsewhere? No need to writeback. We have to
- * compare pfns as we must not bail out due to difference in lockbit
- * or entry type.
- */
- if (dax_radix_pfn(entry2) != dax_radix_pfn(entry))
- goto put_unlocked;
- if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
- dax_is_zero_entry(entry))) {
- ret = -EIO;
- goto put_unlocked;
+ if (unlikely(dax_is_locked(entry))) {
+ void *old_entry = entry;
+
+ entry = get_unlocked_entry(xas);
+
+ /* Entry got punched out / reallocated? */
+ if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
+ goto put_unlocked;
+ /*
+ * Entry got reallocated elsewhere? No need to writeback.
+ * We have to compare pfns as we must not bail out due to
+ * difference in lockbit or entry type.
+ */
+ if (dax_to_pfn(old_entry) != dax_to_pfn(entry))
+ goto put_unlocked;
+ if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
+ dax_is_zero_entry(entry))) {
+ ret = -EIO;
+ goto put_unlocked;
+ }
+
+ /* Another fsync thread may have already done this entry */
+ if (!xas_get_mark(xas, PAGECACHE_TAG_TOWRITE))
+ goto put_unlocked;
}
- /* Another fsync thread may have already written back this entry */
- if (!radix_tree_tag_get(pages, index, PAGECACHE_TAG_TOWRITE))
- goto put_unlocked;
/* Lock the entry to serialize with page faults */
- entry = lock_slot(mapping, slot);
+ dax_lock_entry(xas, entry);
+
/*
* We can clear the tag now but we have to be careful so that concurrent
* dax_writeback_one() calls for the same index cannot finish before we
@@ -983,8 +858,8 @@ static int dax_writeback_one(struct dax_device *dax_dev,
* at the entry only under the i_pages lock and once they do that
* they will see the entry locked and wait for it to unlock.
*/
- radix_tree_tag_clear(pages, index, PAGECACHE_TAG_TOWRITE);
- xa_unlock_irq(pages);
+ xas_clear_mark(xas, PAGECACHE_TAG_TOWRITE);
+ xas_unlock_irq(xas);
/*
* Even if dax_writeback_mapping_range() was given a wbc->range_start
@@ -993,10 +868,10 @@ static int dax_writeback_one(struct dax_device *dax_dev,
* This allows us to flush for PMD_SIZE and not have to worry about
* partial PMD writebacks.
*/
- pfn = dax_radix_pfn(entry);
- size = PAGE_SIZE << dax_radix_order(entry);
+ pfn = dax_to_pfn(entry);
+ size = PAGE_SIZE << dax_entry_order(entry);
- dax_mapping_entry_mkclean(mapping, index, pfn);
+ dax_entry_mkclean(mapping, xas->xa_index, pfn);
dax_flush(dax_dev, page_address(pfn_to_page(pfn)), size);
/*
* After we have flushed the cache, we can clear the dirty tag. There
@@ -1004,16 +879,18 @@ static int dax_writeback_one(struct dax_device *dax_dev,
* the pfn mappings are writeprotected and fault waits for mapping
* entry lock.
*/
- xa_lock_irq(pages);
- radix_tree_tag_clear(pages, index, PAGECACHE_TAG_DIRTY);
- xa_unlock_irq(pages);
- trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT);
- put_locked_mapping_entry(mapping, index);
+ xas_reset(xas);
+ xas_lock_irq(xas);
+ xas_store(xas, entry);
+ xas_clear_mark(xas, PAGECACHE_TAG_DIRTY);
+ dax_wake_entry(xas, entry, false);
+
+ trace_dax_writeback_one(mapping->host, xas->xa_index,
+ size >> PAGE_SHIFT);
return ret;
put_unlocked:
- put_unlocked_mapping_entry(mapping, index, entry2);
- xa_unlock_irq(pages);
+ put_unlocked_entry(xas, entry);
return ret;
}
@@ -1025,13 +902,13 @@ static int dax_writeback_one(struct dax_device *dax_dev,
int dax_writeback_mapping_range(struct address_space *mapping,
struct block_device *bdev, struct writeback_control *wbc)
{
+ XA_STATE(xas, &mapping->i_pages, wbc->range_start >> PAGE_SHIFT);
struct inode *inode = mapping->host;
- pgoff_t start_index, end_index;
- pgoff_t indices[PAGEVEC_SIZE];
+ pgoff_t end_index = wbc->range_end >> PAGE_SHIFT;
struct dax_device *dax_dev;
- struct pagevec pvec;
- bool done = false;
- int i, ret = 0;
+ void *entry;
+ int ret = 0;
+ unsigned int scanned = 0;
if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
return -EIO;
@@ -1043,41 +920,29 @@ int dax_writeback_mapping_range(struct address_space *mapping,
if (!dax_dev)
return -EIO;
- start_index = wbc->range_start >> PAGE_SHIFT;
- end_index = wbc->range_end >> PAGE_SHIFT;
-
- trace_dax_writeback_range(inode, start_index, end_index);
+ trace_dax_writeback_range(inode, xas.xa_index, end_index);
- tag_pages_for_writeback(mapping, start_index, end_index);
+ tag_pages_for_writeback(mapping, xas.xa_index, end_index);
- pagevec_init(&pvec);
- while (!done) {
- pvec.nr = find_get_entries_tag(mapping, start_index,
- PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE,
- pvec.pages, indices);
-
- if (pvec.nr == 0)
+ xas_lock_irq(&xas);
+ xas_for_each_marked(&xas, entry, end_index, PAGECACHE_TAG_TOWRITE) {
+ ret = dax_writeback_one(&xas, dax_dev, mapping, entry);
+ if (ret < 0) {
+ mapping_set_error(mapping, ret);
break;
-
- for (i = 0; i < pvec.nr; i++) {
- if (indices[i] > end_index) {
- done = true;
- break;
- }
-
- ret = dax_writeback_one(dax_dev, mapping, indices[i],
- pvec.pages[i]);
- if (ret < 0) {
- mapping_set_error(mapping, ret);
- goto out;
- }
}
- start_index = indices[pvec.nr - 1] + 1;
+ if (++scanned % XA_CHECK_SCHED)
+ continue;
+
+ xas_pause(&xas);
+ xas_unlock_irq(&xas);
+ cond_resched();
+ xas_lock_irq(&xas);
}
-out:
+ xas_unlock_irq(&xas);
put_dax(dax_dev);
- trace_dax_writeback_range_done(inode, start_index, end_index);
- return (ret < 0 ? ret : 0);
+ trace_dax_writeback_range_done(inode, xas.xa_index, end_index);
+ return ret;
}
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
@@ -1125,16 +990,18 @@ out:
* If this page is ever written to we will re-fault and change the mapping to
* point to real DAX storage instead.
*/
-static vm_fault_t dax_load_hole(struct address_space *mapping, void *entry,
- struct vm_fault *vmf)
+static vm_fault_t dax_load_hole(struct xa_state *xas,
+ struct address_space *mapping, void **entry,
+ struct vm_fault *vmf)
{
struct inode *inode = mapping->host;
unsigned long vaddr = vmf->address;
pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr));
vm_fault_t ret;
- dax_insert_mapping_entry(mapping, vmf, entry, pfn, RADIX_DAX_ZERO_PAGE,
- false);
+ *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
+ DAX_ZERO_PAGE, false);
+
ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
trace_dax_load_hole(inode, vmf, ret);
return ret;
@@ -1342,6 +1209,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
{
struct vm_area_struct *vma = vmf->vma;
struct address_space *mapping = vma->vm_file->f_mapping;
+ XA_STATE(xas, &mapping->i_pages, vmf->pgoff);
struct inode *inode = mapping->host;
unsigned long vaddr = vmf->address;
loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
@@ -1368,9 +1236,9 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
if (write && !vmf->cow_page)
flags |= IOMAP_WRITE;
- entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
- if (IS_ERR(entry)) {
- ret = dax_fault_return(PTR_ERR(entry));
+ entry = grab_mapping_entry(&xas, mapping, 0);
+ if (xa_is_internal(entry)) {
+ ret = xa_to_internal(entry);
goto out;
}
@@ -1443,7 +1311,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
if (error < 0)
goto error_finish_iomap;
- entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
+ entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn,
0, write && !sync);
/*
@@ -1471,7 +1339,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
case IOMAP_UNWRITTEN:
case IOMAP_HOLE:
if (!write) {
- ret = dax_load_hole(mapping, entry, vmf);
+ ret = dax_load_hole(&xas, mapping, &entry, vmf);
goto finish_iomap;
}
/*FALLTHRU*/
@@ -1498,21 +1366,20 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
}
unlock_entry:
- put_locked_mapping_entry(mapping, vmf->pgoff);
+ dax_unlock_entry(&xas, entry);
out:
trace_dax_pte_fault_done(inode, vmf, ret);
return ret | major;
}
#ifdef CONFIG_FS_DAX_PMD
-static vm_fault_t dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
- void *entry)
+static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
+ struct iomap *iomap, void **entry)
{
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
unsigned long pmd_addr = vmf->address & PMD_MASK;
struct inode *inode = mapping->host;
struct page *zero_page;
- void *ret = NULL;
spinlock_t *ptl;
pmd_t pmd_entry;
pfn_t pfn;
@@ -1523,8 +1390,8 @@ static vm_fault_t dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
goto fallback;
pfn = page_to_pfn_t(zero_page);
- ret = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
- RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE, false);
+ *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
+ DAX_PMD | DAX_ZERO_PAGE, false);
ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
if (!pmd_none(*(vmf->pmd))) {
@@ -1536,11 +1403,11 @@ static vm_fault_t dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
pmd_entry = pmd_mkhuge(pmd_entry);
set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
spin_unlock(ptl);
- trace_dax_pmd_load_hole(inode, vmf, zero_page, ret);
+ trace_dax_pmd_load_hole(inode, vmf, zero_page, *entry);
return VM_FAULT_NOPAGE;
fallback:
- trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, ret);
+ trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, *entry);
return VM_FAULT_FALLBACK;
}
@@ -1549,6 +1416,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
{
struct vm_area_struct *vma = vmf->vma;
struct address_space *mapping = vma->vm_file->f_mapping;
+ XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, PMD_ORDER);
unsigned long pmd_addr = vmf->address & PMD_MASK;
bool write = vmf->flags & FAULT_FLAG_WRITE;
bool sync;
@@ -1556,7 +1424,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
struct inode *inode = mapping->host;
vm_fault_t result = VM_FAULT_FALLBACK;
struct iomap iomap = { 0 };
- pgoff_t max_pgoff, pgoff;
+ pgoff_t max_pgoff;
void *entry;
loff_t pos;
int error;
@@ -1567,7 +1435,6 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
* supposed to hold locks serializing us with truncate / punch hole so
* this is a reliable test.
*/
- pgoff = linear_page_index(vma, pmd_addr);
max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
trace_dax_pmd_fault(inode, vmf, max_pgoff, 0);
@@ -1576,7 +1443,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
* Make sure that the faulting address's PMD offset (color) matches
* the PMD offset from the start of the file. This is necessary so
* that a PMD range in the page table overlaps exactly with a PMD
- * range in the radix tree.
+ * range in the page cache.
*/
if ((vmf->pgoff & PG_PMD_COLOUR) !=
((vmf->address >> PAGE_SHIFT) & PG_PMD_COLOUR))
@@ -1592,24 +1459,26 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
if ((pmd_addr + PMD_SIZE) > vma->vm_end)
goto fallback;
- if (pgoff >= max_pgoff) {
+ if (xas.xa_index >= max_pgoff) {
result = VM_FAULT_SIGBUS;
goto out;
}
/* If the PMD would extend beyond the file size */
- if ((pgoff | PG_PMD_COLOUR) >= max_pgoff)
+ if ((xas.xa_index | PG_PMD_COLOUR) >= max_pgoff)
goto fallback;
/*
- * grab_mapping_entry() will make sure we get a 2MiB empty entry, a
- * 2MiB zero page entry or a DAX PMD. If it can't (because a 4k page
- * is already in the tree, for instance), it will return -EEXIST and
- * we just fall back to 4k entries.
+ * grab_mapping_entry() will make sure we get an empty PMD entry,
+ * a zero PMD entry or a DAX PMD. If it can't (because a PTE
+ * entry is already in the array, for instance), it will return
+ * VM_FAULT_FALLBACK.
*/
- entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
- if (IS_ERR(entry))
+ entry = grab_mapping_entry(&xas, mapping, DAX_PMD);
+ if (xa_is_internal(entry)) {
+ result = xa_to_internal(entry);
goto fallback;
+ }
/*
* It is possible, particularly with mixed reads & writes to private
@@ -1628,7 +1497,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
* setting up a mapping, so really we're using iomap_begin() as a way
* to look up our filesystem block.
*/
- pos = (loff_t)pgoff << PAGE_SHIFT;
+ pos = (loff_t)xas.xa_index << PAGE_SHIFT;
error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
if (error)
goto unlock_entry;
@@ -1644,8 +1513,8 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
if (error < 0)
goto finish_iomap;
- entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
- RADIX_DAX_PMD, write && !sync);
+ entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn,
+ DAX_PMD, write && !sync);
/*
* If we are doing synchronous page fault and inode needs fsync,
@@ -1669,7 +1538,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
case IOMAP_HOLE:
if (WARN_ON_ONCE(write))
break;
- result = dax_pmd_load_hole(vmf, &iomap, entry);
+ result = dax_pmd_load_hole(&xas, vmf, &iomap, &entry);
break;
default:
WARN_ON_ONCE(1);
@@ -1692,7 +1561,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
&iomap);
}
unlock_entry:
- put_locked_mapping_entry(mapping, pgoff);
+ dax_unlock_entry(&xas, entry);
fallback:
if (result == VM_FAULT_FALLBACK) {
split_huge_pmd(vma, vmf->pmd, vmf->address);
@@ -1737,54 +1606,49 @@ vm_fault_t dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
}
EXPORT_SYMBOL_GPL(dax_iomap_fault);
-/**
+/*
* dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables
* @vmf: The description of the fault
- * @pe_size: Size of entry to be inserted
* @pfn: PFN to insert
+ * @order: Order of entry to insert.
*
- * This function inserts writeable PTE or PMD entry into page tables for mmaped
- * DAX file. It takes care of marking corresponding radix tree entry as dirty
- * as well.
+ * This function inserts a writeable PTE or PMD entry into the page tables
+ * for an mmaped DAX file. It also marks the page cache entry as dirty.
*/
-static vm_fault_t dax_insert_pfn_mkwrite(struct vm_fault *vmf,
- enum page_entry_size pe_size,
- pfn_t pfn)
+static vm_fault_t
+dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order)
{
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
- void *entry, **slot;
- pgoff_t index = vmf->pgoff;
+ XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, order);
+ void *entry;
vm_fault_t ret;
- xa_lock_irq(&mapping->i_pages);
- entry = get_unlocked_mapping_entry(mapping, index, &slot);
+ xas_lock_irq(&xas);
+ entry = get_unlocked_entry(&xas);
/* Did we race with someone splitting entry or so? */
if (!entry ||
- (pe_size == PE_SIZE_PTE && !dax_is_pte_entry(entry)) ||
- (pe_size == PE_SIZE_PMD && !dax_is_pmd_entry(entry))) {
- put_unlocked_mapping_entry(mapping, index, entry);
- xa_unlock_irq(&mapping->i_pages);
+ (order == 0 && !dax_is_pte_entry(entry)) ||
+ (order == PMD_ORDER && (xa_is_internal(entry) ||
+ !dax_is_pmd_entry(entry)))) {
+ put_unlocked_entry(&xas, entry);
+ xas_unlock_irq(&xas);
trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf,
VM_FAULT_NOPAGE);
return VM_FAULT_NOPAGE;
}
- radix_tree_tag_set(&mapping->i_pages, index, PAGECACHE_TAG_DIRTY);
- entry = lock_slot(mapping, slot);
- xa_unlock_irq(&mapping->i_pages);
- switch (pe_size) {
- case PE_SIZE_PTE:
+ xas_set_mark(&xas, PAGECACHE_TAG_DIRTY);
+ dax_lock_entry(&xas, entry);
+ xas_unlock_irq(&xas);
+ if (order == 0)
ret = vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
- break;
#ifdef CONFIG_FS_DAX_PMD
- case PE_SIZE_PMD:
+ else if (order == PMD_ORDER)
ret = vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
pfn, true);
- break;
#endif
- default:
+ else
ret = VM_FAULT_FALLBACK;
- }
- put_locked_mapping_entry(mapping, index);
+ dax_unlock_entry(&xas, entry);
trace_dax_insert_pfn_mkwrite(mapping->host, vmf, ret);
return ret;
}
@@ -1804,17 +1668,12 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
{
int err;
loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT;
- size_t len = 0;
+ unsigned int order = pe_order(pe_size);
+ size_t len = PAGE_SIZE << order;
- if (pe_size == PE_SIZE_PTE)
- len = PAGE_SIZE;
- else if (pe_size == PE_SIZE_PMD)
- len = PMD_SIZE;
- else
- WARN_ON_ONCE(1);
err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1);
if (err)
return VM_FAULT_SIGBUS;
- return dax_insert_pfn_mkwrite(vmf, pe_size, pfn);
+ return dax_insert_pfn_mkwrite(vmf, pfn, order);
}
EXPORT_SYMBOL_GPL(dax_finish_sync_fault);
diff --git a/fs/dcache.c b/fs/dcache.c
index 2e7e8d85e9b4..2593153471cf 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -26,7 +26,7 @@
#include <linux/export.h>
#include <linux/security.h>
#include <linux/seqlock.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/bit_spinlock.h>
#include <linux/rculist_bl.h>
#include <linux/list_lru.h>
@@ -257,24 +257,10 @@ static void __d_free(struct rcu_head *head)
kmem_cache_free(dentry_cache, dentry);
}
-static void __d_free_external_name(struct rcu_head *head)
-{
- struct external_name *name = container_of(head, struct external_name,
- u.head);
-
- mod_node_page_state(page_pgdat(virt_to_page(name)),
- NR_INDIRECTLY_RECLAIMABLE_BYTES,
- -ksize(name));
-
- kfree(name);
-}
-
static void __d_free_external(struct rcu_head *head)
{
struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu);
-
- __d_free_external_name(&external_name(dentry)->u.head);
-
+ kfree(external_name(dentry));
kmem_cache_free(dentry_cache, dentry);
}
@@ -306,7 +292,7 @@ void release_dentry_name_snapshot(struct name_snapshot *name)
struct external_name *p;
p = container_of(name->name, struct external_name, name[0]);
if (unlikely(atomic_dec_and_test(&p->u.count)))
- call_rcu(&p->u.head, __d_free_external_name);
+ kfree_rcu(p, u.head);
}
}
EXPORT_SYMBOL(release_dentry_name_snapshot);
@@ -1606,7 +1592,6 @@ EXPORT_SYMBOL(d_invalidate);
struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
{
- struct external_name *ext = NULL;
struct dentry *dentry;
char *dname;
int err;
@@ -1627,14 +1612,15 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
dname = dentry->d_iname;
} else if (name->len > DNAME_INLINE_LEN-1) {
size_t size = offsetof(struct external_name, name[1]);
-
- ext = kmalloc(size + name->len, GFP_KERNEL_ACCOUNT);
- if (!ext) {
+ struct external_name *p = kmalloc(size + name->len,
+ GFP_KERNEL_ACCOUNT |
+ __GFP_RECLAIMABLE);
+ if (!p) {
kmem_cache_free(dentry_cache, dentry);
return NULL;
}
- atomic_set(&ext->u.count, 1);
- dname = ext->name;
+ atomic_set(&p->u.count, 1);
+ dname = p->name;
} else {
dname = dentry->d_iname;
}
@@ -1673,12 +1659,6 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
}
}
- if (unlikely(ext)) {
- pg_data_t *pgdat = page_pgdat(virt_to_page(ext));
- mod_node_page_state(pgdat, NR_INDIRECTLY_RECLAIMABLE_BYTES,
- ksize(ext));
- }
-
this_cpu_inc(nr_dentry);
return dentry;
@@ -2707,7 +2687,7 @@ static void copy_name(struct dentry *dentry, struct dentry *target)
dentry->d_name.hash_len = target->d_name.hash_len;
}
if (old_name && likely(atomic_dec_and_test(&old_name->u.count)))
- call_rcu(&old_name->u.head, __d_free_external_name);
+ kfree_rcu(old_name, u.head);
}
/*
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 49121e5a8de2..5c36ceecb5c1 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -593,11 +593,16 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
lower_new_dir_dentry = dget_parent(lower_new_dentry);
target_inode = d_inode(new_dentry);
trap = lock_rename(lower_old_dir_dentry, lower_new_dir_dentry);
+ rc = -EINVAL;
+ if (lower_old_dentry->d_parent != lower_old_dir_dentry)
+ goto out_lock;
+ if (lower_new_dentry->d_parent != lower_new_dir_dentry)
+ goto out_lock;
+ if (d_unhashed(lower_old_dentry) || d_unhashed(lower_new_dentry))
+ goto out_lock;
/* source should not be ancestor of target */
- if (trap == lower_old_dentry) {
- rc = -EINVAL;
+ if (trap == lower_old_dentry)
goto out_lock;
- }
/* target should not be ancestor of source */
if (trap == lower_new_dentry) {
rc = -ENOTEMPTY;
diff --git a/fs/exec.c b/fs/exec.c
index 1ebf6e5a521d..fc281b738a98 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -908,14 +908,14 @@ int kernel_read_file(struct file *file, void **buf, loff_t *size,
goto out;
i_size = i_size_read(file_inode(file));
- if (max_size > 0 && i_size > max_size) {
- ret = -EFBIG;
- goto out;
- }
if (i_size <= 0) {
ret = -EINVAL;
goto out;
}
+ if (i_size > SIZE_MAX || (max_size > 0 && i_size > max_size)) {
+ ret = -EFBIG;
+ goto out;
+ }
if (id != READING_FIRMWARE_PREALLOC_BUFFER)
*buf = vmalloc(i_size);
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index 224c04abb2e5..cf4c77f8dd08 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -256,11 +256,15 @@ ext2_init_acl(struct inode *inode, struct inode *dir)
if (default_acl) {
error = __ext2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
posix_acl_release(default_acl);
+ } else {
+ inode->i_default_acl = NULL;
}
if (acl) {
if (!error)
error = __ext2_set_acl(inode, acl, ACL_TYPE_ACCESS);
posix_acl_release(acl);
+ } else {
+ inode->i_acl = NULL;
}
return error;
}
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 00e759f05161..e770cd100a6a 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -390,11 +390,7 @@ struct ext2_inode {
#define EXT2_MOUNT_USRQUOTA 0x020000 /* user quota */
#define EXT2_MOUNT_GRPQUOTA 0x040000 /* group quota */
#define EXT2_MOUNT_RESERVATION 0x080000 /* Preallocation */
-#ifdef CONFIG_FS_DAX
#define EXT2_MOUNT_DAX 0x100000 /* Direct Access */
-#else
-#define EXT2_MOUNT_DAX 0
-#endif
#define clear_opt(o, opt) o &= ~EXT2_MOUNT_##opt
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 73bd58fa13de..cb91baa4275d 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -309,20 +309,17 @@ static int ext2_show_options(struct seq_file *seq, struct dentry *root)
if (test_opt(sb, NOBH))
seq_puts(seq, ",nobh");
-#if defined(CONFIG_QUOTA)
if (sbi->s_mount_opt & EXT2_MOUNT_USRQUOTA)
seq_puts(seq, ",usrquota");
if (sbi->s_mount_opt & EXT2_MOUNT_GRPQUOTA)
seq_puts(seq, ",grpquota");
-#endif
-#ifdef CONFIG_FS_DAX
if (sbi->s_mount_opt & EXT2_MOUNT_XIP)
seq_puts(seq, ",xip");
+
if (sbi->s_mount_opt & EXT2_MOUNT_DAX)
seq_puts(seq, ",dax");
-#endif
if (!test_opt(sb, RESERVATION))
seq_puts(seq, ",noreservation");
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index fb50f9aa6ead..c1d570ee1d9f 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -284,12 +284,16 @@ ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
error = __ext4_set_acl(handle, inode, ACL_TYPE_DEFAULT,
default_acl, XATTR_CREATE);
posix_acl_release(default_acl);
+ } else {
+ inode->i_default_acl = NULL;
}
if (acl) {
if (!error)
error = __ext4_set_acl(handle, inode, ACL_TYPE_ACCESS,
acl, XATTR_CREATE);
posix_acl_release(acl);
+ } else {
+ inode->i_acl = NULL;
}
return error;
}
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index caff935fbeb8..3f89d0ab08fc 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -45,15 +45,6 @@
#include <linux/compiler.h>
-/* Until this gets included into linux/compiler-gcc.h */
-#ifndef __nonstring
-#if defined(GCC_VERSION) && (GCC_VERSION >= 80000)
-#define __nonstring __attribute__((nonstring))
-#else
-#define __nonstring
-#endif
-#endif
-
/*
* The fourth extended filesystem constants/structures
*/
@@ -628,6 +619,7 @@ enum {
#define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008
#define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010
#define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020
+#define EXT4_FREE_BLOCKS_RERESERVE_CLUSTER 0x0040
/*
* ioctl commands
@@ -1030,6 +1022,9 @@ struct ext4_inode_info {
ext4_lblk_t i_da_metadata_calc_last_lblock;
int i_da_metadata_calc_len;
+ /* pending cluster reservations for bigalloc file systems */
+ struct ext4_pending_tree i_pending_tree;
+
/* on-disk additional length */
__u16 i_extra_isize;
@@ -1401,7 +1396,8 @@ struct ext4_sb_info {
u32 s_min_batch_time;
struct block_device *journal_bdev;
#ifdef CONFIG_QUOTA
- char *s_qf_names[EXT4_MAXQUOTAS]; /* Names of quota files with journalled quota */
+ /* Names of quota files with journalled quota */
+ char __rcu *s_qf_names[EXT4_MAXQUOTAS];
int s_jquota_fmt; /* Format of quota to use */
#endif
unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */
@@ -2483,10 +2479,11 @@ extern int ext4_writepage_trans_blocks(struct inode *);
extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
loff_t lstart, loff_t lend);
-extern int ext4_page_mkwrite(struct vm_fault *vmf);
-extern int ext4_filemap_fault(struct vm_fault *vmf);
+extern vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf);
+extern vm_fault_t ext4_filemap_fault(struct vm_fault *vmf);
extern qsize_t *ext4_get_reserved_space(struct inode *inode);
extern int ext4_get_projid(struct inode *inode, kprojid_t *projid);
+extern void ext4_da_release_space(struct inode *inode, int to_free);
extern void ext4_da_update_reserve_space(struct inode *inode,
int used, int quota_claim);
extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk,
@@ -3142,10 +3139,6 @@ extern struct ext4_ext_path *ext4_find_extent(struct inode *, ext4_lblk_t,
int flags);
extern void ext4_ext_drop_refs(struct ext4_ext_path *);
extern int ext4_ext_check_inode(struct inode *inode);
-extern int ext4_find_delalloc_range(struct inode *inode,
- ext4_lblk_t lblk_start,
- ext4_lblk_t lblk_end);
-extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk);
extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path);
extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len);
@@ -3156,6 +3149,7 @@ extern int ext4_swap_extents(handle_t *handle, struct inode *inode1,
struct inode *inode2, ext4_lblk_t lblk1,
ext4_lblk_t lblk2, ext4_lblk_t count,
int mark_unwritten,int *err);
+extern int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu);
/* move_extent.c */
extern void ext4_double_down_write_data_sem(struct inode *first,
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index adf6668b596f..98bd0e9ee7df 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -120,6 +120,19 @@ struct ext4_ext_path {
};
/*
+ * Used to record a portion of a cluster found at the beginning or end
+ * of an extent while traversing the extent tree during space removal.
+ * A partial cluster may be removed if it does not contain blocks shared
+ * with extents that aren't being deleted (tofree state). Otherwise,
+ * it cannot be removed (nofree state).
+ */
+struct partial_cluster {
+ ext4_fsblk_t pclu; /* physical cluster number */
+ ext4_lblk_t lblk; /* logical block number within logical cluster */
+ enum {initial, tofree, nofree} state;
+};
+
+/*
* structure for external API
*/
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 72a361d5ef74..240b6dea5441 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2351,8 +2351,8 @@ ext4_ext_put_gap_in_cache(struct inode *inode, ext4_lblk_t hole_start,
{
struct extent_status es;
- ext4_es_find_delayed_extent_range(inode, hole_start,
- hole_start + hole_len - 1, &es);
+ ext4_es_find_extent_range(inode, &ext4_es_is_delayed, hole_start,
+ hole_start + hole_len - 1, &es);
if (es.es_len) {
/* There's delayed extent containing lblock? */
if (es.es_lblk <= hole_start)
@@ -2490,106 +2490,157 @@ static inline int get_default_free_blocks_flags(struct inode *inode)
return 0;
}
+/*
+ * ext4_rereserve_cluster - increment the reserved cluster count when
+ * freeing a cluster with a pending reservation
+ *
+ * @inode - file containing the cluster
+ * @lblk - logical block in cluster to be reserved
+ *
+ * Increments the reserved cluster count and adjusts quota in a bigalloc
+ * file system when freeing a partial cluster containing at least one
+ * delayed and unwritten block. A partial cluster meeting that
+ * requirement will have a pending reservation. If so, the
+ * RERESERVE_CLUSTER flag is used when calling ext4_free_blocks() to
+ * defer reserved and allocated space accounting to a subsequent call
+ * to this function.
+ */
+static void ext4_rereserve_cluster(struct inode *inode, ext4_lblk_t lblk)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct ext4_inode_info *ei = EXT4_I(inode);
+
+ dquot_reclaim_block(inode, EXT4_C2B(sbi, 1));
+
+ spin_lock(&ei->i_block_reservation_lock);
+ ei->i_reserved_data_blocks++;
+ percpu_counter_add(&sbi->s_dirtyclusters_counter, 1);
+ spin_unlock(&ei->i_block_reservation_lock);
+
+ percpu_counter_add(&sbi->s_freeclusters_counter, 1);
+ ext4_remove_pending(inode, lblk);
+}
+
static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
struct ext4_extent *ex,
- long long *partial_cluster,
+ struct partial_cluster *partial,
ext4_lblk_t from, ext4_lblk_t to)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
unsigned short ee_len = ext4_ext_get_actual_len(ex);
- ext4_fsblk_t pblk;
- int flags = get_default_free_blocks_flags(inode);
+ ext4_fsblk_t last_pblk, pblk;
+ ext4_lblk_t num;
+ int flags;
+
+ /* only extent tail removal is allowed */
+ if (from < le32_to_cpu(ex->ee_block) ||
+ to != le32_to_cpu(ex->ee_block) + ee_len - 1) {
+ ext4_error(sbi->s_sb,
+ "strange request: removal(2) %u-%u from %u:%u",
+ from, to, le32_to_cpu(ex->ee_block), ee_len);
+ return 0;
+ }
+
+#ifdef EXTENTS_STATS
+ spin_lock(&sbi->s_ext_stats_lock);
+ sbi->s_ext_blocks += ee_len;
+ sbi->s_ext_extents++;
+ if (ee_len < sbi->s_ext_min)
+ sbi->s_ext_min = ee_len;
+ if (ee_len > sbi->s_ext_max)
+ sbi->s_ext_max = ee_len;
+ if (ext_depth(inode) > sbi->s_depth_max)
+ sbi->s_depth_max = ext_depth(inode);
+ spin_unlock(&sbi->s_ext_stats_lock);
+#endif
+
+ trace_ext4_remove_blocks(inode, ex, from, to, partial);
/*
- * For bigalloc file systems, we never free a partial cluster
- * at the beginning of the extent. Instead, we make a note
- * that we tried freeing the cluster, and check to see if we
- * need to free it on a subsequent call to ext4_remove_blocks,
- * or at the end of ext4_ext_rm_leaf or ext4_ext_remove_space.
+ * if we have a partial cluster, and it's different from the
+ * cluster of the last block in the extent, we free it
*/
- flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER;
+ last_pblk = ext4_ext_pblock(ex) + ee_len - 1;
+
+ if (partial->state != initial &&
+ partial->pclu != EXT4_B2C(sbi, last_pblk)) {
+ if (partial->state == tofree) {
+ flags = get_default_free_blocks_flags(inode);
+ if (ext4_is_pending(inode, partial->lblk))
+ flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
+ ext4_free_blocks(handle, inode, NULL,
+ EXT4_C2B(sbi, partial->pclu),
+ sbi->s_cluster_ratio, flags);
+ if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
+ ext4_rereserve_cluster(inode, partial->lblk);
+ }
+ partial->state = initial;
+ }
+
+ num = le32_to_cpu(ex->ee_block) + ee_len - from;
+ pblk = ext4_ext_pblock(ex) + ee_len - num;
- trace_ext4_remove_blocks(inode, ex, from, to, *partial_cluster);
/*
- * If we have a partial cluster, and it's different from the
- * cluster of the last block, we need to explicitly free the
- * partial cluster here.
+ * We free the partial cluster at the end of the extent (if any),
+ * unless the cluster is used by another extent (partial_cluster
+ * state is nofree). If a partial cluster exists here, it must be
+ * shared with the last block in the extent.
*/
- pblk = ext4_ext_pblock(ex) + ee_len - 1;
- if (*partial_cluster > 0 &&
- *partial_cluster != (long long) EXT4_B2C(sbi, pblk)) {
+ flags = get_default_free_blocks_flags(inode);
+
+ /* partial, left end cluster aligned, right end unaligned */
+ if ((EXT4_LBLK_COFF(sbi, to) != sbi->s_cluster_ratio - 1) &&
+ (EXT4_LBLK_CMASK(sbi, to) >= from) &&
+ (partial->state != nofree)) {
+ if (ext4_is_pending(inode, to))
+ flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
ext4_free_blocks(handle, inode, NULL,
- EXT4_C2B(sbi, *partial_cluster),
+ EXT4_PBLK_CMASK(sbi, last_pblk),
sbi->s_cluster_ratio, flags);
- *partial_cluster = 0;
+ if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
+ ext4_rereserve_cluster(inode, to);
+ partial->state = initial;
+ flags = get_default_free_blocks_flags(inode);
}
-#ifdef EXTENTS_STATS
- {
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- spin_lock(&sbi->s_ext_stats_lock);
- sbi->s_ext_blocks += ee_len;
- sbi->s_ext_extents++;
- if (ee_len < sbi->s_ext_min)
- sbi->s_ext_min = ee_len;
- if (ee_len > sbi->s_ext_max)
- sbi->s_ext_max = ee_len;
- if (ext_depth(inode) > sbi->s_depth_max)
- sbi->s_depth_max = ext_depth(inode);
- spin_unlock(&sbi->s_ext_stats_lock);
- }
-#endif
- if (from >= le32_to_cpu(ex->ee_block)
- && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
- /* tail removal */
- ext4_lblk_t num;
- long long first_cluster;
-
- num = le32_to_cpu(ex->ee_block) + ee_len - from;
- pblk = ext4_ext_pblock(ex) + ee_len - num;
- /*
- * Usually we want to free partial cluster at the end of the
- * extent, except for the situation when the cluster is still
- * used by any other extent (partial_cluster is negative).
- */
- if (*partial_cluster < 0 &&
- *partial_cluster == -(long long) EXT4_B2C(sbi, pblk+num-1))
- flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER;
+ flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER;
- ext_debug("free last %u blocks starting %llu partial %lld\n",
- num, pblk, *partial_cluster);
- ext4_free_blocks(handle, inode, NULL, pblk, num, flags);
- /*
- * If the block range to be freed didn't start at the
- * beginning of a cluster, and we removed the entire
- * extent and the cluster is not used by any other extent,
- * save the partial cluster here, since we might need to
- * delete if we determine that the truncate or punch hole
- * operation has removed all of the blocks in the cluster.
- * If that cluster is used by another extent, preserve its
- * negative value so it isn't freed later on.
- *
- * If the whole extent wasn't freed, we've reached the
- * start of the truncated/punched region and have finished
- * removing blocks. If there's a partial cluster here it's
- * shared with the remainder of the extent and is no longer
- * a candidate for removal.
- */
- if (EXT4_PBLK_COFF(sbi, pblk) && ee_len == num) {
- first_cluster = (long long) EXT4_B2C(sbi, pblk);
- if (first_cluster != -*partial_cluster)
- *partial_cluster = first_cluster;
- } else {
- *partial_cluster = 0;
+ /*
+ * For bigalloc file systems, we never free a partial cluster
+ * at the beginning of the extent. Instead, we check to see if we
+ * need to free it on a subsequent call to ext4_remove_blocks,
+ * or at the end of ext4_ext_rm_leaf or ext4_ext_remove_space.
+ */
+ flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER;
+ ext4_free_blocks(handle, inode, NULL, pblk, num, flags);
+
+ /* reset the partial cluster if we've freed past it */
+ if (partial->state != initial && partial->pclu != EXT4_B2C(sbi, pblk))
+ partial->state = initial;
+
+ /*
+ * If we've freed the entire extent but the beginning is not left
+ * cluster aligned and is not marked as ineligible for freeing we
+ * record the partial cluster at the beginning of the extent. It
+ * wasn't freed by the preceding ext4_free_blocks() call, and we
+ * need to look farther to the left to determine if it's to be freed
+ * (not shared with another extent). Else, reset the partial
+ * cluster - we're either done freeing or the beginning of the
+ * extent is left cluster aligned.
+ */
+ if (EXT4_LBLK_COFF(sbi, from) && num == ee_len) {
+ if (partial->state == initial) {
+ partial->pclu = EXT4_B2C(sbi, pblk);
+ partial->lblk = from;
+ partial->state = tofree;
}
- } else
- ext4_error(sbi->s_sb, "strange request: removal(2) "
- "%u-%u from %u:%u",
- from, to, le32_to_cpu(ex->ee_block), ee_len);
+ } else {
+ partial->state = initial;
+ }
+
return 0;
}
-
/*
* ext4_ext_rm_leaf() Removes the extents associated with the
* blocks appearing between "start" and "end". Both "start"
@@ -2608,7 +2659,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
static int
ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
struct ext4_ext_path *path,
- long long *partial_cluster,
+ struct partial_cluster *partial,
ext4_lblk_t start, ext4_lblk_t end)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -2640,7 +2691,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
ex_ee_block = le32_to_cpu(ex->ee_block);
ex_ee_len = ext4_ext_get_actual_len(ex);
- trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster);
+ trace_ext4_ext_rm_leaf(inode, start, ex, partial);
while (ex >= EXT_FIRST_EXTENT(eh) &&
ex_ee_block + ex_ee_len > start) {
@@ -2671,8 +2722,8 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
*/
if (sbi->s_cluster_ratio > 1) {
pblk = ext4_ext_pblock(ex);
- *partial_cluster =
- -(long long) EXT4_B2C(sbi, pblk);
+ partial->pclu = EXT4_B2C(sbi, pblk);
+ partial->state = nofree;
}
ex--;
ex_ee_block = le32_to_cpu(ex->ee_block);
@@ -2714,8 +2765,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
if (err)
goto out;
- err = ext4_remove_blocks(handle, inode, ex, partial_cluster,
- a, b);
+ err = ext4_remove_blocks(handle, inode, ex, partial, a, b);
if (err)
goto out;
@@ -2769,18 +2819,23 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
* If there's a partial cluster and at least one extent remains in
* the leaf, free the partial cluster if it isn't shared with the
* current extent. If it is shared with the current extent
- * we zero partial_cluster because we've reached the start of the
+ * we reset the partial cluster because we've reached the start of the
* truncated/punched region and we're done removing blocks.
*/
- if (*partial_cluster > 0 && ex >= EXT_FIRST_EXTENT(eh)) {
+ if (partial->state == tofree && ex >= EXT_FIRST_EXTENT(eh)) {
pblk = ext4_ext_pblock(ex) + ex_ee_len - 1;
- if (*partial_cluster != (long long) EXT4_B2C(sbi, pblk)) {
+ if (partial->pclu != EXT4_B2C(sbi, pblk)) {
+ int flags = get_default_free_blocks_flags(inode);
+
+ if (ext4_is_pending(inode, partial->lblk))
+ flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
ext4_free_blocks(handle, inode, NULL,
- EXT4_C2B(sbi, *partial_cluster),
- sbi->s_cluster_ratio,
- get_default_free_blocks_flags(inode));
+ EXT4_C2B(sbi, partial->pclu),
+ sbi->s_cluster_ratio, flags);
+ if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
+ ext4_rereserve_cluster(inode, partial->lblk);
}
- *partial_cluster = 0;
+ partial->state = initial;
}
/* if this leaf is free, then we should
@@ -2819,10 +2874,14 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
int depth = ext_depth(inode);
struct ext4_ext_path *path = NULL;
- long long partial_cluster = 0;
+ struct partial_cluster partial;
handle_t *handle;
int i = 0, err = 0;
+ partial.pclu = 0;
+ partial.lblk = 0;
+ partial.state = initial;
+
ext_debug("truncate since %u to %u\n", start, end);
/* probably first extent we're gonna free will be last in block */
@@ -2882,8 +2941,8 @@ again:
*/
if (sbi->s_cluster_ratio > 1) {
pblk = ext4_ext_pblock(ex) + end - ee_block + 2;
- partial_cluster =
- -(long long) EXT4_B2C(sbi, pblk);
+ partial.pclu = EXT4_B2C(sbi, pblk);
+ partial.state = nofree;
}
/*
@@ -2911,9 +2970,10 @@ again:
&ex);
if (err)
goto out;
- if (pblk)
- partial_cluster =
- -(long long) EXT4_B2C(sbi, pblk);
+ if (pblk) {
+ partial.pclu = EXT4_B2C(sbi, pblk);
+ partial.state = nofree;
+ }
}
}
/*
@@ -2948,8 +3008,7 @@ again:
if (i == depth) {
/* this is leaf block */
err = ext4_ext_rm_leaf(handle, inode, path,
- &partial_cluster, start,
- end);
+ &partial, start, end);
/* root level has p_bh == NULL, brelse() eats this */
brelse(path[i].p_bh);
path[i].p_bh = NULL;
@@ -3021,21 +3080,24 @@ again:
}
}
- trace_ext4_ext_remove_space_done(inode, start, end, depth,
- partial_cluster, path->p_hdr->eh_entries);
+ trace_ext4_ext_remove_space_done(inode, start, end, depth, &partial,
+ path->p_hdr->eh_entries);
/*
- * If we still have something in the partial cluster and we have removed
- * even the first extent, then we should free the blocks in the partial
- * cluster as well. (This code will only run when there are no leaves
- * to the immediate left of the truncated/punched region.)
+ * if there's a partial cluster and we have removed the first extent
+ * in the file, then we also free the partial cluster, if any
*/
- if (partial_cluster > 0 && err == 0) {
- /* don't zero partial_cluster since it's not used afterwards */
+ if (partial.state == tofree && err == 0) {
+ int flags = get_default_free_blocks_flags(inode);
+
+ if (ext4_is_pending(inode, partial.lblk))
+ flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
ext4_free_blocks(handle, inode, NULL,
- EXT4_C2B(sbi, partial_cluster),
- sbi->s_cluster_ratio,
- get_default_free_blocks_flags(inode));
+ EXT4_C2B(sbi, partial.pclu),
+ sbi->s_cluster_ratio, flags);
+ if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
+ ext4_rereserve_cluster(inode, partial.lblk);
+ partial.state = initial;
}
/* TODO: flexible tree reduction should be here */
@@ -3819,114 +3881,6 @@ out:
return ext4_mark_inode_dirty(handle, inode);
}
-/**
- * ext4_find_delalloc_range: find delayed allocated block in the given range.
- *
- * Return 1 if there is a delalloc block in the range, otherwise 0.
- */
-int ext4_find_delalloc_range(struct inode *inode,
- ext4_lblk_t lblk_start,
- ext4_lblk_t lblk_end)
-{
- struct extent_status es;
-
- ext4_es_find_delayed_extent_range(inode, lblk_start, lblk_end, &es);
- if (es.es_len == 0)
- return 0; /* there is no delay extent in this tree */
- else if (es.es_lblk <= lblk_start &&
- lblk_start < es.es_lblk + es.es_len)
- return 1;
- else if (lblk_start <= es.es_lblk && es.es_lblk <= lblk_end)
- return 1;
- else
- return 0;
-}
-
-int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk)
-{
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- ext4_lblk_t lblk_start, lblk_end;
- lblk_start = EXT4_LBLK_CMASK(sbi, lblk);
- lblk_end = lblk_start + sbi->s_cluster_ratio - 1;
-
- return ext4_find_delalloc_range(inode, lblk_start, lblk_end);
-}
-
-/**
- * Determines how many complete clusters (out of those specified by the 'map')
- * are under delalloc and were reserved quota for.
- * This function is called when we are writing out the blocks that were
- * originally written with their allocation delayed, but then the space was
- * allocated using fallocate() before the delayed allocation could be resolved.
- * The cases to look for are:
- * ('=' indicated delayed allocated blocks
- * '-' indicates non-delayed allocated blocks)
- * (a) partial clusters towards beginning and/or end outside of allocated range
- * are not delalloc'ed.
- * Ex:
- * |----c---=|====c====|====c====|===-c----|
- * |++++++ allocated ++++++|
- * ==> 4 complete clusters in above example
- *
- * (b) partial cluster (outside of allocated range) towards either end is
- * marked for delayed allocation. In this case, we will exclude that
- * cluster.
- * Ex:
- * |----====c========|========c========|
- * |++++++ allocated ++++++|
- * ==> 1 complete clusters in above example
- *
- * Ex:
- * |================c================|
- * |++++++ allocated ++++++|
- * ==> 0 complete clusters in above example
- *
- * The ext4_da_update_reserve_space will be called only if we
- * determine here that there were some "entire" clusters that span
- * this 'allocated' range.
- * In the non-bigalloc case, this function will just end up returning num_blks
- * without ever calling ext4_find_delalloc_range.
- */
-static unsigned int
-get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
- unsigned int num_blks)
-{
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- ext4_lblk_t alloc_cluster_start, alloc_cluster_end;
- ext4_lblk_t lblk_from, lblk_to, c_offset;
- unsigned int allocated_clusters = 0;
-
- alloc_cluster_start = EXT4_B2C(sbi, lblk_start);
- alloc_cluster_end = EXT4_B2C(sbi, lblk_start + num_blks - 1);
-
- /* max possible clusters for this allocation */
- allocated_clusters = alloc_cluster_end - alloc_cluster_start + 1;
-
- trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks);
-
- /* Check towards left side */
- c_offset = EXT4_LBLK_COFF(sbi, lblk_start);
- if (c_offset) {
- lblk_from = EXT4_LBLK_CMASK(sbi, lblk_start);
- lblk_to = lblk_from + c_offset - 1;
-
- if (ext4_find_delalloc_range(inode, lblk_from, lblk_to))
- allocated_clusters--;
- }
-
- /* Now check towards right. */
- c_offset = EXT4_LBLK_COFF(sbi, lblk_start + num_blks);
- if (allocated_clusters && c_offset) {
- lblk_from = lblk_start + num_blks;
- lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1;
-
- if (ext4_find_delalloc_range(inode, lblk_from, lblk_to))
- allocated_clusters--;
- }
-
- return allocated_clusters;
-}
-
static int
convert_initialized_extent(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map,
@@ -4108,23 +4062,6 @@ out:
}
map->m_len = allocated;
- /*
- * If we have done fallocate with the offset that is already
- * delayed allocated, we would have block reservation
- * and quota reservation done in the delayed write path.
- * But fallocate would have already updated quota and block
- * count for this offset. So cancel these reservation
- */
- if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
- unsigned int reserved_clusters;
- reserved_clusters = get_reserved_cluster_alloc(inode,
- map->m_lblk, map->m_len);
- if (reserved_clusters)
- ext4_da_update_reserve_space(inode,
- reserved_clusters,
- 0);
- }
-
map_out:
map->m_flags |= EXT4_MAP_MAPPED;
if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) {
@@ -4513,77 +4450,39 @@ got_allocated_blocks:
map->m_flags |= EXT4_MAP_NEW;
/*
- * Update reserved blocks/metadata blocks after successful
- * block allocation which had been deferred till now.
+ * Reduce the reserved cluster count to reflect successful deferred
+ * allocation of delayed allocated clusters or direct allocation of
+ * clusters discovered to be delayed allocated. Once allocated, a
+ * cluster is not included in the reserved count.
*/
- if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
- unsigned int reserved_clusters;
- /*
- * Check how many clusters we had reserved this allocated range
- */
- reserved_clusters = get_reserved_cluster_alloc(inode,
- map->m_lblk, allocated);
- if (!map_from_cluster) {
- BUG_ON(allocated_clusters < reserved_clusters);
- if (reserved_clusters < allocated_clusters) {
- struct ext4_inode_info *ei = EXT4_I(inode);
- int reservation = allocated_clusters -
- reserved_clusters;
- /*
- * It seems we claimed few clusters outside of
- * the range of this allocation. We should give
- * it back to the reservation pool. This can
- * happen in the following case:
- *
- * * Suppose s_cluster_ratio is 4 (i.e., each
- * cluster has 4 blocks. Thus, the clusters
- * are [0-3],[4-7],[8-11]...
- * * First comes delayed allocation write for
- * logical blocks 10 & 11. Since there were no
- * previous delayed allocated blocks in the
- * range [8-11], we would reserve 1 cluster
- * for this write.
- * * Next comes write for logical blocks 3 to 8.
- * In this case, we will reserve 2 clusters
- * (for [0-3] and [4-7]; and not for [8-11] as
- * that range has a delayed allocated blocks.
- * Thus total reserved clusters now becomes 3.
- * * Now, during the delayed allocation writeout
- * time, we will first write blocks [3-8] and
- * allocate 3 clusters for writing these
- * blocks. Also, we would claim all these
- * three clusters above.
- * * Now when we come here to writeout the
- * blocks [10-11], we would expect to claim
- * the reservation of 1 cluster we had made
- * (and we would claim it since there are no
- * more delayed allocated blocks in the range
- * [8-11]. But our reserved cluster count had
- * already gone to 0.
- *
- * Thus, at the step 4 above when we determine
- * that there are still some unwritten delayed
- * allocated blocks outside of our current
- * block range, we should increment the
- * reserved clusters count so that when the
- * remaining blocks finally gets written, we
- * could claim them.
- */
- dquot_reserve_block(inode,
- EXT4_C2B(sbi, reservation));
- spin_lock(&ei->i_block_reservation_lock);
- ei->i_reserved_data_blocks += reservation;
- spin_unlock(&ei->i_block_reservation_lock);
- }
+ if (test_opt(inode->i_sb, DELALLOC) && !map_from_cluster) {
+ if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
/*
- * We will claim quota for all newly allocated blocks.
- * We're updating the reserved space *after* the
- * correction above so we do not accidentally free
- * all the metadata reservation because we might
- * actually need it later on.
+ * When allocating delayed allocated clusters, simply
+ * reduce the reserved cluster count and claim quota
*/
ext4_da_update_reserve_space(inode, allocated_clusters,
1);
+ } else {
+ ext4_lblk_t lblk, len;
+ unsigned int n;
+
+ /*
+ * When allocating non-delayed allocated clusters
+ * (from fallocate, filemap, DIO, or clusters
+ * allocated when delalloc has been disabled by
+ * ext4_nonda_switch), reduce the reserved cluster
+ * count by the number of allocated clusters that
+ * have previously been delayed allocated. Quota
+ * has been claimed by ext4_mb_new_blocks() above,
+ * so release the quota reservations made for any
+ * previously delayed allocated clusters.
+ */
+ lblk = EXT4_LBLK_CMASK(sbi, map->m_lblk);
+ len = allocated_clusters << sbi->s_cluster_bits;
+ n = ext4_es_delayed_clu(inode, lblk, len);
+ if (n > 0)
+ ext4_da_update_reserve_space(inode, (int) n, 0);
}
}
@@ -5075,8 +4974,10 @@ static int ext4_find_delayed_extent(struct inode *inode,
ext4_lblk_t block, next_del;
if (newes->es_pblk == 0) {
- ext4_es_find_delayed_extent_range(inode, newes->es_lblk,
- newes->es_lblk + newes->es_len - 1, &es);
+ ext4_es_find_extent_range(inode, &ext4_es_is_delayed,
+ newes->es_lblk,
+ newes->es_lblk + newes->es_len - 1,
+ &es);
/*
* No extent in extent-tree contains block @newes->es_pblk,
@@ -5097,7 +4998,8 @@ static int ext4_find_delayed_extent(struct inode *inode,
}
block = newes->es_lblk + newes->es_len;
- ext4_es_find_delayed_extent_range(inode, block, EXT_MAX_BLOCKS, &es);
+ ext4_es_find_extent_range(inode, &ext4_es_is_delayed, block,
+ EXT_MAX_BLOCKS, &es);
if (es.es_len == 0)
next_del = EXT_MAX_BLOCKS;
else
@@ -5958,3 +5860,82 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
}
return replaced_count;
}
+
+/*
+ * ext4_clu_mapped - determine whether any block in a logical cluster has
+ * been mapped to a physical cluster
+ *
+ * @inode - file containing the logical cluster
+ * @lclu - logical cluster of interest
+ *
+ * Returns 1 if any block in the logical cluster is mapped, signifying
+ * that a physical cluster has been allocated for it. Otherwise,
+ * returns 0. Can also return negative error codes. Derived from
+ * ext4_ext_map_blocks().
+ */
+int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct ext4_ext_path *path;
+ int depth, mapped = 0, err = 0;
+ struct ext4_extent *extent;
+ ext4_lblk_t first_lblk, first_lclu, last_lclu;
+
+ /* search for the extent closest to the first block in the cluster */
+ path = ext4_find_extent(inode, EXT4_C2B(sbi, lclu), NULL, 0);
+ if (IS_ERR(path)) {
+ err = PTR_ERR(path);
+ path = NULL;
+ goto out;
+ }
+
+ depth = ext_depth(inode);
+
+ /*
+ * A consistent leaf must not be empty. This situation is possible,
+ * though, _during_ tree modification, and it's why an assert can't
+ * be put in ext4_find_extent().
+ */
+ if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
+ EXT4_ERROR_INODE(inode,
+ "bad extent address - lblock: %lu, depth: %d, pblock: %lld",
+ (unsigned long) EXT4_C2B(sbi, lclu),
+ depth, path[depth].p_block);
+ err = -EFSCORRUPTED;
+ goto out;
+ }
+
+ extent = path[depth].p_ext;
+
+ /* can't be mapped if the extent tree is empty */
+ if (extent == NULL)
+ goto out;
+
+ first_lblk = le32_to_cpu(extent->ee_block);
+ first_lclu = EXT4_B2C(sbi, first_lblk);
+
+ /*
+ * Three possible outcomes at this point - found extent spanning
+ * the target cluster, to the left of the target cluster, or to the
+ * right of the target cluster. The first two cases are handled here.
+ * The last case indicates the target cluster is not mapped.
+ */
+ if (lclu >= first_lclu) {
+ last_lclu = EXT4_B2C(sbi, first_lblk +
+ ext4_ext_get_actual_len(extent) - 1);
+ if (lclu <= last_lclu) {
+ mapped = 1;
+ } else {
+ first_lblk = ext4_ext_next_allocated_block(path);
+ first_lclu = EXT4_B2C(sbi, first_lblk);
+ if (lclu == first_lclu)
+ mapped = 1;
+ }
+ }
+
+out:
+ ext4_ext_drop_refs(path);
+ kfree(path);
+
+ return err ? err : mapped;
+}
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index c4e6fb15101b..2b439afafe13 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -142,6 +142,7 @@
*/
static struct kmem_cache *ext4_es_cachep;
+static struct kmem_cache *ext4_pending_cachep;
static int __es_insert_extent(struct inode *inode, struct extent_status *newes);
static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
@@ -149,6 +150,8 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan);
static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
struct ext4_inode_info *locked_ei);
+static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len);
int __init ext4_init_es(void)
{
@@ -233,30 +236,38 @@ static struct extent_status *__es_tree_search(struct rb_root *root,
}
/*
- * ext4_es_find_delayed_extent_range: find the 1st delayed extent covering
- * @es->lblk if it exists, otherwise, the next extent after @es->lblk.
+ * ext4_es_find_extent_range - find extent with specified status within block
+ * range or next extent following block range in
+ * extents status tree
*
- * @inode: the inode which owns delayed extents
- * @lblk: the offset where we start to search
- * @end: the offset where we stop to search
- * @es: delayed extent that we found
+ * @inode - file containing the range
+ * @matching_fn - pointer to function that matches extents with desired status
+ * @lblk - logical block defining start of range
+ * @end - logical block defining end of range
+ * @es - extent found, if any
+ *
+ * Find the first extent within the block range specified by @lblk and @end
+ * in the extents status tree that satisfies @matching_fn. If a match
+ * is found, it's returned in @es. If not, and a matching extent is found
+ * beyond the block range, it's returned in @es. If no match is found, an
+ * extent is returned in @es whose es_lblk, es_len, and es_pblk components
+ * are 0.
*/
-void ext4_es_find_delayed_extent_range(struct inode *inode,
- ext4_lblk_t lblk, ext4_lblk_t end,
- struct extent_status *es)
+static void __es_find_extent_range(struct inode *inode,
+ int (*matching_fn)(struct extent_status *es),
+ ext4_lblk_t lblk, ext4_lblk_t end,
+ struct extent_status *es)
{
struct ext4_es_tree *tree = NULL;
struct extent_status *es1 = NULL;
struct rb_node *node;
- BUG_ON(es == NULL);
- BUG_ON(end < lblk);
- trace_ext4_es_find_delayed_extent_range_enter(inode, lblk);
+ WARN_ON(es == NULL);
+ WARN_ON(end < lblk);
- read_lock(&EXT4_I(inode)->i_es_lock);
tree = &EXT4_I(inode)->i_es_tree;
- /* find extent in cache firstly */
+ /* see if the extent has been cached */
es->es_lblk = es->es_len = es->es_pblk = 0;
if (tree->cache_es) {
es1 = tree->cache_es;
@@ -271,28 +282,133 @@ void ext4_es_find_delayed_extent_range(struct inode *inode,
es1 = __es_tree_search(&tree->root, lblk);
out:
- if (es1 && !ext4_es_is_delayed(es1)) {
+ if (es1 && !matching_fn(es1)) {
while ((node = rb_next(&es1->rb_node)) != NULL) {
es1 = rb_entry(node, struct extent_status, rb_node);
if (es1->es_lblk > end) {
es1 = NULL;
break;
}
- if (ext4_es_is_delayed(es1))
+ if (matching_fn(es1))
break;
}
}
- if (es1 && ext4_es_is_delayed(es1)) {
+ if (es1 && matching_fn(es1)) {
tree->cache_es = es1;
es->es_lblk = es1->es_lblk;
es->es_len = es1->es_len;
es->es_pblk = es1->es_pblk;
}
+}
+
+/*
+ * Locking for __es_find_extent_range() for external use
+ */
+void ext4_es_find_extent_range(struct inode *inode,
+ int (*matching_fn)(struct extent_status *es),
+ ext4_lblk_t lblk, ext4_lblk_t end,
+ struct extent_status *es)
+{
+ trace_ext4_es_find_extent_range_enter(inode, lblk);
+
+ read_lock(&EXT4_I(inode)->i_es_lock);
+ __es_find_extent_range(inode, matching_fn, lblk, end, es);
+ read_unlock(&EXT4_I(inode)->i_es_lock);
+
+ trace_ext4_es_find_extent_range_exit(inode, es);
+}
+
+/*
+ * __es_scan_range - search block range for block with specified status
+ * in extents status tree
+ *
+ * @inode - file containing the range
+ * @matching_fn - pointer to function that matches extents with desired status
+ * @lblk - logical block defining start of range
+ * @end - logical block defining end of range
+ *
+ * Returns true if at least one block in the specified block range satisfies
+ * the criterion specified by @matching_fn, and false if not. If at least
+ * one extent has the specified status, then there is at least one block
+ * in the cluster with that status. Should only be called by code that has
+ * taken i_es_lock.
+ */
+static bool __es_scan_range(struct inode *inode,
+ int (*matching_fn)(struct extent_status *es),
+ ext4_lblk_t start, ext4_lblk_t end)
+{
+ struct extent_status es;
+
+ __es_find_extent_range(inode, matching_fn, start, end, &es);
+ if (es.es_len == 0)
+ return false; /* no matching extent in the tree */
+ else if (es.es_lblk <= start &&
+ start < es.es_lblk + es.es_len)
+ return true;
+ else if (start <= es.es_lblk && es.es_lblk <= end)
+ return true;
+ else
+ return false;
+}
+/*
+ * Locking for __es_scan_range() for external use
+ */
+bool ext4_es_scan_range(struct inode *inode,
+ int (*matching_fn)(struct extent_status *es),
+ ext4_lblk_t lblk, ext4_lblk_t end)
+{
+ bool ret;
+
+ read_lock(&EXT4_I(inode)->i_es_lock);
+ ret = __es_scan_range(inode, matching_fn, lblk, end);
read_unlock(&EXT4_I(inode)->i_es_lock);
- trace_ext4_es_find_delayed_extent_range_exit(inode, es);
+ return ret;
+}
+
+/*
+ * __es_scan_clu - search cluster for block with specified status in
+ * extents status tree
+ *
+ * @inode - file containing the cluster
+ * @matching_fn - pointer to function that matches extents with desired status
+ * @lblk - logical block in cluster to be searched
+ *
+ * Returns true if at least one extent in the cluster containing @lblk
+ * satisfies the criterion specified by @matching_fn, and false if not. If at
+ * least one extent has the specified status, then there is at least one block
+ * in the cluster with that status. Should only be called by code that has
+ * taken i_es_lock.
+ */
+static bool __es_scan_clu(struct inode *inode,
+ int (*matching_fn)(struct extent_status *es),
+ ext4_lblk_t lblk)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ ext4_lblk_t lblk_start, lblk_end;
+
+ lblk_start = EXT4_LBLK_CMASK(sbi, lblk);
+ lblk_end = lblk_start + sbi->s_cluster_ratio - 1;
+
+ return __es_scan_range(inode, matching_fn, lblk_start, lblk_end);
+}
+
+/*
+ * Locking for __es_scan_clu() for external use
+ */
+bool ext4_es_scan_clu(struct inode *inode,
+ int (*matching_fn)(struct extent_status *es),
+ ext4_lblk_t lblk)
+{
+ bool ret;
+
+ read_lock(&EXT4_I(inode)->i_es_lock);
+ ret = __es_scan_clu(inode, matching_fn, lblk);
+ read_unlock(&EXT4_I(inode)->i_es_lock);
+
+ return ret;
}
static void ext4_es_list_add(struct inode *inode)
@@ -694,6 +810,7 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
struct extent_status newes;
ext4_lblk_t end = lblk + len - 1;
int err = 0;
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n",
lblk, len, pblk, status, inode->i_ino);
@@ -730,6 +847,11 @@ retry:
if (err == -ENOMEM && !ext4_es_is_delayed(&newes))
err = 0;
+ if (sbi->s_cluster_ratio > 1 && test_opt(inode->i_sb, DELALLOC) &&
+ (status & EXTENT_STATUS_WRITTEN ||
+ status & EXTENT_STATUS_UNWRITTEN))
+ __revise_pending(inode, lblk, len);
+
error:
write_unlock(&EXT4_I(inode)->i_es_lock);
@@ -1252,3 +1374,499 @@ static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan)
ei->i_es_tree.cache_es = NULL;
return nr_shrunk;
}
+
+#ifdef ES_DEBUG__
+static void ext4_print_pending_tree(struct inode *inode)
+{
+ struct ext4_pending_tree *tree;
+ struct rb_node *node;
+ struct pending_reservation *pr;
+
+ printk(KERN_DEBUG "pending reservations for inode %lu:", inode->i_ino);
+ tree = &EXT4_I(inode)->i_pending_tree;
+ node = rb_first(&tree->root);
+ while (node) {
+ pr = rb_entry(node, struct pending_reservation, rb_node);
+ printk(KERN_DEBUG " %u", pr->lclu);
+ node = rb_next(node);
+ }
+ printk(KERN_DEBUG "\n");
+}
+#else
+#define ext4_print_pending_tree(inode)
+#endif
+
+int __init ext4_init_pending(void)
+{
+ ext4_pending_cachep = kmem_cache_create("ext4_pending_reservation",
+ sizeof(struct pending_reservation),
+ 0, (SLAB_RECLAIM_ACCOUNT), NULL);
+ if (ext4_pending_cachep == NULL)
+ return -ENOMEM;
+ return 0;
+}
+
+void ext4_exit_pending(void)
+{
+ kmem_cache_destroy(ext4_pending_cachep);
+}
+
+void ext4_init_pending_tree(struct ext4_pending_tree *tree)
+{
+ tree->root = RB_ROOT;
+}
+
+/*
+ * __get_pending - retrieve a pointer to a pending reservation
+ *
+ * @inode - file containing the pending cluster reservation
+ * @lclu - logical cluster of interest
+ *
+ * Returns a pointer to a pending reservation if it's a member of
+ * the set, and NULL if not. Must be called holding i_es_lock.
+ */
+static struct pending_reservation *__get_pending(struct inode *inode,
+ ext4_lblk_t lclu)
+{
+ struct ext4_pending_tree *tree;
+ struct rb_node *node;
+ struct pending_reservation *pr = NULL;
+
+ tree = &EXT4_I(inode)->i_pending_tree;
+ node = (&tree->root)->rb_node;
+
+ while (node) {
+ pr = rb_entry(node, struct pending_reservation, rb_node);
+ if (lclu < pr->lclu)
+ node = node->rb_left;
+ else if (lclu > pr->lclu)
+ node = node->rb_right;
+ else if (lclu == pr->lclu)
+ return pr;
+ }
+ return NULL;
+}
+
+/*
+ * __insert_pending - adds a pending cluster reservation to the set of
+ * pending reservations
+ *
+ * @inode - file containing the cluster
+ * @lblk - logical block in the cluster to be added
+ *
+ * Returns 0 on successful insertion and -ENOMEM on failure. If the
+ * pending reservation is already in the set, returns successfully.
+ */
+static int __insert_pending(struct inode *inode, ext4_lblk_t lblk)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree;
+ struct rb_node **p = &tree->root.rb_node;
+ struct rb_node *parent = NULL;
+ struct pending_reservation *pr;
+ ext4_lblk_t lclu;
+ int ret = 0;
+
+ lclu = EXT4_B2C(sbi, lblk);
+ /* search to find parent for insertion */
+ while (*p) {
+ parent = *p;
+ pr = rb_entry(parent, struct pending_reservation, rb_node);
+
+ if (lclu < pr->lclu) {
+ p = &(*p)->rb_left;
+ } else if (lclu > pr->lclu) {
+ p = &(*p)->rb_right;
+ } else {
+ /* pending reservation already inserted */
+ goto out;
+ }
+ }
+
+ pr = kmem_cache_alloc(ext4_pending_cachep, GFP_ATOMIC);
+ if (pr == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ pr->lclu = lclu;
+
+ rb_link_node(&pr->rb_node, parent, p);
+ rb_insert_color(&pr->rb_node, &tree->root);
+
+out:
+ return ret;
+}
+
+/*
+ * __remove_pending - removes a pending cluster reservation from the set
+ * of pending reservations
+ *
+ * @inode - file containing the cluster
+ * @lblk - logical block in the pending cluster reservation to be removed
+ *
+ * Returns successfully if pending reservation is not a member of the set.
+ */
+static void __remove_pending(struct inode *inode, ext4_lblk_t lblk)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct pending_reservation *pr;
+ struct ext4_pending_tree *tree;
+
+ pr = __get_pending(inode, EXT4_B2C(sbi, lblk));
+ if (pr != NULL) {
+ tree = &EXT4_I(inode)->i_pending_tree;
+ rb_erase(&pr->rb_node, &tree->root);
+ kmem_cache_free(ext4_pending_cachep, pr);
+ }
+}
+
+/*
+ * ext4_remove_pending - removes a pending cluster reservation from the set
+ * of pending reservations
+ *
+ * @inode - file containing the cluster
+ * @lblk - logical block in the pending cluster reservation to be removed
+ *
+ * Locking for external use of __remove_pending.
+ */
+void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+
+ write_lock(&ei->i_es_lock);
+ __remove_pending(inode, lblk);
+ write_unlock(&ei->i_es_lock);
+}
+
+/*
+ * ext4_is_pending - determine whether a cluster has a pending reservation
+ * on it
+ *
+ * @inode - file containing the cluster
+ * @lblk - logical block in the cluster
+ *
+ * Returns true if there's a pending reservation for the cluster in the
+ * set of pending reservations, and false if not.
+ */
+bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ bool ret;
+
+ read_lock(&ei->i_es_lock);
+ ret = (bool)(__get_pending(inode, EXT4_B2C(sbi, lblk)) != NULL);
+ read_unlock(&ei->i_es_lock);
+
+ return ret;
+}
+
+/*
+ * ext4_es_insert_delayed_block - adds a delayed block to the extents status
+ * tree, adding a pending reservation where
+ * needed
+ *
+ * @inode - file containing the newly added block
+ * @lblk - logical block to be added
+ * @allocated - indicates whether a physical cluster has been allocated for
+ * the logical cluster that contains the block
+ *
+ * Returns 0 on success, negative error code on failure.
+ */
+int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
+ bool allocated)
+{
+ struct extent_status newes;
+ int err = 0;
+
+ es_debug("add [%u/1) delayed to extent status tree of inode %lu\n",
+ lblk, inode->i_ino);
+
+ newes.es_lblk = lblk;
+ newes.es_len = 1;
+ ext4_es_store_pblock_status(&newes, ~0, EXTENT_STATUS_DELAYED);
+ trace_ext4_es_insert_delayed_block(inode, &newes, allocated);
+
+ ext4_es_insert_extent_check(inode, &newes);
+
+ write_lock(&EXT4_I(inode)->i_es_lock);
+
+ err = __es_remove_extent(inode, lblk, lblk);
+ if (err != 0)
+ goto error;
+retry:
+ err = __es_insert_extent(inode, &newes);
+ if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb),
+ 128, EXT4_I(inode)))
+ goto retry;
+ if (err != 0)
+ goto error;
+
+ if (allocated)
+ __insert_pending(inode, lblk);
+
+error:
+ write_unlock(&EXT4_I(inode)->i_es_lock);
+
+ ext4_es_print_tree(inode);
+ ext4_print_pending_tree(inode);
+
+ return err;
+}
+
+/*
+ * __es_delayed_clu - count number of clusters containing blocks that
+ * are delayed only
+ *
+ * @inode - file containing block range
+ * @start - logical block defining start of range
+ * @end - logical block defining end of range
+ *
+ * Returns the number of clusters containing only delayed (not delayed
+ * and unwritten) blocks in the range specified by @start and @end. Any
+ * cluster or part of a cluster within the range and containing a delayed
+ * and not unwritten block within the range is counted as a whole cluster.
+ */
+static unsigned int __es_delayed_clu(struct inode *inode, ext4_lblk_t start,
+ ext4_lblk_t end)
+{
+ struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
+ struct extent_status *es;
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct rb_node *node;
+ ext4_lblk_t first_lclu, last_lclu;
+ unsigned long long last_counted_lclu;
+ unsigned int n = 0;
+
+ /* guaranteed to be unequal to any ext4_lblk_t value */
+ last_counted_lclu = ~0ULL;
+
+ es = __es_tree_search(&tree->root, start);
+
+ while (es && (es->es_lblk <= end)) {
+ if (ext4_es_is_delonly(es)) {
+ if (es->es_lblk <= start)
+ first_lclu = EXT4_B2C(sbi, start);
+ else
+ first_lclu = EXT4_B2C(sbi, es->es_lblk);
+
+ if (ext4_es_end(es) >= end)
+ last_lclu = EXT4_B2C(sbi, end);
+ else
+ last_lclu = EXT4_B2C(sbi, ext4_es_end(es));
+
+ if (first_lclu == last_counted_lclu)
+ n += last_lclu - first_lclu;
+ else
+ n += last_lclu - first_lclu + 1;
+ last_counted_lclu = last_lclu;
+ }
+ node = rb_next(&es->rb_node);
+ if (!node)
+ break;
+ es = rb_entry(node, struct extent_status, rb_node);
+ }
+
+ return n;
+}
+
+/*
+ * ext4_es_delayed_clu - count number of clusters containing blocks that
+ * are both delayed and unwritten
+ *
+ * @inode - file containing block range
+ * @lblk - logical block defining start of range
+ * @len - number of blocks in range
+ *
+ * Locking for external use of __es_delayed_clu().
+ */
+unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ ext4_lblk_t end;
+ unsigned int n;
+
+ if (len == 0)
+ return 0;
+
+ end = lblk + len - 1;
+ WARN_ON(end < lblk);
+
+ read_lock(&ei->i_es_lock);
+
+ n = __es_delayed_clu(inode, lblk, end);
+
+ read_unlock(&ei->i_es_lock);
+
+ return n;
+}
+
+/*
+ * __revise_pending - makes, cancels, or leaves unchanged pending cluster
+ * reservations for a specified block range depending
+ * upon the presence or absence of delayed blocks
+ * outside the range within clusters at the ends of the
+ * range
+ *
+ * @inode - file containing the range
+ * @lblk - logical block defining the start of range
+ * @len - length of range in blocks
+ *
+ * Used after a newly allocated extent is added to the extents status tree.
+ * Requires that the extents in the range have either written or unwritten
+ * status. Must be called while holding i_es_lock.
+ */
+static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ ext4_lblk_t end = lblk + len - 1;
+ ext4_lblk_t first, last;
+ bool f_del = false, l_del = false;
+
+ if (len == 0)
+ return;
+
+ /*
+ * Two cases - block range within single cluster and block range
+ * spanning two or more clusters. Note that a cluster belonging
+ * to a range starting and/or ending on a cluster boundary is treated
+ * as if it does not contain a delayed extent. The new range may
+ * have allocated space for previously delayed blocks out to the
+ * cluster boundary, requiring that any pre-existing pending
+ * reservation be canceled. Because this code only looks at blocks
+ * outside the range, it should revise pending reservations
+ * correctly even if the extent represented by the range can't be
+ * inserted in the extents status tree due to ENOSPC.
+ */
+
+ if (EXT4_B2C(sbi, lblk) == EXT4_B2C(sbi, end)) {
+ first = EXT4_LBLK_CMASK(sbi, lblk);
+ if (first != lblk)
+ f_del = __es_scan_range(inode, &ext4_es_is_delonly,
+ first, lblk - 1);
+ if (f_del) {
+ __insert_pending(inode, first);
+ } else {
+ last = EXT4_LBLK_CMASK(sbi, end) +
+ sbi->s_cluster_ratio - 1;
+ if (last != end)
+ l_del = __es_scan_range(inode,
+ &ext4_es_is_delonly,
+ end + 1, last);
+ if (l_del)
+ __insert_pending(inode, last);
+ else
+ __remove_pending(inode, last);
+ }
+ } else {
+ first = EXT4_LBLK_CMASK(sbi, lblk);
+ if (first != lblk)
+ f_del = __es_scan_range(inode, &ext4_es_is_delonly,
+ first, lblk - 1);
+ if (f_del)
+ __insert_pending(inode, first);
+ else
+ __remove_pending(inode, first);
+
+ last = EXT4_LBLK_CMASK(sbi, end) + sbi->s_cluster_ratio - 1;
+ if (last != end)
+ l_del = __es_scan_range(inode, &ext4_es_is_delonly,
+ end + 1, last);
+ if (l_del)
+ __insert_pending(inode, last);
+ else
+ __remove_pending(inode, last);
+ }
+}
+
+/*
+ * ext4_es_remove_blks - remove block range from extents status tree and
+ * reduce reservation count or cancel pending
+ * reservation as needed
+ *
+ * @inode - file containing range
+ * @lblk - first block in range
+ * @len - number of blocks to remove
+ *
+ */
+void ext4_es_remove_blks(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ unsigned int clu_size, reserved = 0;
+ ext4_lblk_t last_lclu, first, length, remainder, last;
+ bool delonly;
+ int err = 0;
+ struct pending_reservation *pr;
+ struct ext4_pending_tree *tree;
+
+ /*
+ * Process cluster by cluster for bigalloc - there may be up to
+ * two clusters in a 4k page with a 1k block size and two blocks
+ * per cluster. Also necessary for systems with larger page sizes
+ * and potentially larger block sizes.
+ */
+ clu_size = sbi->s_cluster_ratio;
+ last_lclu = EXT4_B2C(sbi, lblk + len - 1);
+
+ write_lock(&EXT4_I(inode)->i_es_lock);
+
+ for (first = lblk, remainder = len;
+ remainder > 0;
+ first += length, remainder -= length) {
+
+ if (EXT4_B2C(sbi, first) == last_lclu)
+ length = remainder;
+ else
+ length = clu_size - EXT4_LBLK_COFF(sbi, first);
+
+ /*
+ * The BH_Delay flag, which triggers calls to this function,
+ * and the contents of the extents status tree can be
+ * inconsistent due to writepages activity. So, note whether
+ * the blocks to be removed actually belong to an extent with
+ * delayed only status.
+ */
+ delonly = __es_scan_clu(inode, &ext4_es_is_delonly, first);
+
+ /*
+ * because of the writepages effect, written and unwritten
+ * blocks could be removed here
+ */
+ last = first + length - 1;
+ err = __es_remove_extent(inode, first, last);
+ if (err)
+ ext4_warning(inode->i_sb,
+ "%s: couldn't remove page (err = %d)",
+ __func__, err);
+
+ /* non-bigalloc case: simply count the cluster for release */
+ if (sbi->s_cluster_ratio == 1 && delonly) {
+ reserved++;
+ continue;
+ }
+
+ /*
+ * bigalloc case: if all delayed allocated only blocks have
+ * just been removed from a cluster, either cancel a pending
+ * reservation if it exists or count a cluster for release
+ */
+ if (delonly &&
+ !__es_scan_clu(inode, &ext4_es_is_delonly, first)) {
+ pr = __get_pending(inode, EXT4_B2C(sbi, first));
+ if (pr != NULL) {
+ tree = &EXT4_I(inode)->i_pending_tree;
+ rb_erase(&pr->rb_node, &tree->root);
+ kmem_cache_free(ext4_pending_cachep, pr);
+ } else {
+ reserved++;
+ }
+ }
+ }
+
+ write_unlock(&EXT4_I(inode)->i_es_lock);
+
+ ext4_da_release_space(inode, reserved);
+}
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index 8efdeb903d6b..131a8b7df265 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -78,6 +78,51 @@ struct ext4_es_stats {
struct percpu_counter es_stats_shk_cnt;
};
+/*
+ * Pending cluster reservations for bigalloc file systems
+ *
+ * A cluster with a pending reservation is a logical cluster shared by at
+ * least one extent in the extents status tree with delayed and unwritten
+ * status and at least one other written or unwritten extent. The
+ * reservation is said to be pending because a cluster reservation would
+ * have to be taken in the event all blocks in the cluster shared with
+ * written or unwritten extents were deleted while the delayed and
+ * unwritten blocks remained.
+ *
+ * The set of pending cluster reservations is an auxiliary data structure
+ * used with the extents status tree to implement reserved cluster/block
+ * accounting for bigalloc file systems. The set is kept in memory and
+ * records all pending cluster reservations.
+ *
+ * Its primary function is to avoid the need to read extents from the
+ * disk when invalidating pages as a result of a truncate, punch hole, or
+ * collapse range operation. Page invalidation requires a decrease in the
+ * reserved cluster count if it results in the removal of all delayed
+ * and unwritten extents (blocks) from a cluster that is not shared with a
+ * written or unwritten extent, and no decrease otherwise. Determining
+ * whether the cluster is shared can be done by searching for a pending
+ * reservation on it.
+ *
+ * Secondarily, it provides a potentially faster method for determining
+ * whether the reserved cluster count should be increased when a physical
+ * cluster is deallocated as a result of a truncate, punch hole, or
+ * collapse range operation. The necessary information is also present
+ * in the extents status tree, but might be more rapidly accessed in
+ * the pending reservation set in many cases due to smaller size.
+ *
+ * The pending cluster reservation set is implemented as a red-black tree
+ * with the goal of minimizing per page search time overhead.
+ */
+
+struct pending_reservation {
+ struct rb_node rb_node;
+ ext4_lblk_t lclu;
+};
+
+struct ext4_pending_tree {
+ struct rb_root root;
+};
+
extern int __init ext4_init_es(void);
extern void ext4_exit_es(void);
extern void ext4_es_init_tree(struct ext4_es_tree *tree);
@@ -90,11 +135,18 @@ extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
unsigned int status);
extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len);
-extern void ext4_es_find_delayed_extent_range(struct inode *inode,
- ext4_lblk_t lblk, ext4_lblk_t end,
- struct extent_status *es);
+extern void ext4_es_find_extent_range(struct inode *inode,
+ int (*match_fn)(struct extent_status *es),
+ ext4_lblk_t lblk, ext4_lblk_t end,
+ struct extent_status *es);
extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
struct extent_status *es);
+extern bool ext4_es_scan_range(struct inode *inode,
+ int (*matching_fn)(struct extent_status *es),
+ ext4_lblk_t lblk, ext4_lblk_t end);
+extern bool ext4_es_scan_clu(struct inode *inode,
+ int (*matching_fn)(struct extent_status *es),
+ ext4_lblk_t lblk);
static inline unsigned int ext4_es_status(struct extent_status *es)
{
@@ -126,6 +178,16 @@ static inline int ext4_es_is_hole(struct extent_status *es)
return (ext4_es_type(es) & EXTENT_STATUS_HOLE) != 0;
}
+static inline int ext4_es_is_mapped(struct extent_status *es)
+{
+ return (ext4_es_is_written(es) || ext4_es_is_unwritten(es));
+}
+
+static inline int ext4_es_is_delonly(struct extent_status *es)
+{
+ return (ext4_es_is_delayed(es) && !ext4_es_is_unwritten(es));
+}
+
static inline void ext4_es_set_referenced(struct extent_status *es)
{
es->es_pblk |= ((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT;
@@ -175,4 +237,16 @@ extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);
extern int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v);
+extern int __init ext4_init_pending(void);
+extern void ext4_exit_pending(void);
+extern void ext4_init_pending_tree(struct ext4_pending_tree *tree);
+extern void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk);
+extern bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk);
+extern int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
+ bool allocated);
+extern unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len);
+extern void ext4_es_remove_blks(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len);
+
#endif /* _EXT4_EXTENTS_STATUS_H */
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 7b4736022761..9c4bac18cc6c 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -863,7 +863,7 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping,
handle_t *handle;
struct page *page;
struct ext4_iloc iloc;
- int retries;
+ int retries = 0;
ret = ext4_get_inode_loc(inode, &iloc);
if (ret)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index d767e993591d..05f01fbd9c7f 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -577,8 +577,8 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
!(status & EXTENT_STATUS_WRITTEN) &&
- ext4_find_delalloc_range(inode, map->m_lblk,
- map->m_lblk + map->m_len - 1))
+ ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk,
+ map->m_lblk + map->m_len - 1))
status |= EXTENT_STATUS_DELAYED;
ret = ext4_es_insert_extent(inode, map->m_lblk,
map->m_len, map->m_pblk, status);
@@ -701,8 +701,8 @@ found:
EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
!(status & EXTENT_STATUS_WRITTEN) &&
- ext4_find_delalloc_range(inode, map->m_lblk,
- map->m_lblk + map->m_len - 1))
+ ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk,
+ map->m_lblk + map->m_len - 1))
status |= EXTENT_STATUS_DELAYED;
ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
map->m_pblk, status);
@@ -1595,7 +1595,7 @@ static int ext4_da_reserve_space(struct inode *inode)
return 0; /* success */
}
-static void ext4_da_release_space(struct inode *inode, int to_free)
+void ext4_da_release_space(struct inode *inode, int to_free)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_inode_info *ei = EXT4_I(inode);
@@ -1634,13 +1634,11 @@ static void ext4_da_page_release_reservation(struct page *page,
unsigned int offset,
unsigned int length)
{
- int to_release = 0, contiguous_blks = 0;
+ int contiguous_blks = 0;
struct buffer_head *head, *bh;
unsigned int curr_off = 0;
struct inode *inode = page->mapping->host;
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
unsigned int stop = offset + length;
- int num_clusters;
ext4_fsblk_t lblk;
BUG_ON(stop > PAGE_SIZE || stop < length);
@@ -1654,7 +1652,6 @@ static void ext4_da_page_release_reservation(struct page *page,
break;
if ((offset <= curr_off) && (buffer_delay(bh))) {
- to_release++;
contiguous_blks++;
clear_buffer_delay(bh);
} else if (contiguous_blks) {
@@ -1662,7 +1659,7 @@ static void ext4_da_page_release_reservation(struct page *page,
(PAGE_SHIFT - inode->i_blkbits);
lblk += (curr_off >> inode->i_blkbits) -
contiguous_blks;
- ext4_es_remove_extent(inode, lblk, contiguous_blks);
+ ext4_es_remove_blks(inode, lblk, contiguous_blks);
contiguous_blks = 0;
}
curr_off = next_off;
@@ -1671,21 +1668,9 @@ static void ext4_da_page_release_reservation(struct page *page,
if (contiguous_blks) {
lblk = page->index << (PAGE_SHIFT - inode->i_blkbits);
lblk += (curr_off >> inode->i_blkbits) - contiguous_blks;
- ext4_es_remove_extent(inode, lblk, contiguous_blks);
+ ext4_es_remove_blks(inode, lblk, contiguous_blks);
}
- /* If we have released all the blocks belonging to a cluster, then we
- * need to release the reserved space for that cluster. */
- num_clusters = EXT4_NUM_B2C(sbi, to_release);
- while (num_clusters > 0) {
- lblk = (page->index << (PAGE_SHIFT - inode->i_blkbits)) +
- ((num_clusters - 1) << sbi->s_cluster_bits);
- if (sbi->s_cluster_ratio == 1 ||
- !ext4_find_delalloc_cluster(inode, lblk))
- ext4_da_release_space(inode, 1);
-
- num_clusters--;
- }
}
/*
@@ -1781,6 +1766,65 @@ static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
}
/*
+ * ext4_insert_delayed_block - adds a delayed block to the extents status
+ * tree, incrementing the reserved cluster/block
+ * count or making a pending reservation
+ * where needed
+ *
+ * @inode - file containing the newly added block
+ * @lblk - logical block to be added
+ *
+ * Returns 0 on success, negative error code on failure.
+ */
+static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ int ret;
+ bool allocated = false;
+
+ /*
+ * If the cluster containing lblk is shared with a delayed,
+ * written, or unwritten extent in a bigalloc file system, it's
+ * already been accounted for and does not need to be reserved.
+ * A pending reservation must be made for the cluster if it's
+ * shared with a written or unwritten extent and doesn't already
+ * have one. Written and unwritten extents can be purged from the
+ * extents status tree if the system is under memory pressure, so
+ * it's necessary to examine the extent tree if a search of the
+ * extents status tree doesn't get a match.
+ */
+ if (sbi->s_cluster_ratio == 1) {
+ ret = ext4_da_reserve_space(inode);
+ if (ret != 0) /* ENOSPC */
+ goto errout;
+ } else { /* bigalloc */
+ if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) {
+ if (!ext4_es_scan_clu(inode,
+ &ext4_es_is_mapped, lblk)) {
+ ret = ext4_clu_mapped(inode,
+ EXT4_B2C(sbi, lblk));
+ if (ret < 0)
+ goto errout;
+ if (ret == 0) {
+ ret = ext4_da_reserve_space(inode);
+ if (ret != 0) /* ENOSPC */
+ goto errout;
+ } else {
+ allocated = true;
+ }
+ } else {
+ allocated = true;
+ }
+ }
+ }
+
+ ret = ext4_es_insert_delayed_block(inode, lblk, allocated);
+
+errout:
+ return ret;
+}
+
+/*
* This function is grabs code from the very beginning of
* ext4_map_blocks, but assumes that the caller is from delayed write
* time. This function looks up the requested blocks and sets the
@@ -1859,28 +1903,14 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
add_delayed:
if (retval == 0) {
int ret;
+
/*
* XXX: __block_prepare_write() unmaps passed block,
* is it OK?
*/
- /*
- * If the block was allocated from previously allocated cluster,
- * then we don't need to reserve it again. However we still need
- * to reserve metadata for every block we're going to write.
- */
- if (EXT4_SB(inode->i_sb)->s_cluster_ratio == 1 ||
- !ext4_find_delalloc_cluster(inode, map->m_lblk)) {
- ret = ext4_da_reserve_space(inode);
- if (ret) {
- /* not enough space to reserve */
- retval = ret;
- goto out_unlock;
- }
- }
- ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
- ~0, EXTENT_STATUS_DELAYED);
- if (ret) {
+ ret = ext4_insert_delayed_block(inode, map->m_lblk);
+ if (ret != 0) {
retval = ret;
goto out_unlock;
}
@@ -2613,7 +2643,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
long left = mpd->wbc->nr_to_write;
pgoff_t index = mpd->first_page;
pgoff_t end = mpd->last_page;
- int tag;
+ xa_mark_t tag;
int i, err = 0;
int blkbits = mpd->inode->i_blkbits;
ext4_lblk_t lblk;
@@ -3450,7 +3480,8 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
ext4_lblk_t end = map.m_lblk + map.m_len - 1;
struct extent_status es;
- ext4_es_find_delayed_extent_range(inode, map.m_lblk, end, &es);
+ ext4_es_find_extent_range(inode, &ext4_es_is_delayed,
+ map.m_lblk, end, &es);
if (!es.es_len || es.es_lblk > end) {
/* entire range is a hole */
@@ -6153,13 +6184,14 @@ static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh)
return !buffer_mapped(bh);
}
-int ext4_page_mkwrite(struct vm_fault *vmf)
+vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct page *page = vmf->page;
loff_t size;
unsigned long len;
- int ret;
+ int err;
+ vm_fault_t ret;
struct file *file = vma->vm_file;
struct inode *inode = file_inode(file);
struct address_space *mapping = inode->i_mapping;
@@ -6172,8 +6204,8 @@ int ext4_page_mkwrite(struct vm_fault *vmf)
down_read(&EXT4_I(inode)->i_mmap_sem);
- ret = ext4_convert_inline_data(inode);
- if (ret)
+ err = ext4_convert_inline_data(inode);
+ if (err)
goto out_ret;
/* Delalloc case is easy... */
@@ -6181,9 +6213,9 @@ int ext4_page_mkwrite(struct vm_fault *vmf)
!ext4_should_journal_data(inode) &&
!ext4_nonda_switch(inode->i_sb)) {
do {
- ret = block_page_mkwrite(vma, vmf,
+ err = block_page_mkwrite(vma, vmf,
ext4_da_get_block_prep);
- } while (ret == -ENOSPC &&
+ } while (err == -ENOSPC &&
ext4_should_retry_alloc(inode->i_sb, &retries));
goto out_ret;
}
@@ -6228,8 +6260,8 @@ retry_alloc:
ret = VM_FAULT_SIGBUS;
goto out;
}
- ret = block_page_mkwrite(vma, vmf, get_block);
- if (!ret && ext4_should_journal_data(inode)) {
+ err = block_page_mkwrite(vma, vmf, get_block);
+ if (!err && ext4_should_journal_data(inode)) {
if (ext4_walk_page_buffers(handle, page_buffers(page), 0,
PAGE_SIZE, NULL, do_journal_get_write_access)) {
unlock_page(page);
@@ -6240,24 +6272,24 @@ retry_alloc:
ext4_set_inode_state(inode, EXT4_STATE_JDATA);
}
ext4_journal_stop(handle);
- if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+ if (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
goto retry_alloc;
out_ret:
- ret = block_page_mkwrite_return(ret);
+ ret = block_page_mkwrite_return(err);
out:
up_read(&EXT4_I(inode)->i_mmap_sem);
sb_end_pagefault(inode->i_sb);
return ret;
}
-int ext4_filemap_fault(struct vm_fault *vmf)
+vm_fault_t ext4_filemap_fault(struct vm_fault *vmf)
{
struct inode *inode = file_inode(vmf->vma->vm_file);
- int err;
+ vm_fault_t ret;
down_read(&EXT4_I(inode)->i_mmap_sem);
- err = filemap_fault(vmf);
+ ret = filemap_fault(vmf);
up_read(&EXT4_I(inode)->i_mmap_sem);
- return err;
+ return ret;
}
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index a7074115d6f6..0edee31913d1 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -67,7 +67,6 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2)
ei1 = EXT4_I(inode1);
ei2 = EXT4_I(inode2);
- swap(inode1->i_flags, inode2->i_flags);
swap(inode1->i_version, inode2->i_version);
swap(inode1->i_blocks, inode2->i_blocks);
swap(inode1->i_bytes, inode2->i_bytes);
@@ -85,6 +84,21 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2)
i_size_write(inode2, isize);
}
+static void reset_inode_seed(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ __le32 inum = cpu_to_le32(inode->i_ino);
+ __le32 gen = cpu_to_le32(inode->i_generation);
+ __u32 csum;
+
+ if (!ext4_has_metadata_csum(inode->i_sb))
+ return;
+
+ csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum));
+ ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, sizeof(gen));
+}
+
/**
* Swap the information from the given @inode and the inode
* EXT4_BOOT_LOADER_INO. It will basically swap i_data and all other
@@ -102,10 +116,13 @@ static long swap_inode_boot_loader(struct super_block *sb,
struct inode *inode_bl;
struct ext4_inode_info *ei_bl;
- if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode))
+ if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode) ||
+ IS_SWAPFILE(inode) || IS_ENCRYPTED(inode) ||
+ ext4_has_inline_data(inode))
return -EINVAL;
- if (!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN))
+ if (IS_RDONLY(inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) ||
+ !inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN))
return -EPERM;
inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO);
@@ -120,13 +137,13 @@ static long swap_inode_boot_loader(struct super_block *sb,
* that only 1 swap_inode_boot_loader is running. */
lock_two_nondirectories(inode, inode_bl);
- truncate_inode_pages(&inode->i_data, 0);
- truncate_inode_pages(&inode_bl->i_data, 0);
-
/* Wait for all existing dio workers */
inode_dio_wait(inode);
inode_dio_wait(inode_bl);
+ truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages(&inode_bl->i_data, 0);
+
handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2);
if (IS_ERR(handle)) {
err = -EINVAL;
@@ -159,6 +176,8 @@ static long swap_inode_boot_loader(struct super_block *sb,
inode->i_generation = prandom_u32();
inode_bl->i_generation = prandom_u32();
+ reset_inode_seed(inode);
+ reset_inode_seed(inode_bl);
ext4_discard_preallocations(inode);
@@ -169,6 +188,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
inode->i_ino, err);
/* Revert all changes: */
swap_inode_data(inode, inode_bl);
+ ext4_mark_inode_dirty(handle, inode);
} else {
err = ext4_mark_inode_dirty(handle, inode_bl);
if (err < 0) {
@@ -178,6 +198,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
/* Revert all changes: */
swap_inode_data(inode, inode_bl);
ext4_mark_inode_dirty(handle, inode);
+ ext4_mark_inode_dirty(handle, inode_bl);
}
}
ext4_journal_stop(handle);
@@ -339,19 +360,14 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid)
if (projid_eq(kprojid, EXT4_I(inode)->i_projid))
return 0;
- err = mnt_want_write_file(filp);
- if (err)
- return err;
-
err = -EPERM;
- inode_lock(inode);
/* Is it quota file? Do not allow user to mess with it */
if (ext4_is_quota_file(inode))
- goto out_unlock;
+ return err;
err = ext4_get_inode_loc(inode, &iloc);
if (err)
- goto out_unlock;
+ return err;
raw_inode = ext4_raw_inode(&iloc);
if (!EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) {
@@ -359,20 +375,20 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid)
EXT4_SB(sb)->s_want_extra_isize,
&iloc);
if (err)
- goto out_unlock;
+ return err;
} else {
brelse(iloc.bh);
}
- dquot_initialize(inode);
+ err = dquot_initialize(inode);
+ if (err)
+ return err;
handle = ext4_journal_start(inode, EXT4_HT_QUOTA,
EXT4_QUOTA_INIT_BLOCKS(sb) +
EXT4_QUOTA_DEL_BLOCKS(sb) + 3);
- if (IS_ERR(handle)) {
- err = PTR_ERR(handle);
- goto out_unlock;
- }
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
err = ext4_reserve_inode_write(handle, inode, &iloc);
if (err)
@@ -400,9 +416,6 @@ out_dirty:
err = rc;
out_stop:
ext4_journal_stop(handle);
-out_unlock:
- inode_unlock(inode);
- mnt_drop_write_file(filp);
return err;
}
#else
@@ -626,6 +639,30 @@ group_add_out:
return err;
}
+static int ext4_ioctl_check_project(struct inode *inode, struct fsxattr *fa)
+{
+ /*
+ * Project Quota ID state is only allowed to change from within the init
+ * namespace. Enforce that restriction only if we are trying to change
+ * the quota ID state. Everything else is allowed in user namespaces.
+ */
+ if (current_user_ns() == &init_user_ns)
+ return 0;
+
+ if (__kprojid_val(EXT4_I(inode)->i_projid) != fa->fsx_projid)
+ return -EINVAL;
+
+ if (ext4_test_inode_flag(inode, EXT4_INODE_PROJINHERIT)) {
+ if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT))
+ return -EINVAL;
+ } else {
+ if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT)
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
struct inode *inode = file_inode(filp);
@@ -1025,19 +1062,19 @@ resizefs_out:
return err;
inode_lock(inode);
+ err = ext4_ioctl_check_project(inode, &fa);
+ if (err)
+ goto out;
flags = (ei->i_flags & ~EXT4_FL_XFLAG_VISIBLE) |
(flags & EXT4_FL_XFLAG_VISIBLE);
err = ext4_ioctl_setflags(inode, flags);
- inode_unlock(inode);
- mnt_drop_write_file(filp);
if (err)
- return err;
-
+ goto out;
err = ext4_ioctl_setproject(filp, fa.fsx_projid);
- if (err)
- return err;
-
- return 0;
+out:
+ inode_unlock(inode);
+ mnt_drop_write_file(filp);
+ return err;
}
case EXT4_IOC_SHUTDOWN:
return ext4_shutdown(sb, arg);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index e29fce2fbf25..e2248083cdca 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4915,9 +4915,17 @@ do_more:
&sbi->s_flex_groups[flex_group].free_clusters);
}
- if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
- dquot_free_block(inode, EXT4_C2B(sbi, count_clusters));
- percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters);
+ /*
+ * on a bigalloc file system, defer the s_freeclusters_counter
+ * update to the caller (ext4_remove_space and friends) so they
+ * can determine if a cluster freed here should be rereserved
+ */
+ if (!(flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)) {
+ if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
+ dquot_free_block(inode, EXT4_C2B(sbi, count_clusters));
+ percpu_counter_add(&sbi->s_freeclusters_counter,
+ count_clusters);
+ }
ext4_mb_unload_buddy(&e4b);
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index a409ff70d67b..2f5be02fc6f6 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -516,9 +516,13 @@ mext_check_arguments(struct inode *orig_inode,
orig_inode->i_ino, donor_inode->i_ino);
return -EINVAL;
}
- if (orig_eof < orig_start + *len - 1)
+ if (orig_eof <= orig_start)
+ *len = 0;
+ else if (orig_eof < orig_start + *len - 1)
*len = orig_eof - orig_start;
- if (donor_eof < donor_start + *len - 1)
+ if (donor_eof <= donor_start)
+ *len = 0;
+ else if (donor_eof < donor_start + *len - 1)
*len = donor_eof - donor_start;
if (!*len) {
ext4_debug("ext4 move extent: len should not be 0 "
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 377d516c475f..67a38532032a 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2261,7 +2261,7 @@ again:
dxroot->info.indirect_levels += 1;
dxtrace(printk(KERN_DEBUG
"Creating %d level index...\n",
- info->indirect_levels));
+ dxroot->info.indirect_levels));
err = ext4_handle_dirty_dx_node(handle, dir, frame->bh);
if (err)
goto journal_error;
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index db7590178dfc..2aa62d58d8dd 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -374,13 +374,13 @@ static int io_submit_init_bio(struct ext4_io_submit *io,
bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);
if (!bio)
return -ENOMEM;
- wbc_init_bio(io->io_wbc, bio);
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio_set_dev(bio, bh->b_bdev);
bio->bi_end_io = ext4_end_bio;
bio->bi_private = ext4_get_io_end(io->io_end);
io->io_bio = bio;
io->io_next_block = bh->b_blocknr;
+ wbc_init_bio(io->io_wbc, bio);
return 0;
}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 1145109968ef..a221f1cdf704 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -914,6 +914,18 @@ static inline void ext4_quota_off_umount(struct super_block *sb)
for (type = 0; type < EXT4_MAXQUOTAS; type++)
ext4_quota_off(sb, type);
}
+
+/*
+ * This is a helper function which is used in the mount/remount
+ * codepaths (which holds s_umount) to fetch the quota file name.
+ */
+static inline char *get_qf_name(struct super_block *sb,
+ struct ext4_sb_info *sbi,
+ int type)
+{
+ return rcu_dereference_protected(sbi->s_qf_names[type],
+ lockdep_is_held(&sb->s_umount));
+}
#else
static inline void ext4_quota_off_umount(struct super_block *sb)
{
@@ -965,7 +977,7 @@ static void ext4_put_super(struct super_block *sb)
percpu_free_rwsem(&sbi->s_journal_flag_rwsem);
#ifdef CONFIG_QUOTA
for (i = 0; i < EXT4_MAXQUOTAS; i++)
- kfree(sbi->s_qf_names[i]);
+ kfree(get_qf_name(sb, sbi, i));
#endif
/* Debugging code just in case the in-memory inode orphan list
@@ -1040,6 +1052,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
ei->i_da_metadata_calc_len = 0;
ei->i_da_metadata_calc_last_lblock = 0;
spin_lock_init(&(ei->i_block_reservation_lock));
+ ext4_init_pending_tree(&ei->i_pending_tree);
#ifdef CONFIG_QUOTA
ei->i_reserved_quota = 0;
memset(&ei->i_dquot, 0, sizeof(ei->i_dquot));
@@ -1530,11 +1543,10 @@ static const char deprecated_msg[] =
static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
- char *qname;
+ char *qname, *old_qname = get_qf_name(sb, sbi, qtype);
int ret = -1;
- if (sb_any_quota_loaded(sb) &&
- !sbi->s_qf_names[qtype]) {
+ if (sb_any_quota_loaded(sb) && !old_qname) {
ext4_msg(sb, KERN_ERR,
"Cannot change journaled "
"quota options when quota turned on");
@@ -1551,8 +1563,8 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
"Not enough memory for storing quotafile name");
return -1;
}
- if (sbi->s_qf_names[qtype]) {
- if (strcmp(sbi->s_qf_names[qtype], qname) == 0)
+ if (old_qname) {
+ if (strcmp(old_qname, qname) == 0)
ret = 1;
else
ext4_msg(sb, KERN_ERR,
@@ -1565,7 +1577,7 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
"quotafile must be on filesystem root");
goto errout;
}
- sbi->s_qf_names[qtype] = qname;
+ rcu_assign_pointer(sbi->s_qf_names[qtype], qname);
set_opt(sb, QUOTA);
return 1;
errout:
@@ -1577,15 +1589,16 @@ static int clear_qf_name(struct super_block *sb, int qtype)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
+ char *old_qname = get_qf_name(sb, sbi, qtype);
- if (sb_any_quota_loaded(sb) &&
- sbi->s_qf_names[qtype]) {
+ if (sb_any_quota_loaded(sb) && old_qname) {
ext4_msg(sb, KERN_ERR, "Cannot change journaled quota options"
" when quota turned on");
return -1;
}
- kfree(sbi->s_qf_names[qtype]);
- sbi->s_qf_names[qtype] = NULL;
+ rcu_assign_pointer(sbi->s_qf_names[qtype], NULL);
+ synchronize_rcu();
+ kfree(old_qname);
return 1;
}
#endif
@@ -1960,7 +1973,7 @@ static int parse_options(char *options, struct super_block *sb,
int is_remount)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
- char *p;
+ char *p, __maybe_unused *usr_qf_name, __maybe_unused *grp_qf_name;
substring_t args[MAX_OPT_ARGS];
int token;
@@ -1991,11 +2004,13 @@ static int parse_options(char *options, struct super_block *sb,
"Cannot enable project quota enforcement.");
return 0;
}
- if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
- if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA])
+ usr_qf_name = get_qf_name(sb, sbi, USRQUOTA);
+ grp_qf_name = get_qf_name(sb, sbi, GRPQUOTA);
+ if (usr_qf_name || grp_qf_name) {
+ if (test_opt(sb, USRQUOTA) && usr_qf_name)
clear_opt(sb, USRQUOTA);
- if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA])
+ if (test_opt(sb, GRPQUOTA) && grp_qf_name)
clear_opt(sb, GRPQUOTA);
if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) {
@@ -2029,6 +2044,7 @@ static inline void ext4_show_quota_options(struct seq_file *seq,
{
#if defined(CONFIG_QUOTA)
struct ext4_sb_info *sbi = EXT4_SB(sb);
+ char *usr_qf_name, *grp_qf_name;
if (sbi->s_jquota_fmt) {
char *fmtname = "";
@@ -2047,11 +2063,14 @@ static inline void ext4_show_quota_options(struct seq_file *seq,
seq_printf(seq, ",jqfmt=%s", fmtname);
}
- if (sbi->s_qf_names[USRQUOTA])
- seq_show_option(seq, "usrjquota", sbi->s_qf_names[USRQUOTA]);
-
- if (sbi->s_qf_names[GRPQUOTA])
- seq_show_option(seq, "grpjquota", sbi->s_qf_names[GRPQUOTA]);
+ rcu_read_lock();
+ usr_qf_name = rcu_dereference(sbi->s_qf_names[USRQUOTA]);
+ grp_qf_name = rcu_dereference(sbi->s_qf_names[GRPQUOTA]);
+ if (usr_qf_name)
+ seq_show_option(seq, "usrjquota", usr_qf_name);
+ if (grp_qf_name)
+ seq_show_option(seq, "grpjquota", grp_qf_name);
+ rcu_read_unlock();
#endif
}
@@ -5103,6 +5122,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
int err = 0;
#ifdef CONFIG_QUOTA
int i, j;
+ char *to_free[EXT4_MAXQUOTAS];
#endif
char *orig_data = kstrdup(data, GFP_KERNEL);
@@ -5122,8 +5142,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
for (i = 0; i < EXT4_MAXQUOTAS; i++)
if (sbi->s_qf_names[i]) {
- old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i],
- GFP_KERNEL);
+ char *qf_name = get_qf_name(sb, sbi, i);
+
+ old_opts.s_qf_names[i] = kstrdup(qf_name, GFP_KERNEL);
if (!old_opts.s_qf_names[i]) {
for (j = 0; j < i; j++)
kfree(old_opts.s_qf_names[j]);
@@ -5352,9 +5373,12 @@ restore_opts:
#ifdef CONFIG_QUOTA
sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
for (i = 0; i < EXT4_MAXQUOTAS; i++) {
- kfree(sbi->s_qf_names[i]);
- sbi->s_qf_names[i] = old_opts.s_qf_names[i];
+ to_free[i] = get_qf_name(sb, sbi, i);
+ rcu_assign_pointer(sbi->s_qf_names[i], old_opts.s_qf_names[i]);
}
+ synchronize_rcu();
+ for (i = 0; i < EXT4_MAXQUOTAS; i++)
+ kfree(to_free[i]);
#endif
kfree(orig_data);
return err;
@@ -5545,7 +5569,7 @@ static int ext4_write_info(struct super_block *sb, int type)
*/
static int ext4_quota_on_mount(struct super_block *sb, int type)
{
- return dquot_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type],
+ return dquot_quota_on_mount(sb, get_qf_name(sb, EXT4_SB(sb), type),
EXT4_SB(sb)->s_jquota_fmt, type);
}
@@ -5954,6 +5978,10 @@ static int __init ext4_init_fs(void)
if (err)
return err;
+ err = ext4_init_pending();
+ if (err)
+ goto out6;
+
err = ext4_init_pageio();
if (err)
goto out5;
@@ -5992,6 +6020,8 @@ out3:
out4:
ext4_exit_pageio();
out5:
+ ext4_exit_pending();
+out6:
ext4_exit_es();
return err;
@@ -6009,6 +6039,7 @@ static void __exit ext4_exit_fs(void)
ext4_exit_system_zone();
ext4_exit_pageio();
ext4_exit_es();
+ ext4_exit_pending();
}
MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index 111824199a88..fa707cdd4120 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* fs/f2fs/acl.c
*
@@ -7,10 +8,6 @@
* Portions of this code from linux/fs/ext2/acl.c
*
* Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/f2fs_fs.h>
#include "f2fs.h"
@@ -53,6 +50,9 @@ static struct posix_acl *f2fs_acl_from_disk(const char *value, size_t size)
struct f2fs_acl_entry *entry = (struct f2fs_acl_entry *)(hdr + 1);
const char *end = value + size;
+ if (size < sizeof(struct f2fs_acl_header))
+ return ERR_PTR(-EINVAL);
+
if (hdr->a_version != cpu_to_le32(F2FS_ACL_VERSION))
return ERR_PTR(-EINVAL);
@@ -394,12 +394,16 @@ int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage,
error = __f2fs_set_acl(inode, ACL_TYPE_DEFAULT, default_acl,
ipage);
posix_acl_release(default_acl);
+ } else {
+ inode->i_default_acl = NULL;
}
if (acl) {
if (!error)
error = __f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl,
ipage);
posix_acl_release(acl);
+ } else {
+ inode->i_acl = NULL;
}
return error;
diff --git a/fs/f2fs/acl.h b/fs/f2fs/acl.h
index 2c685185c24d..b96823c59b15 100644
--- a/fs/f2fs/acl.h
+++ b/fs/f2fs/acl.h
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* fs/f2fs/acl.h
*
@@ -7,10 +8,6 @@
* Portions of this code from linux/fs/ext2/acl.h
*
* Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#ifndef __F2FS_ACL_H__
#define __F2FS_ACL_H__
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index e8b6b89bddb8..9c28ea439e0b 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* fs/f2fs/checkpoint.c
*
* Copyright (c) 2012 Samsung Electronics Co., Ltd.
* http://www.samsung.com/
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/bio.h>
@@ -122,11 +119,8 @@ retry:
if (PTR_ERR(page) == -EIO &&
++count <= DEFAULT_RETRY_IO_COUNT)
goto retry;
-
f2fs_stop_checkpoint(sbi, false);
- f2fs_bug_on(sbi, 1);
}
-
return page;
}
@@ -282,8 +276,7 @@ static int __f2fs_write_meta_page(struct page *page,
dec_page_count(sbi, F2FS_DIRTY_META);
if (wbc->for_reclaim)
- f2fs_submit_merged_write_cond(sbi, page->mapping->host,
- 0, page->index, META);
+ f2fs_submit_merged_write_cond(sbi, NULL, page, 0, META);
unlock_page(page);
@@ -696,6 +689,8 @@ int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi)
/* clear Orphan Flag */
clear_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG);
out:
+ set_sbi_flag(sbi, SBI_IS_RECOVERED);
+
#ifdef CONFIG_QUOTA
/* Turn quotas off */
if (quota_enabled)
@@ -1084,6 +1079,21 @@ static void __prepare_cp_block(struct f2fs_sb_info *sbi)
ckpt->next_free_nid = cpu_to_le32(last_nid);
}
+static bool __need_flush_quota(struct f2fs_sb_info *sbi)
+{
+ if (!is_journalled_quota(sbi))
+ return false;
+ if (is_sbi_flag_set(sbi, SBI_QUOTA_SKIP_FLUSH))
+ return false;
+ if (is_sbi_flag_set(sbi, SBI_QUOTA_NEED_REPAIR))
+ return false;
+ if (is_sbi_flag_set(sbi, SBI_QUOTA_NEED_FLUSH))
+ return true;
+ if (get_pages(sbi, F2FS_DIRTY_QDATA))
+ return true;
+ return false;
+}
+
/*
* Freeze all the FS-operations for checkpoint.
*/
@@ -1095,12 +1105,36 @@ static int block_operations(struct f2fs_sb_info *sbi)
.for_reclaim = 0,
};
struct blk_plug plug;
- int err = 0;
+ int err = 0, cnt = 0;
blk_start_plug(&plug);
-retry_flush_dents:
+retry_flush_quotas:
+ if (__need_flush_quota(sbi)) {
+ int locked;
+
+ if (++cnt > DEFAULT_RETRY_QUOTA_FLUSH_COUNT) {
+ set_sbi_flag(sbi, SBI_QUOTA_SKIP_FLUSH);
+ f2fs_lock_all(sbi);
+ goto retry_flush_dents;
+ }
+ clear_sbi_flag(sbi, SBI_QUOTA_NEED_FLUSH);
+
+ /* only failed during mount/umount/freeze/quotactl */
+ locked = down_read_trylock(&sbi->sb->s_umount);
+ f2fs_quota_sync(sbi->sb, -1);
+ if (locked)
+ up_read(&sbi->sb->s_umount);
+ }
+
f2fs_lock_all(sbi);
+ if (__need_flush_quota(sbi)) {
+ f2fs_unlock_all(sbi);
+ cond_resched();
+ goto retry_flush_quotas;
+ }
+
+retry_flush_dents:
/* write all the dirty dentry pages */
if (get_pages(sbi, F2FS_DIRTY_DENTS)) {
f2fs_unlock_all(sbi);
@@ -1108,7 +1142,7 @@ retry_flush_dents:
if (err)
goto out;
cond_resched();
- goto retry_flush_dents;
+ goto retry_flush_quotas;
}
/*
@@ -1117,6 +1151,12 @@ retry_flush_dents:
*/
down_write(&sbi->node_change);
+ if (__need_flush_quota(sbi)) {
+ up_write(&sbi->node_change);
+ f2fs_unlock_all(sbi);
+ goto retry_flush_quotas;
+ }
+
if (get_pages(sbi, F2FS_DIRTY_IMETA)) {
up_write(&sbi->node_change);
f2fs_unlock_all(sbi);
@@ -1124,7 +1164,7 @@ retry_flush_dents:
if (err)
goto out;
cond_resched();
- goto retry_flush_dents;
+ goto retry_flush_quotas;
}
retry_flush_nodes:
@@ -1215,6 +1255,19 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc)
if (is_sbi_flag_set(sbi, SBI_NEED_FSCK))
__set_ckpt_flags(ckpt, CP_FSCK_FLAG);
+ if (is_sbi_flag_set(sbi, SBI_CP_DISABLED))
+ __set_ckpt_flags(ckpt, CP_DISABLED_FLAG);
+ else
+ __clear_ckpt_flags(ckpt, CP_DISABLED_FLAG);
+
+ if (is_sbi_flag_set(sbi, SBI_QUOTA_SKIP_FLUSH))
+ __set_ckpt_flags(ckpt, CP_QUOTA_NEED_FSCK_FLAG);
+ else
+ __clear_ckpt_flags(ckpt, CP_QUOTA_NEED_FSCK_FLAG);
+
+ if (is_sbi_flag_set(sbi, SBI_QUOTA_NEED_REPAIR))
+ __set_ckpt_flags(ckpt, CP_QUOTA_NEED_FSCK_FLAG);
+
/* set this flag to activate crc|cp_ver for recovery */
__set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG);
__clear_ckpt_flags(ckpt, CP_NOCRC_RECOVERY_FLAG);
@@ -1422,6 +1475,8 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
clear_sbi_flag(sbi, SBI_IS_DIRTY);
clear_sbi_flag(sbi, SBI_NEED_CP);
+ clear_sbi_flag(sbi, SBI_QUOTA_SKIP_FLUSH);
+ sbi->unusable_block_count = 0;
__set_cp_next_pack(sbi);
/*
@@ -1446,6 +1501,12 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
unsigned long long ckpt_ver;
int err = 0;
+ if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) {
+ if (cpc->reason != CP_PAUSE)
+ return 0;
+ f2fs_msg(sbi->sb, KERN_WARNING,
+ "Start checkpoint disabled!");
+ }
mutex_lock(&sbi->cp_mutex);
if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) &&
@@ -1497,7 +1558,10 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
ckpt->checkpoint_ver = cpu_to_le64(++ckpt_ver);
/* write cached NAT/SIT entries to NAT/SIT area */
- f2fs_flush_nat_entries(sbi, cpc);
+ err = f2fs_flush_nat_entries(sbi, cpc);
+ if (err)
+ goto stop;
+
f2fs_flush_sit_entries(sbi, cpc);
/* unlock all the fs_lock[] in do_checkpoint() */
@@ -1506,7 +1570,7 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
f2fs_release_discard_addrs(sbi);
else
f2fs_clear_prefree_segments(sbi, cpc);
-
+stop:
unblock_operations(sbi);
stat_inc_cp_count(sbi->stat_info);
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 382c1ef9a9e4..b293cb3e27a2 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* fs/f2fs/data.c
*
* Copyright (c) 2012 Samsung Electronics Co., Ltd.
* http://www.samsung.com/
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/f2fs_fs.h>
@@ -49,12 +46,29 @@ static bool __is_cp_guaranteed(struct page *page)
inode->i_ino == F2FS_NODE_INO(sbi) ||
S_ISDIR(inode->i_mode) ||
(S_ISREG(inode->i_mode) &&
- is_inode_flag_set(inode, FI_ATOMIC_FILE)) ||
+ (f2fs_is_atomic_file(inode) || IS_NOQUOTA(inode))) ||
is_cold_data(page))
return true;
return false;
}
+static enum count_type __read_io_type(struct page *page)
+{
+ struct address_space *mapping = page->mapping;
+
+ if (mapping) {
+ struct inode *inode = mapping->host;
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+
+ if (inode->i_ino == F2FS_META_INO(sbi))
+ return F2FS_RD_META;
+
+ if (inode->i_ino == F2FS_NODE_INO(sbi))
+ return F2FS_RD_NODE;
+ }
+ return F2FS_RD_DATA;
+}
+
/* postprocessing steps for read bios */
enum bio_post_read_step {
STEP_INITIAL = 0,
@@ -80,10 +94,12 @@ static void __read_end_io(struct bio *bio)
/* PG_error was set if any post_read step failed */
if (bio->bi_status || PageError(page)) {
ClearPageUptodate(page);
- SetPageError(page);
+ /* will re-read again later */
+ ClearPageError(page);
} else {
SetPageUptodate(page);
}
+ dec_page_count(F2FS_P_SB(page), __read_io_type(page));
unlock_page(page);
}
if (bio->bi_private)
@@ -126,8 +142,9 @@ static bool f2fs_bio_post_read_required(struct bio *bio)
static void f2fs_read_end_io(struct bio *bio)
{
- if (time_to_inject(F2FS_P_SB(bio_first_page_all(bio)), FAULT_IO)) {
- f2fs_show_injection_info(FAULT_IO);
+ if (time_to_inject(F2FS_P_SB(bio_first_page_all(bio)),
+ FAULT_READ_IO)) {
+ f2fs_show_injection_info(FAULT_READ_IO);
bio->bi_status = BLK_STS_IOERR;
}
@@ -148,6 +165,11 @@ static void f2fs_write_end_io(struct bio *bio)
struct bio_vec *bvec;
int i;
+ if (time_to_inject(sbi, FAULT_WRITE_IO)) {
+ f2fs_show_injection_info(FAULT_WRITE_IO);
+ bio->bi_status = BLK_STS_IOERR;
+ }
+
bio_for_each_segment_all(bvec, bio, i) {
struct page *page = bvec->bv_page;
enum count_type type = WB_DATA_TYPE(page);
@@ -319,8 +341,8 @@ static void __submit_merged_bio(struct f2fs_bio_info *io)
io->bio = NULL;
}
-static bool __has_merged_page(struct f2fs_bio_info *io,
- struct inode *inode, nid_t ino, pgoff_t idx)
+static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode,
+ struct page *page, nid_t ino)
{
struct bio_vec *bvec;
struct page *target;
@@ -329,7 +351,7 @@ static bool __has_merged_page(struct f2fs_bio_info *io,
if (!io->bio)
return false;
- if (!inode && !ino)
+ if (!inode && !page && !ino)
return true;
bio_for_each_segment_all(bvec, io->bio, i) {
@@ -339,11 +361,10 @@ static bool __has_merged_page(struct f2fs_bio_info *io,
else
target = fscrypt_control_page(bvec->bv_page);
- if (idx != target->index)
- continue;
-
if (inode && inode == target->mapping->host)
return true;
+ if (page && page == target)
+ return true;
if (ino && ino == ino_of_node(target))
return true;
}
@@ -352,7 +373,8 @@ static bool __has_merged_page(struct f2fs_bio_info *io,
}
static bool has_merged_page(struct f2fs_sb_info *sbi, struct inode *inode,
- nid_t ino, pgoff_t idx, enum page_type type)
+ struct page *page, nid_t ino,
+ enum page_type type)
{
enum page_type btype = PAGE_TYPE_OF_BIO(type);
enum temp_type temp;
@@ -363,7 +385,7 @@ static bool has_merged_page(struct f2fs_sb_info *sbi, struct inode *inode,
io = sbi->write_io[btype] + temp;
down_read(&io->io_rwsem);
- ret = __has_merged_page(io, inode, ino, idx);
+ ret = __has_merged_page(io, inode, page, ino);
up_read(&io->io_rwsem);
/* TODO: use HOT temp only for meta pages now. */
@@ -394,12 +416,12 @@ static void __f2fs_submit_merged_write(struct f2fs_sb_info *sbi,
}
static void __submit_merged_write_cond(struct f2fs_sb_info *sbi,
- struct inode *inode, nid_t ino, pgoff_t idx,
- enum page_type type, bool force)
+ struct inode *inode, struct page *page,
+ nid_t ino, enum page_type type, bool force)
{
enum temp_type temp;
- if (!force && !has_merged_page(sbi, inode, ino, idx, type))
+ if (!force && !has_merged_page(sbi, inode, page, ino, type))
return;
for (temp = HOT; temp < NR_TEMP_TYPE; temp++) {
@@ -418,10 +440,10 @@ void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type)
}
void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi,
- struct inode *inode, nid_t ino, pgoff_t idx,
- enum page_type type)
+ struct inode *inode, struct page *page,
+ nid_t ino, enum page_type type)
{
- __submit_merged_write_cond(sbi, inode, ino, idx, type, false);
+ __submit_merged_write_cond(sbi, inode, page, ino, type, false);
}
void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi)
@@ -456,12 +478,16 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio)
bio_put(bio);
return -EFAULT;
}
+
+ if (fio->io_wbc && !is_read_io(fio->op))
+ wbc_account_io(fio->io_wbc, page, PAGE_SIZE);
+
bio_set_op_attrs(bio, fio->op, fio->op_flags);
- __submit_bio(fio->sbi, bio, fio->type);
+ inc_page_count(fio->sbi, is_read_io(fio->op) ?
+ __read_io_type(page): WB_DATA_TYPE(fio->page));
- if (!is_read_io(fio->op))
- inc_page_count(fio->sbi, WB_DATA_TYPE(fio->page));
+ __submit_bio(fio->sbi, bio, fio->type);
return 0;
}
@@ -533,6 +559,9 @@ skip:
if (fio->in_list)
goto next;
out:
+ if (is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN) ||
+ f2fs_is_checkpoint_ready(sbi))
+ __submit_merged_bio(io);
up_write(&io->io_rwsem);
}
@@ -565,9 +594,6 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
ctx->bio = bio;
ctx->enabled_steps = post_read_steps;
bio->bi_private = ctx;
-
- /* wait the page to be moved by cleaning */
- f2fs_wait_on_block_writeback(sbi, blkaddr);
}
return bio;
@@ -582,10 +608,15 @@ static int f2fs_submit_page_read(struct inode *inode, struct page *page,
if (IS_ERR(bio))
return PTR_ERR(bio);
+ /* wait for GCed page writeback via META_MAPPING */
+ f2fs_wait_on_block_writeback(inode, blkaddr);
+
if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
bio_put(bio);
return -EFAULT;
}
+ ClearPageError(page);
+ inc_page_count(F2FS_I_SB(inode), F2FS_RD_DATA);
__submit_bio(F2FS_I_SB(inode), bio, DATA);
return 0;
}
@@ -876,7 +907,6 @@ static int __allocate_data_block(struct dnode_of_data *dn, int seg_type)
struct f2fs_summary sum;
struct node_info ni;
block_t old_blkaddr;
- pgoff_t fofs;
blkcnt_t count = 1;
int err;
@@ -889,7 +919,7 @@ static int __allocate_data_block(struct dnode_of_data *dn, int seg_type)
dn->data_blkaddr = datablock_addr(dn->inode,
dn->node_page, dn->ofs_in_node);
- if (dn->data_blkaddr == NEW_ADDR)
+ if (dn->data_blkaddr != NULL_ADDR)
goto alloc;
if (unlikely((err = inc_valid_block_count(sbi, dn->inode, &count))))
@@ -905,12 +935,10 @@ alloc:
old_blkaddr, old_blkaddr);
f2fs_set_data_blkaddr(dn);
- /* update i_size */
- fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) +
- dn->ofs_in_node;
- if (i_size_read(dn->inode) < ((loff_t)(fofs + 1) << PAGE_SHIFT))
- f2fs_i_size_write(dn->inode,
- ((loff_t)(fofs + 1) << PAGE_SHIFT));
+ /*
+ * i_size will be updated by direct_IO. Otherwise, we'll get stale
+ * data from unwritten block via dio_read.
+ */
return 0;
}
@@ -945,7 +973,7 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from)
if (direct_io) {
map.m_seg_type = f2fs_rw_hint_to_seg_type(iocb->ki_hint);
- flag = f2fs_force_buffered_io(inode, WRITE) ?
+ flag = f2fs_force_buffered_io(inode, iocb, from) ?
F2FS_GET_BLOCK_PRE_AIO :
F2FS_GET_BLOCK_PRE_DIO;
goto map_blocks;
@@ -970,7 +998,7 @@ map_blocks:
return err;
}
-static inline void __do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock)
+void __do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock)
{
if (flag == F2FS_GET_BLOCK_PRE_AIO) {
if (lock)
@@ -1025,6 +1053,11 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
map->m_flags = F2FS_MAP_MAPPED;
if (map->m_next_extent)
*map->m_next_extent = pgofs + map->m_len;
+
+ /* for hardware encryption, but to avoid potential issue in future */
+ if (flag == F2FS_GET_BLOCK_DIO)
+ f2fs_wait_on_block_writeback_range(inode,
+ map->m_pblk, map->m_len);
goto out;
}
@@ -1064,7 +1097,15 @@ next_block:
goto sync_out;
}
- if (!is_valid_data_blkaddr(sbi, blkaddr)) {
+ if (is_valid_data_blkaddr(sbi, blkaddr)) {
+ /* use out-place-update for driect IO under LFS mode */
+ if (test_opt(sbi, LFS) && create &&
+ flag == F2FS_GET_BLOCK_DIO) {
+ err = __allocate_data_block(&dn, map->m_seg_type);
+ if (!err)
+ set_inode_flag(inode, FI_APPEND_WRITE);
+ }
+ } else {
if (create) {
if (unlikely(f2fs_cp_error(sbi))) {
err = -EIO;
@@ -1076,6 +1117,8 @@ next_block:
last_ofs_in_node = dn.ofs_in_node;
}
} else {
+ WARN_ON(flag != F2FS_GET_BLOCK_PRE_DIO &&
+ flag != F2FS_GET_BLOCK_DIO);
err = __allocate_data_block(&dn,
map->m_seg_type);
if (!err)
@@ -1173,6 +1216,12 @@ skip:
goto next_dnode;
sync_out:
+
+ /* for hardware encryption, but to avoid potential issue in future */
+ if (flag == F2FS_GET_BLOCK_DIO && map->m_flags & F2FS_MAP_MAPPED)
+ f2fs_wait_on_block_writeback_range(inode,
+ map->m_pblk, map->m_len);
+
if (flag == F2FS_GET_BLOCK_PRECACHE) {
if (map->m_flags & F2FS_MAP_MAPPED) {
unsigned int ofs = start_pgofs - map->m_lblk;
@@ -1255,7 +1304,7 @@ static int get_data_block_dio(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
return __get_data_block(inode, iblock, bh_result, create,
- F2FS_GET_BLOCK_DEFAULT, NULL,
+ F2FS_GET_BLOCK_DIO, NULL,
f2fs_rw_hint_to_seg_type(
inode->i_write_hint));
}
@@ -1558,9 +1607,17 @@ submit_and_realloc:
}
}
+ /*
+ * If the page is under writeback, we need to wait for
+ * its completion to see the correct decrypted data.
+ */
+ f2fs_wait_on_block_writeback(inode, block_nr);
+
if (bio_add_page(bio, page, blocksize, 0) < blocksize)
goto submit_and_realloc;
+ inc_page_count(F2FS_I_SB(inode), F2FS_RD_DATA);
+ ClearPageError(page);
last_block_in_bio = block_nr;
goto next_page;
set_error_page:
@@ -1625,7 +1682,7 @@ static int encrypt_one_page(struct f2fs_io_info *fio)
return 0;
/* wait for GCed page writeback via META_MAPPING */
- f2fs_wait_on_block_writeback(fio->sbi, fio->old_blkaddr);
+ f2fs_wait_on_block_writeback(inode, fio->old_blkaddr);
retry_encrypt:
fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page,
@@ -1682,6 +1739,10 @@ static inline bool check_inplace_update_policy(struct inode *inode,
is_inode_flag_set(inode, FI_NEED_IPU))
return true;
+ if (unlikely(fio && is_sbi_flag_set(sbi, SBI_CP_DISABLED) &&
+ !f2fs_is_checkpointed_data(sbi, fio->old_blkaddr)))
+ return true;
+
return false;
}
@@ -1705,6 +1766,8 @@ bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio)
return true;
if (S_ISDIR(inode->i_mode))
return true;
+ if (IS_NOQUOTA(inode))
+ return true;
if (f2fs_is_atomic_file(inode))
return true;
if (fio) {
@@ -1712,6 +1775,9 @@ bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio)
return true;
if (IS_ATOMIC_WRITTEN_PAGE(fio->page))
return true;
+ if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED) &&
+ f2fs_is_checkpointed_data(sbi, fio->old_blkaddr)))
+ return true;
}
return false;
}
@@ -1763,6 +1829,7 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio)
/* This page is already truncated */
if (fio->old_blkaddr == NULL_ADDR) {
ClearPageUptodate(page);
+ clear_cold_data(page);
goto out_writepage;
}
got_it:
@@ -1938,18 +2005,20 @@ done:
out:
inode_dec_dirty_pages(inode);
- if (err)
+ if (err) {
ClearPageUptodate(page);
+ clear_cold_data(page);
+ }
if (wbc->for_reclaim) {
- f2fs_submit_merged_write_cond(sbi, inode, 0, page->index, DATA);
+ f2fs_submit_merged_write_cond(sbi, NULL, page, 0, DATA);
clear_inode_flag(inode, FI_HOT_DATA);
f2fs_remove_dirty_inode(inode);
submitted = NULL;
}
unlock_page(page);
- if (!S_ISDIR(inode->i_mode))
+ if (!S_ISDIR(inode->i_mode) && !IS_NOQUOTA(inode))
f2fs_balance_fs(sbi, need_balance_fs);
if (unlikely(f2fs_cp_error(sbi))) {
@@ -2000,10 +2069,10 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
pgoff_t index;
pgoff_t end; /* Inclusive */
pgoff_t done_index;
- pgoff_t last_idx = ULONG_MAX;
int cycled;
int range_whole = 0;
- int tag;
+ xa_mark_t tag;
+ int nwritten = 0;
pagevec_init(&pvec);
@@ -2106,7 +2175,7 @@ continue_unlock:
done = 1;
break;
} else if (submitted) {
- last_idx = page->index;
+ nwritten++;
}
if (--wbc->nr_to_write <= 0 &&
@@ -2128,9 +2197,9 @@ continue_unlock:
if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
mapping->writeback_index = done_index;
- if (last_idx != ULONG_MAX)
+ if (nwritten)
f2fs_submit_merged_write_cond(F2FS_M_SB(mapping), mapping->host,
- 0, last_idx, DATA);
+ NULL, 0, DATA);
return ret;
}
@@ -2140,6 +2209,8 @@ static inline bool __should_serialize_io(struct inode *inode,
{
if (!S_ISREG(inode->i_mode))
return false;
+ if (IS_NOQUOTA(inode))
+ return false;
if (wbc->sync_mode != WB_SYNC_ALL)
return true;
if (get_dirty_pages(inode) >= SM_I(F2FS_I_SB(inode))->min_seq_blocks)
@@ -2169,7 +2240,8 @@ static int __f2fs_write_data_pages(struct address_space *mapping,
if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
goto skip_write;
- if (S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_NONE &&
+ if ((S_ISDIR(inode->i_mode) || IS_NOQUOTA(inode)) &&
+ wbc->sync_mode == WB_SYNC_NONE &&
get_dirty_pages(inode) < nr_pages_to_skip(sbi, DATA) &&
f2fs_available_free_memory(sbi, DIRTY_DENTS))
goto skip_write;
@@ -2234,7 +2306,7 @@ static void f2fs_write_failed(struct address_space *mapping, loff_t to)
down_write(&F2FS_I(inode)->i_mmap_sem);
truncate_pagecache(inode, i_size);
- f2fs_truncate_blocks(inode, i_size, true);
+ f2fs_truncate_blocks(inode, i_size, true, true);
up_write(&F2FS_I(inode)->i_mmap_sem);
up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
@@ -2332,6 +2404,10 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
trace_f2fs_write_begin(inode, pos, len, flags);
+ err = f2fs_is_checkpoint_ready(sbi);
+ if (err)
+ goto fail;
+
if ((f2fs_is_atomic_file(inode) &&
!f2fs_available_free_memory(sbi, INMEM_PAGES)) ||
is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) {
@@ -2369,7 +2445,8 @@ repeat:
if (err)
goto fail;
- if (need_balance && has_not_enough_free_secs(sbi, 0, 0)) {
+ if (need_balance && !IS_NOQUOTA(inode) &&
+ has_not_enough_free_secs(sbi, 0, 0)) {
unlock_page(page);
f2fs_balance_fs(sbi, true);
lock_page(page);
@@ -2382,10 +2459,6 @@ repeat:
f2fs_wait_on_page_writeback(page, DATA, false);
- /* wait for GCed page writeback via META_MAPPING */
- if (f2fs_post_read_required(inode))
- f2fs_wait_on_block_writeback(sbi, blkaddr);
-
if (len == PAGE_SIZE || PageUptodate(page))
return 0;
@@ -2480,36 +2553,53 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
struct address_space *mapping = iocb->ki_filp->f_mapping;
struct inode *inode = mapping->host;
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct f2fs_inode_info *fi = F2FS_I(inode);
size_t count = iov_iter_count(iter);
loff_t offset = iocb->ki_pos;
int rw = iov_iter_rw(iter);
int err;
enum rw_hint hint = iocb->ki_hint;
int whint_mode = F2FS_OPTION(sbi).whint_mode;
+ bool do_opu;
err = check_direct_IO(inode, iter, offset);
if (err)
return err < 0 ? err : 0;
- if (f2fs_force_buffered_io(inode, rw))
+ if (f2fs_force_buffered_io(inode, iocb, iter))
return 0;
+ do_opu = allow_outplace_dio(inode, iocb, iter);
+
trace_f2fs_direct_IO_enter(inode, offset, count, rw);
if (rw == WRITE && whint_mode == WHINT_MODE_OFF)
iocb->ki_hint = WRITE_LIFE_NOT_SET;
- if (!down_read_trylock(&F2FS_I(inode)->i_gc_rwsem[rw])) {
- if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (!down_read_trylock(&fi->i_gc_rwsem[rw])) {
iocb->ki_hint = hint;
err = -EAGAIN;
goto out;
}
- down_read(&F2FS_I(inode)->i_gc_rwsem[rw]);
+ if (do_opu && !down_read_trylock(&fi->i_gc_rwsem[READ])) {
+ up_read(&fi->i_gc_rwsem[rw]);
+ iocb->ki_hint = hint;
+ err = -EAGAIN;
+ goto out;
+ }
+ } else {
+ down_read(&fi->i_gc_rwsem[rw]);
+ if (do_opu)
+ down_read(&fi->i_gc_rwsem[READ]);
}
err = blockdev_direct_IO(iocb, inode, iter, get_data_block_dio);
- up_read(&F2FS_I(inode)->i_gc_rwsem[rw]);
+
+ if (do_opu)
+ up_read(&fi->i_gc_rwsem[READ]);
+
+ up_read(&fi->i_gc_rwsem[rw]);
if (rw == WRITE) {
if (whint_mode == WHINT_MODE_OFF)
@@ -2517,7 +2607,8 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
if (err > 0) {
f2fs_update_iostat(F2FS_I_SB(inode), APP_DIRECT_IO,
err);
- set_inode_flag(inode, FI_UPDATE_WRITE);
+ if (!do_opu)
+ set_inode_flag(inode, FI_UPDATE_WRITE);
} else if (err < 0) {
f2fs_write_failed(mapping, offset + count);
}
@@ -2550,6 +2641,8 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset,
}
}
+ clear_cold_data(page);
+
/* This is atomic written page, keep Private */
if (IS_ATOMIC_WRITTEN_PAGE(page))
return f2fs_drop_inmem_page(inode, page);
@@ -2568,6 +2661,7 @@ int f2fs_release_page(struct page *page, gfp_t wait)
if (IS_ATOMIC_WRITTEN_PAGE(page))
return 0;
+ clear_cold_data(page);
set_page_private(page, 0);
ClearPagePrivate(page);
return 1;
@@ -2583,10 +2677,6 @@ static int f2fs_set_data_page_dirty(struct page *page)
if (!PageUptodate(page))
SetPageUptodate(page);
- /* don't remain PG_checked flag which was set during GC */
- if (is_cold_data(page))
- clear_cold_data(page);
-
if (f2fs_is_atomic_file(inode) && !f2fs_is_commit_atomic_write(inode)) {
if (!IS_ATOMIC_WRITTEN_PAGE(page)) {
f2fs_register_inmem_page(inode, page);
@@ -2697,13 +2787,13 @@ const struct address_space_operations f2fs_dblock_aops = {
#endif
};
-void f2fs_clear_radix_tree_dirty_tag(struct page *page)
+void f2fs_clear_page_cache_dirty_tag(struct page *page)
{
struct address_space *mapping = page_mapping(page);
unsigned long flags;
xa_lock_irqsave(&mapping->i_pages, flags);
- radix_tree_tag_clear(&mapping->i_pages, page_index(page),
+ __xa_clear_mark(&mapping->i_pages, page_index(page),
PAGECACHE_TAG_DIRTY);
xa_unlock_irqrestore(&mapping->i_pages, flags);
}
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index 214a968962a1..139b4d5c83d5 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* f2fs debugging statistics
*
@@ -5,10 +6,6 @@
* http://www.samsung.com/
* Copyright (c) 2012 Linux Foundation
* Copyright (c) 2012 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/fs.h>
@@ -58,6 +55,9 @@ static void update_general_status(struct f2fs_sb_info *sbi)
si->max_vw_cnt = atomic_read(&sbi->max_vw_cnt);
si->nr_wb_cp_data = get_pages(sbi, F2FS_WB_CP_DATA);
si->nr_wb_data = get_pages(sbi, F2FS_WB_DATA);
+ si->nr_rd_data = get_pages(sbi, F2FS_RD_DATA);
+ si->nr_rd_node = get_pages(sbi, F2FS_RD_NODE);
+ si->nr_rd_meta = get_pages(sbi, F2FS_RD_META);
if (SM_I(sbi) && SM_I(sbi)->fcc_info) {
si->nr_flushed =
atomic_read(&SM_I(sbi)->fcc_info->issued_flush);
@@ -104,6 +104,8 @@ static void update_general_status(struct f2fs_sb_info *sbi)
si->avail_nids = NM_I(sbi)->available_nids;
si->alloc_nids = NM_I(sbi)->nid_cnt[PREALLOC_NID];
si->bg_gc = sbi->bg_gc;
+ si->io_skip_bggc = sbi->io_skip_bggc;
+ si->other_skip_bggc = sbi->other_skip_bggc;
si->skipped_atomic_files[BG_GC] = sbi->skipped_atomic_files[BG_GC];
si->skipped_atomic_files[FG_GC] = sbi->skipped_atomic_files[FG_GC];
si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg)
@@ -121,6 +123,9 @@ static void update_general_status(struct f2fs_sb_info *sbi)
si->curzone[i] = GET_ZONE_FROM_SEC(sbi, si->cursec[i]);
}
+ for (i = META_CP; i < META_MAX; i++)
+ si->meta_count[i] = atomic_read(&sbi->meta_count[i]);
+
for (i = 0; i < 2; i++) {
si->segment_count[i] = sbi->segment_count[i];
si->block_count[i] = sbi->block_count[i];
@@ -190,8 +195,7 @@ static void update_mem_info(struct f2fs_sb_info *sbi)
si->base_mem += MAIN_SEGS(sbi) * sizeof(struct seg_entry);
si->base_mem += f2fs_bitmap_size(MAIN_SEGS(sbi));
si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi);
- if (f2fs_discard_en(sbi))
- si->base_mem += SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi);
+ si->base_mem += SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi);
si->base_mem += SIT_VBLOCK_MAP_SIZE;
if (sbi->segs_per_sec > 1)
si->base_mem += MAIN_SECS(sbi) * sizeof(struct sec_entry);
@@ -271,7 +275,8 @@ static int stat_show(struct seq_file *s, void *v)
seq_printf(s, "\n=====[ partition info(%pg). #%d, %s, CP: %s]=====\n",
si->sbi->sb->s_bdev, i++,
f2fs_readonly(si->sbi->sb) ? "RO": "RW",
- f2fs_cp_error(si->sbi) ? "Error": "Good");
+ is_set_ckpt_flags(si->sbi, CP_DISABLED_FLAG) ?
+ "Disabled": (f2fs_cp_error(si->sbi) ? "Error": "Good"));
seq_printf(s, "[SB: 1] [CP: 2] [SIT: %d] [NAT: %d] ",
si->sit_area_segs, si->nat_area_segs);
seq_printf(s, "[SSA: %d] [MAIN: %d",
@@ -333,6 +338,13 @@ static int stat_show(struct seq_file *s, void *v)
si->prefree_count, si->free_segs, si->free_secs);
seq_printf(s, "CP calls: %d (BG: %d)\n",
si->cp_count, si->bg_cp_count);
+ seq_printf(s, " - cp blocks : %u\n", si->meta_count[META_CP]);
+ seq_printf(s, " - sit blocks : %u\n",
+ si->meta_count[META_SIT]);
+ seq_printf(s, " - nat blocks : %u\n",
+ si->meta_count[META_NAT]);
+ seq_printf(s, " - ssa blocks : %u\n",
+ si->meta_count[META_SSA]);
seq_printf(s, "GC calls: %d (BG: %d)\n",
si->call_count, si->bg_gc);
seq_printf(s, " - data segments : %d (%d)\n",
@@ -349,6 +361,8 @@ static int stat_show(struct seq_file *s, void *v)
si->skipped_atomic_files[BG_GC] +
si->skipped_atomic_files[FG_GC],
si->skipped_atomic_files[BG_GC]);
+ seq_printf(s, "BG skip : IO: %u, Other: %u\n",
+ si->io_skip_bggc, si->other_skip_bggc);
seq_puts(s, "\nExtent Cache:\n");
seq_printf(s, " - Hit Count: L1-1:%llu L1-2:%llu L2:%llu\n",
si->hit_largest, si->hit_cached,
@@ -360,7 +374,9 @@ static int stat_show(struct seq_file *s, void *v)
seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n",
si->ext_tree, si->zombie_tree, si->ext_node);
seq_puts(s, "\nBalancing F2FS Async:\n");
- seq_printf(s, " - IO (CP: %4d, Data: %4d, Flush: (%4d %4d %4d), "
+ seq_printf(s, " - IO_R (Data: %4d, Node: %4d, Meta: %4d\n",
+ si->nr_rd_data, si->nr_rd_node, si->nr_rd_meta);
+ seq_printf(s, " - IO_W (CP: %4d, Data: %4d, Flush: (%4d %4d %4d), "
"Discard: (%4d %4d)) cmd: %4d undiscard:%4u\n",
si->nr_wb_cp_data, si->nr_wb_data,
si->nr_flushing, si->nr_flushed,
@@ -445,6 +461,7 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
{
struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
struct f2fs_stat_info *si;
+ int i;
si = f2fs_kzalloc(sbi, sizeof(struct f2fs_stat_info), GFP_KERNEL);
if (!si)
@@ -470,6 +487,8 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
atomic_set(&sbi->inline_inode, 0);
atomic_set(&sbi->inline_dir, 0);
atomic_set(&sbi->inplace_count, 0);
+ for (i = META_CP; i < META_MAX; i++)
+ atomic_set(&sbi->meta_count[i], 0);
atomic_set(&sbi->aw_cnt, 0);
atomic_set(&sbi->vw_cnt, 0);
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index ecc3a4e2be96..bacc667950b6 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* fs/f2fs/dir.c
*
* Copyright (c) 2012 Samsung Electronics Co., Ltd.
* http://www.samsung.com/
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/f2fs_fs.h>
@@ -658,9 +655,9 @@ int f2fs_do_tmpfile(struct inode *inode, struct inode *dir)
f2fs_put_page(page, 1);
clear_inode_flag(inode, FI_NEW_INODE);
+ f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
fail:
up_write(&F2FS_I(inode)->i_sem);
- f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
return err;
}
@@ -729,10 +726,11 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
if (bit_pos == NR_DENTRY_IN_BLOCK &&
!f2fs_truncate_hole(dir, page->index, page->index + 1)) {
- f2fs_clear_radix_tree_dirty_tag(page);
+ f2fs_clear_page_cache_dirty_tag(page);
clear_page_dirty_for_io(page);
ClearPagePrivate(page);
ClearPageUptodate(page);
+ clear_cold_data(page);
inode_dec_dirty_pages(dir);
f2fs_remove_dirty_inode(dir);
}
@@ -784,9 +782,15 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
struct f2fs_dir_entry *de = NULL;
struct fscrypt_str de_name = FSTR_INIT(NULL, 0);
struct f2fs_sb_info *sbi = F2FS_I_SB(d->inode);
+ struct blk_plug plug;
+ bool readdir_ra = sbi->readdir_ra == 1;
+ int err = 0;
bit_pos = ((unsigned long)ctx->pos % d->max);
+ if (readdir_ra)
+ blk_start_plug(&plug);
+
while (bit_pos < d->max) {
bit_pos = find_next_bit_le(d->bitmap, d->max, bit_pos);
if (bit_pos >= d->max)
@@ -806,29 +810,33 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
if (f2fs_encrypted_inode(d->inode)) {
int save_len = fstr->len;
- int err;
err = fscrypt_fname_disk_to_usr(d->inode,
(u32)de->hash_code, 0,
&de_name, fstr);
if (err)
- return err;
+ goto out;
de_name = *fstr;
fstr->len = save_len;
}
if (!dir_emit(ctx, de_name.name, de_name.len,
- le32_to_cpu(de->ino), d_type))
- return 1;
+ le32_to_cpu(de->ino), d_type)) {
+ err = 1;
+ goto out;
+ }
- if (sbi->readdir_ra == 1)
+ if (readdir_ra)
f2fs_ra_node_page(sbi, le32_to_cpu(de->ino));
bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
ctx->pos = start_pos + bit_pos;
}
- return 0;
+out:
+ if (readdir_ra)
+ blk_finish_plug(&plug);
+ return err;
}
static int f2fs_readdir(struct file *file, struct dir_context *ctx)
diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index 231b77ef5a53..1cb0fcc67d2d 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* f2fs extent cache support
*
@@ -5,10 +6,6 @@
* Copyright (c) 2015 Samsung Electronics
* Authors: Jaegeuk Kim <jaegeuk@kernel.org>
* Chao Yu <chao2.yu@samsung.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/fs.h>
@@ -30,10 +27,10 @@ static struct rb_entry *__lookup_rb_tree_fast(struct rb_entry *cached_re,
return NULL;
}
-static struct rb_entry *__lookup_rb_tree_slow(struct rb_root *root,
+static struct rb_entry *__lookup_rb_tree_slow(struct rb_root_cached *root,
unsigned int ofs)
{
- struct rb_node *node = root->rb_node;
+ struct rb_node *node = root->rb_root.rb_node;
struct rb_entry *re;
while (node) {
@@ -49,7 +46,7 @@ static struct rb_entry *__lookup_rb_tree_slow(struct rb_root *root,
return NULL;
}
-struct rb_entry *f2fs_lookup_rb_tree(struct rb_root *root,
+struct rb_entry *f2fs_lookup_rb_tree(struct rb_root_cached *root,
struct rb_entry *cached_re, unsigned int ofs)
{
struct rb_entry *re;
@@ -62,22 +59,25 @@ struct rb_entry *f2fs_lookup_rb_tree(struct rb_root *root,
}
struct rb_node **f2fs_lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi,
- struct rb_root *root, struct rb_node **parent,
- unsigned int ofs)
+ struct rb_root_cached *root,
+ struct rb_node **parent,
+ unsigned int ofs, bool *leftmost)
{
- struct rb_node **p = &root->rb_node;
+ struct rb_node **p = &root->rb_root.rb_node;
struct rb_entry *re;
while (*p) {
*parent = *p;
re = rb_entry(*parent, struct rb_entry, rb_node);
- if (ofs < re->ofs)
+ if (ofs < re->ofs) {
p = &(*p)->rb_left;
- else if (ofs >= re->ofs + re->len)
+ } else if (ofs >= re->ofs + re->len) {
p = &(*p)->rb_right;
- else
+ *leftmost = false;
+ } else {
f2fs_bug_on(sbi, 1);
+ }
}
return p;
@@ -92,16 +92,16 @@ struct rb_node **f2fs_lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi,
* in order to simpfy the insertion after.
* tree must stay unchanged between lookup and insertion.
*/
-struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root *root,
+struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root_cached *root,
struct rb_entry *cached_re,
unsigned int ofs,
struct rb_entry **prev_entry,
struct rb_entry **next_entry,
struct rb_node ***insert_p,
struct rb_node **insert_parent,
- bool force)
+ bool force, bool *leftmost)
{
- struct rb_node **pnode = &root->rb_node;
+ struct rb_node **pnode = &root->rb_root.rb_node;
struct rb_node *parent = NULL, *tmp_node;
struct rb_entry *re = cached_re;
@@ -110,7 +110,7 @@ struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root *root,
*prev_entry = NULL;
*next_entry = NULL;
- if (RB_EMPTY_ROOT(root))
+ if (RB_EMPTY_ROOT(&root->rb_root))
return NULL;
if (re) {
@@ -118,16 +118,22 @@ struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root *root,
goto lookup_neighbors;
}
+ if (leftmost)
+ *leftmost = true;
+
while (*pnode) {
parent = *pnode;
re = rb_entry(*pnode, struct rb_entry, rb_node);
- if (ofs < re->ofs)
+ if (ofs < re->ofs) {
pnode = &(*pnode)->rb_left;
- else if (ofs >= re->ofs + re->len)
+ } else if (ofs >= re->ofs + re->len) {
pnode = &(*pnode)->rb_right;
- else
+ if (leftmost)
+ *leftmost = false;
+ } else {
goto lookup_neighbors;
+ }
}
*insert_p = pnode;
@@ -160,10 +166,10 @@ lookup_neighbors:
}
bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi,
- struct rb_root *root)
+ struct rb_root_cached *root)
{
#ifdef CONFIG_F2FS_CHECK_FS
- struct rb_node *cur = rb_first(root), *next;
+ struct rb_node *cur = rb_first_cached(root), *next;
struct rb_entry *cur_re, *next_re;
if (!cur)
@@ -196,7 +202,8 @@ static struct kmem_cache *extent_node_slab;
static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi,
struct extent_tree *et, struct extent_info *ei,
- struct rb_node *parent, struct rb_node **p)
+ struct rb_node *parent, struct rb_node **p,
+ bool leftmost)
{
struct extent_node *en;
@@ -209,7 +216,7 @@ static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi,
en->et = et;
rb_link_node(&en->rb_node, parent, p);
- rb_insert_color(&en->rb_node, &et->root);
+ rb_insert_color_cached(&en->rb_node, &et->root, leftmost);
atomic_inc(&et->node_cnt);
atomic_inc(&sbi->total_ext_node);
return en;
@@ -218,7 +225,7 @@ static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi,
static void __detach_extent_node(struct f2fs_sb_info *sbi,
struct extent_tree *et, struct extent_node *en)
{
- rb_erase(&en->rb_node, &et->root);
+ rb_erase_cached(&en->rb_node, &et->root);
atomic_dec(&et->node_cnt);
atomic_dec(&sbi->total_ext_node);
@@ -257,7 +264,7 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode)
f2fs_radix_tree_insert(&sbi->extent_tree_root, ino, et);
memset(et, 0, sizeof(struct extent_tree));
et->ino = ino;
- et->root = RB_ROOT;
+ et->root = RB_ROOT_CACHED;
et->cached_en = NULL;
rwlock_init(&et->lock);
INIT_LIST_HEAD(&et->list);
@@ -278,10 +285,10 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode)
static struct extent_node *__init_extent_tree(struct f2fs_sb_info *sbi,
struct extent_tree *et, struct extent_info *ei)
{
- struct rb_node **p = &et->root.rb_node;
+ struct rb_node **p = &et->root.rb_root.rb_node;
struct extent_node *en;
- en = __attach_extent_node(sbi, et, ei, NULL, p);
+ en = __attach_extent_node(sbi, et, ei, NULL, p, true);
if (!en)
return NULL;
@@ -297,7 +304,7 @@ static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi,
struct extent_node *en;
unsigned int count = atomic_read(&et->node_cnt);
- node = rb_first(&et->root);
+ node = rb_first_cached(&et->root);
while (node) {
next = rb_next(node);
en = rb_entry(node, struct extent_node, rb_node);
@@ -308,14 +315,13 @@ static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi,
return count - atomic_read(&et->node_cnt);
}
-static void __drop_largest_extent(struct inode *inode,
+static void __drop_largest_extent(struct extent_tree *et,
pgoff_t fofs, unsigned int len)
{
- struct extent_info *largest = &F2FS_I(inode)->extent_tree->largest;
-
- if (fofs < largest->fofs + largest->len && fofs + len > largest->fofs) {
- largest->len = 0;
- f2fs_mark_inode_dirty_sync(inode, true);
+ if (fofs < et->largest.fofs + et->largest.len &&
+ fofs + len > et->largest.fofs) {
+ et->largest.len = 0;
+ et->largest_updated = true;
}
}
@@ -416,12 +422,11 @@ out:
return ret;
}
-static struct extent_node *__try_merge_extent_node(struct inode *inode,
+static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi,
struct extent_tree *et, struct extent_info *ei,
struct extent_node *prev_ex,
struct extent_node *next_ex)
{
- struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct extent_node *en = NULL;
if (prev_ex && __is_back_mergeable(ei, &prev_ex->ei)) {
@@ -443,7 +448,7 @@ static struct extent_node *__try_merge_extent_node(struct inode *inode,
if (!en)
return NULL;
- __try_update_largest_extent(inode, et, en);
+ __try_update_largest_extent(et, en);
spin_lock(&sbi->extent_lock);
if (!list_empty(&en->list)) {
@@ -454,12 +459,12 @@ static struct extent_node *__try_merge_extent_node(struct inode *inode,
return en;
}
-static struct extent_node *__insert_extent_tree(struct inode *inode,
+static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi,
struct extent_tree *et, struct extent_info *ei,
struct rb_node **insert_p,
- struct rb_node *insert_parent)
+ struct rb_node *insert_parent,
+ bool leftmost)
{
- struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct rb_node **p;
struct rb_node *parent = NULL;
struct extent_node *en = NULL;
@@ -470,13 +475,16 @@ static struct extent_node *__insert_extent_tree(struct inode *inode,
goto do_insert;
}
- p = f2fs_lookup_rb_tree_for_insert(sbi, &et->root, &parent, ei->fofs);
+ leftmost = true;
+
+ p = f2fs_lookup_rb_tree_for_insert(sbi, &et->root, &parent,
+ ei->fofs, &leftmost);
do_insert:
- en = __attach_extent_node(sbi, et, ei, parent, p);
+ en = __attach_extent_node(sbi, et, ei, parent, p, leftmost);
if (!en)
return NULL;
- __try_update_largest_extent(inode, et, en);
+ __try_update_largest_extent(et, en);
/* update in global extent list */
spin_lock(&sbi->extent_lock);
@@ -497,6 +505,8 @@ static void f2fs_update_extent_tree_range(struct inode *inode,
struct rb_node **insert_p = NULL, *insert_parent = NULL;
unsigned int end = fofs + len;
unsigned int pos = (unsigned int)fofs;
+ bool updated = false;
+ bool leftmost;
if (!et)
return;
@@ -517,14 +527,15 @@ static void f2fs_update_extent_tree_range(struct inode *inode,
* drop largest extent before lookup, in case it's already
* been shrunk from extent tree
*/
- __drop_largest_extent(inode, fofs, len);
+ __drop_largest_extent(et, fofs, len);
/* 1. lookup first extent node in range [fofs, fofs + len - 1] */
en = (struct extent_node *)f2fs_lookup_rb_tree_ret(&et->root,
(struct rb_entry *)et->cached_en, fofs,
(struct rb_entry **)&prev_en,
(struct rb_entry **)&next_en,
- &insert_p, &insert_parent, false);
+ &insert_p, &insert_parent, false,
+ &leftmost);
if (!en)
en = next_en;
@@ -550,8 +561,8 @@ static void f2fs_update_extent_tree_range(struct inode *inode,
set_extent_info(&ei, end,
end - dei.fofs + dei.blk,
org_end - end);
- en1 = __insert_extent_tree(inode, et, &ei,
- NULL, NULL);
+ en1 = __insert_extent_tree(sbi, et, &ei,
+ NULL, NULL, true);
next_en = en1;
} else {
en->ei.fofs = end;
@@ -570,7 +581,7 @@ static void f2fs_update_extent_tree_range(struct inode *inode,
}
if (parts)
- __try_update_largest_extent(inode, et, en);
+ __try_update_largest_extent(et, en);
else
__release_extent_node(sbi, et, en);
@@ -590,15 +601,16 @@ static void f2fs_update_extent_tree_range(struct inode *inode,
if (blkaddr) {
set_extent_info(&ei, fofs, blkaddr, len);
- if (!__try_merge_extent_node(inode, et, &ei, prev_en, next_en))
- __insert_extent_tree(inode, et, &ei,
- insert_p, insert_parent);
+ if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en))
+ __insert_extent_tree(sbi, et, &ei,
+ insert_p, insert_parent, leftmost);
/* give up extent_cache, if split and small updates happen */
if (dei.len >= 1 &&
prev.len < F2FS_MIN_EXTENT_LEN &&
et->largest.len < F2FS_MIN_EXTENT_LEN) {
- __drop_largest_extent(inode, 0, UINT_MAX);
+ et->largest.len = 0;
+ et->largest_updated = true;
set_inode_flag(inode, FI_NO_EXTENT);
}
}
@@ -606,7 +618,15 @@ static void f2fs_update_extent_tree_range(struct inode *inode,
if (is_inode_flag_set(inode, FI_NO_EXTENT))
__free_extent_tree(sbi, et);
+ if (et->largest_updated) {
+ et->largest_updated = false;
+ updated = true;
+ }
+
write_unlock(&et->lock);
+
+ if (updated)
+ f2fs_mark_inode_dirty_sync(inode, true);
}
unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink)
@@ -705,6 +725,7 @@ void f2fs_drop_extent_tree(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct extent_tree *et = F2FS_I(inode)->extent_tree;
+ bool updated = false;
if (!f2fs_may_extent_tree(inode))
return;
@@ -713,8 +734,13 @@ void f2fs_drop_extent_tree(struct inode *inode)
write_lock(&et->lock);
__free_extent_tree(sbi, et);
- __drop_largest_extent(inode, 0, UINT_MAX);
+ if (et->largest.len) {
+ et->largest.len = 0;
+ updated = true;
+ }
write_unlock(&et->lock);
+ if (updated)
+ f2fs_mark_inode_dirty_sync(inode, true);
}
void f2fs_destroy_extent_tree(struct inode *inode)
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index abf925664d9c..1e031971a466 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -1,16 +1,14 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* fs/f2fs/f2fs.h
*
* Copyright (c) 2012 Samsung Electronics Co., Ltd.
* http://www.samsung.com/
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#ifndef _LINUX_F2FS_H
#define _LINUX_F2FS_H
+#include <linux/uio.h>
#include <linux/types.h>
#include <linux/page-flags.h>
#include <linux/buffer_head.h>
@@ -53,9 +51,10 @@ enum {
FAULT_DIR_DEPTH,
FAULT_EVICT_INODE,
FAULT_TRUNCATE,
- FAULT_IO,
+ FAULT_READ_IO,
FAULT_CHECKPOINT,
FAULT_DISCARD,
+ FAULT_WRITE_IO,
FAULT_MAX,
};
@@ -100,6 +99,7 @@ extern char *f2fs_fault_name[FAULT_MAX];
#define F2FS_MOUNT_QUOTA 0x00400000
#define F2FS_MOUNT_INLINE_XATTR_SIZE 0x00800000
#define F2FS_MOUNT_RESERVE_ROOT 0x01000000
+#define F2FS_MOUNT_DISABLE_CHECKPOINT 0x02000000
#define F2FS_OPTION(sbi) ((sbi)->mount_opt)
#define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option)
@@ -150,6 +150,7 @@ struct f2fs_mount_info {
#define F2FS_FEATURE_INODE_CRTIME 0x0100
#define F2FS_FEATURE_LOST_FOUND 0x0200
#define F2FS_FEATURE_VERITY 0x0400 /* reserved */
+#define F2FS_FEATURE_SB_CHKSUM 0x0800
#define F2FS_HAS_FEATURE(sb, mask) \
((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0)
@@ -178,6 +179,7 @@ enum {
#define CP_RECOVERY 0x00000008
#define CP_DISCARD 0x00000010
#define CP_TRIMMED 0x00000020
+#define CP_PAUSE 0x00000040
#define MAX_DISCARD_BLOCKS(sbi) BLKS_PER_SEC(sbi)
#define DEF_MAX_DISCARD_REQUEST 8 /* issue 8 discards per round */
@@ -187,6 +189,7 @@ enum {
#define DEF_DISCARD_URGENT_UTIL 80 /* do more discard over 80% */
#define DEF_CP_INTERVAL 60 /* 60 secs */
#define DEF_IDLE_INTERVAL 5 /* 5 secs */
+#define DEF_DISABLE_INTERVAL 5 /* 5 secs */
struct cp_control {
int reason;
@@ -203,6 +206,7 @@ enum {
META_NAT,
META_SIT,
META_SSA,
+ META_MAX,
META_POR,
DATA_GENERIC,
META_GENERIC,
@@ -324,7 +328,7 @@ struct discard_cmd_control {
atomic_t issued_discard; /* # of issued discard */
atomic_t issing_discard; /* # of issing discard */
atomic_t discard_cmd_cnt; /* # of cached cmd count */
- struct rb_root root; /* root of discard rb-tree */
+ struct rb_root_cached root; /* root of discard rb-tree */
bool rbtree_check; /* config for consistence check */
};
@@ -527,6 +531,9 @@ enum {
#define DEFAULT_RETRY_IO_COUNT 8 /* maximum retry read IO count */
+/* maximum retry quota flush count */
+#define DEFAULT_RETRY_QUOTA_FLUSH_COUNT 8
+
#define F2FS_LINK_MAX 0xffffffff /* maximum link count per file */
#define MAX_DIR_RA_PAGES 4 /* maximum ra pages of dir */
@@ -566,12 +573,13 @@ struct extent_node {
struct extent_tree {
nid_t ino; /* inode number */
- struct rb_root root; /* root of extent info rb-tree */
+ struct rb_root_cached root; /* root of extent info rb-tree */
struct extent_node *cached_en; /* recently accessed extent node */
struct extent_info largest; /* largested extent info */
struct list_head list; /* to be used by sbi->zombie_list */
rwlock_t lock; /* protect extent info rb-tree */
atomic_t node_cnt; /* # of extent node in rb-tree*/
+ bool largest_updated; /* largest extent updated */
};
/*
@@ -600,6 +608,7 @@ enum {
F2FS_GET_BLOCK_DEFAULT,
F2FS_GET_BLOCK_FIEMAP,
F2FS_GET_BLOCK_BMAP,
+ F2FS_GET_BLOCK_DIO,
F2FS_GET_BLOCK_PRE_DIO,
F2FS_GET_BLOCK_PRE_AIO,
F2FS_GET_BLOCK_PRECACHE,
@@ -754,12 +763,12 @@ static inline bool __is_front_mergeable(struct extent_info *cur,
}
extern void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync);
-static inline void __try_update_largest_extent(struct inode *inode,
- struct extent_tree *et, struct extent_node *en)
+static inline void __try_update_largest_extent(struct extent_tree *et,
+ struct extent_node *en)
{
if (en->ei.len > et->largest.len) {
et->largest = en->ei;
- f2fs_mark_inode_dirty_sync(inode, true);
+ et->largest_updated = true;
}
}
@@ -944,6 +953,9 @@ enum count_type {
F2FS_DIRTY_IMETA,
F2FS_WB_CP_DATA,
F2FS_WB_DATA,
+ F2FS_RD_DATA,
+ F2FS_RD_NODE,
+ F2FS_RD_META,
NR_COUNT_TYPE,
};
@@ -1088,11 +1100,19 @@ enum {
SBI_NEED_SB_WRITE, /* need to recover superblock */
SBI_NEED_CP, /* need to checkpoint */
SBI_IS_SHUTDOWN, /* shutdown by ioctl */
+ SBI_IS_RECOVERED, /* recovered orphan/data */
+ SBI_CP_DISABLED, /* CP was disabled last mount */
+ SBI_QUOTA_NEED_FLUSH, /* need to flush quota info in CP */
+ SBI_QUOTA_SKIP_FLUSH, /* skip flushing quota in current CP */
+ SBI_QUOTA_NEED_REPAIR, /* quota file may be corrupted */
};
enum {
CP_TIME,
REQ_TIME,
+ DISCARD_TIME,
+ GC_TIME,
+ DISABLE_TIME,
MAX_TIME,
};
@@ -1209,7 +1229,6 @@ struct f2fs_sb_info {
unsigned int total_valid_node_count; /* valid node block count */
loff_t max_file_blocks; /* max block index of file */
int dir_level; /* directory level */
- unsigned int trigger_ssr_threshold; /* threshold to trigger ssr */
int readdir_ra; /* readahead inode in readdir */
block_t user_block_count; /* # of user blocks */
@@ -1219,6 +1238,9 @@ struct f2fs_sb_info {
block_t reserved_blocks; /* configurable reserved blocks */
block_t current_reserved_blocks; /* current reserved blocks */
+ /* Additional tracking for no checkpoint mode */
+ block_t unusable_block_count; /* # of blocks saved by last cp */
+
unsigned int nquota_files; /* # of quota sysfile */
u32 s_next_generation; /* for NFS support */
@@ -1257,6 +1279,7 @@ struct f2fs_sb_info {
*/
#ifdef CONFIG_F2FS_STAT_FS
struct f2fs_stat_info *stat_info; /* FS status information */
+ atomic_t meta_count[META_MAX]; /* # of meta blocks */
unsigned int segment_count[2]; /* # of allocated segments */
unsigned int block_count[2]; /* # of allocated blocks */
atomic_t inplace_count; /* # of inplace update */
@@ -1272,6 +1295,8 @@ struct f2fs_sb_info {
atomic_t max_aw_cnt; /* max # of atomic writes */
atomic_t max_vw_cnt; /* max # of volatile writes */
int bg_gc; /* background gc calls */
+ unsigned int io_skip_bggc; /* skip background gc for in-flight IO */
+ unsigned int other_skip_bggc; /* skip background gc for other reasons */
unsigned int ndirty_inode[NR_INODE_TYPE]; /* # of dirty inodes */
#endif
spinlock_t stat_lock; /* lock for stat operations */
@@ -1306,9 +1331,9 @@ struct f2fs_sb_info {
};
#ifdef CONFIG_F2FS_FAULT_INJECTION
-#define f2fs_show_injection_info(type) \
- printk("%sF2FS-fs : inject %s in %s of %pF\n", \
- KERN_INFO, f2fs_fault_name[type], \
+#define f2fs_show_injection_info(type) \
+ printk_ratelimited("%sF2FS-fs : inject %s in %s of %pF\n", \
+ KERN_INFO, f2fs_fault_name[type], \
__func__, __builtin_return_address(0))
static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type)
{
@@ -1344,7 +1369,15 @@ static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type)
static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type)
{
- sbi->last_time[type] = jiffies;
+ unsigned long now = jiffies;
+
+ sbi->last_time[type] = now;
+
+ /* DISCARD_TIME and GC_TIME are based on REQ_TIME */
+ if (type == REQ_TIME) {
+ sbi->last_time[DISCARD_TIME] = now;
+ sbi->last_time[GC_TIME] = now;
+ }
}
static inline bool f2fs_time_over(struct f2fs_sb_info *sbi, int type)
@@ -1354,16 +1387,18 @@ static inline bool f2fs_time_over(struct f2fs_sb_info *sbi, int type)
return time_after(jiffies, sbi->last_time[type] + interval);
}
-static inline bool is_idle(struct f2fs_sb_info *sbi)
+static inline unsigned int f2fs_time_to_wait(struct f2fs_sb_info *sbi,
+ int type)
{
- struct block_device *bdev = sbi->sb->s_bdev;
- struct request_queue *q = bdev_get_queue(bdev);
- struct request_list *rl = &q->root_rl;
+ unsigned long interval = sbi->interval_time[type] * HZ;
+ unsigned int wait_ms = 0;
+ long delta;
- if (rl->count[BLK_RW_SYNC] || rl->count[BLK_RW_ASYNC])
- return false;
+ delta = (sbi->last_time[type] + interval) - jiffies;
+ if (delta > 0)
+ wait_ms = jiffies_to_msecs(delta);
- return f2fs_time_over(sbi, REQ_TIME);
+ return wait_ms;
}
/*
@@ -1704,7 +1739,8 @@ static inline int inc_valid_block_count(struct f2fs_sb_info *sbi,
if (!__allow_reserved_blocks(sbi, inode, true))
avail_user_block_count -= F2FS_OPTION(sbi).root_reserved_blocks;
-
+ if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED)))
+ avail_user_block_count -= sbi->unusable_block_count;
if (unlikely(sbi->total_valid_block_count > avail_user_block_count)) {
diff = sbi->total_valid_block_count - avail_user_block_count;
if (diff > *count)
@@ -1755,7 +1791,9 @@ static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type)
atomic_inc(&sbi->nr_pages[count_type]);
if (count_type == F2FS_DIRTY_DATA || count_type == F2FS_INMEM_PAGES ||
- count_type == F2FS_WB_CP_DATA || count_type == F2FS_WB_DATA)
+ count_type == F2FS_WB_CP_DATA || count_type == F2FS_WB_DATA ||
+ count_type == F2FS_RD_DATA || count_type == F2FS_RD_NODE ||
+ count_type == F2FS_RD_META)
return;
set_sbi_flag(sbi, SBI_IS_DIRTY);
@@ -1891,12 +1929,18 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi,
{
block_t valid_block_count;
unsigned int valid_node_count;
- bool quota = inode && !is_inode;
+ int err;
- if (quota) {
- int ret = dquot_reserve_block(inode, 1);
- if (ret)
- return ret;
+ if (is_inode) {
+ if (inode) {
+ err = dquot_alloc_inode(inode);
+ if (err)
+ return err;
+ }
+ } else {
+ err = dquot_reserve_block(inode, 1);
+ if (err)
+ return err;
}
if (time_to_inject(sbi, FAULT_BLOCK)) {
@@ -1911,6 +1955,8 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi,
if (!__allow_reserved_blocks(sbi, inode, false))
valid_block_count += F2FS_OPTION(sbi).root_reserved_blocks;
+ if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED)))
+ valid_block_count += sbi->unusable_block_count;
if (unlikely(valid_block_count > sbi->user_block_count)) {
spin_unlock(&sbi->stat_lock);
@@ -1938,8 +1984,12 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi,
return 0;
enospc:
- if (quota)
+ if (is_inode) {
+ if (inode)
+ dquot_free_inode(inode);
+ } else {
dquot_release_reservation_block(inode, 1);
+ }
return -ENOSPC;
}
@@ -1960,7 +2010,9 @@ static inline void dec_valid_node_count(struct f2fs_sb_info *sbi,
spin_unlock(&sbi->stat_lock);
- if (!is_inode)
+ if (is_inode)
+ dquot_free_inode(inode);
+ else
f2fs_i_blocks_write(inode, 1, false, true);
}
@@ -2090,6 +2142,15 @@ static inline struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi,
return bio_alloc(GFP_KERNEL, npages);
}
+static inline bool is_idle(struct f2fs_sb_info *sbi, int type)
+{
+ if (get_pages(sbi, F2FS_RD_DATA) || get_pages(sbi, F2FS_RD_NODE) ||
+ get_pages(sbi, F2FS_RD_META) || get_pages(sbi, F2FS_WB_DATA) ||
+ get_pages(sbi, F2FS_WB_CP_DATA))
+ return false;
+ return f2fs_time_over(sbi, type);
+}
+
static inline void f2fs_radix_tree_insert(struct radix_tree_root *root,
unsigned long index, void *item)
{
@@ -2739,7 +2800,8 @@ static inline bool is_valid_data_blkaddr(struct f2fs_sb_info *sbi,
*/
int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync);
void f2fs_truncate_data_blocks(struct dnode_of_data *dn);
-int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock);
+int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock,
+ bool buf_write);
int f2fs_truncate(struct inode *inode);
int f2fs_getattr(const struct path *path, struct kstat *stat,
u32 request_mask, unsigned int flags);
@@ -2749,6 +2811,7 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count);
int f2fs_precache_extents(struct inode *inode);
long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+int f2fs_transfer_project_quota(struct inode *inode, kprojid_t kprojid);
int f2fs_pin_file_control(struct inode *inode, bool inc);
/*
@@ -2827,6 +2890,7 @@ static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode)
int f2fs_inode_dirtied(struct inode *inode, bool sync);
void f2fs_inode_synced(struct inode *inode);
int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly);
+int f2fs_quota_sync(struct super_block *sb, int type);
void f2fs_quota_off_umount(struct super_block *sb);
int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover);
int f2fs_sync_fs(struct super_block *sb, int sync);
@@ -2869,7 +2933,7 @@ struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs);
void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid);
struct page *f2fs_get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid);
struct page *f2fs_get_node_page_ra(struct page *parent, int start);
-void f2fs_move_node_page(struct page *node_page, int gc_type);
+int f2fs_move_node_page(struct page *node_page, int gc_type);
int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode,
struct writeback_control *wbc, bool atomic,
unsigned int *seq_id);
@@ -2886,7 +2950,7 @@ int f2fs_recover_xattr_data(struct inode *inode, struct page *page);
int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page);
int f2fs_restore_node_summary(struct f2fs_sb_info *sbi,
unsigned int segno, struct f2fs_summary_block *sum);
-void f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc);
+int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc);
int f2fs_build_node_manager(struct f2fs_sb_info *sbi);
void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi);
int __init f2fs_create_node_manager_caches(void);
@@ -2914,6 +2978,8 @@ void f2fs_stop_discard_thread(struct f2fs_sb_info *sbi);
bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi);
void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi,
struct cp_control *cpc);
+void f2fs_dirty_to_prefree(struct f2fs_sb_info *sbi);
+int f2fs_disable_cp_again(struct f2fs_sb_info *sbi);
void f2fs_release_discard_addrs(struct f2fs_sb_info *sbi);
int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra);
void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi);
@@ -2942,7 +3008,9 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
struct f2fs_io_info *fio, bool add_list);
void f2fs_wait_on_page_writeback(struct page *page,
enum page_type type, bool ordered);
-void f2fs_wait_on_block_writeback(struct f2fs_sb_info *sbi, block_t blkaddr);
+void f2fs_wait_on_block_writeback(struct inode *inode, block_t blkaddr);
+void f2fs_wait_on_block_writeback_range(struct inode *inode, block_t blkaddr,
+ block_t len);
void f2fs_write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk);
void f2fs_write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk);
int f2fs_lookup_journal_in_cursum(struct f2fs_journal *journal, int type,
@@ -3002,8 +3070,8 @@ int f2fs_init_post_read_processing(void);
void f2fs_destroy_post_read_processing(void);
void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type);
void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi,
- struct inode *inode, nid_t ino, pgoff_t idx,
- enum page_type type);
+ struct inode *inode, struct page *page,
+ nid_t ino, enum page_type type);
void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi);
int f2fs_submit_page_bio(struct f2fs_io_info *fio);
void f2fs_submit_page_write(struct f2fs_io_info *fio);
@@ -3025,6 +3093,7 @@ struct page *f2fs_get_lock_data_page(struct inode *inode, pgoff_t index,
struct page *f2fs_get_new_data_page(struct inode *inode,
struct page *ipage, pgoff_t index, bool new_i_size);
int f2fs_do_write_data_page(struct f2fs_io_info *fio);
+void __do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock);
int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
int create, int flag);
int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
@@ -3039,7 +3108,7 @@ int f2fs_migrate_page(struct address_space *mapping, struct page *newpage,
struct page *page, enum migrate_mode mode);
#endif
bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len);
-void f2fs_clear_radix_tree_dirty_tag(struct page *page);
+void f2fs_clear_page_cache_dirty_tag(struct page *page);
/*
* gc.c
@@ -3077,6 +3146,8 @@ struct f2fs_stat_info {
int free_nids, avail_nids, alloc_nids;
int total_count, utilization;
int bg_gc, nr_wb_cp_data, nr_wb_data;
+ int nr_rd_data, nr_rd_node, nr_rd_meta;
+ unsigned int io_skip_bggc, other_skip_bggc;
int nr_flushing, nr_flushed, flush_list_empty;
int nr_discarding, nr_discarded;
int nr_discard_cmd;
@@ -3098,6 +3169,7 @@ struct f2fs_stat_info {
int cursec[NR_CURSEG_TYPE];
int curzone[NR_CURSEG_TYPE];
+ unsigned int meta_count[META_MAX];
unsigned int segment_count[2];
unsigned int block_count[2];
unsigned int inplace_count;
@@ -3113,6 +3185,8 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
#define stat_inc_bg_cp_count(si) ((si)->bg_cp_count++)
#define stat_inc_call_count(si) ((si)->call_count++)
#define stat_inc_bggc_count(sbi) ((sbi)->bg_gc++)
+#define stat_io_skip_bggc_count(sbi) ((sbi)->io_skip_bggc++)
+#define stat_other_skip_bggc_count(sbi) ((sbi)->other_skip_bggc++)
#define stat_inc_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]++)
#define stat_dec_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]--)
#define stat_inc_total_hit(sbi) (atomic64_inc(&(sbi)->total_hit_ext))
@@ -3149,6 +3223,17 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
if (f2fs_has_inline_dentry(inode)) \
(atomic_dec(&F2FS_I_SB(inode)->inline_dir)); \
} while (0)
+#define stat_inc_meta_count(sbi, blkaddr) \
+ do { \
+ if (blkaddr < SIT_I(sbi)->sit_base_addr) \
+ atomic_inc(&(sbi)->meta_count[META_CP]); \
+ else if (blkaddr < NM_I(sbi)->nat_blkaddr) \
+ atomic_inc(&(sbi)->meta_count[META_SIT]); \
+ else if (blkaddr < SM_I(sbi)->ssa_blkaddr) \
+ atomic_inc(&(sbi)->meta_count[META_NAT]); \
+ else if (blkaddr < SM_I(sbi)->main_blkaddr) \
+ atomic_inc(&(sbi)->meta_count[META_SSA]); \
+ } while (0)
#define stat_inc_seg_type(sbi, curseg) \
((sbi)->segment_count[(curseg)->alloc_type]++)
#define stat_inc_block_count(sbi, curseg) \
@@ -3218,6 +3303,8 @@ void f2fs_destroy_root_stats(void);
#define stat_inc_bg_cp_count(si) do { } while (0)
#define stat_inc_call_count(si) do { } while (0)
#define stat_inc_bggc_count(si) do { } while (0)
+#define stat_io_skip_bggc_count(sbi) do { } while (0)
+#define stat_other_skip_bggc_count(sbi) do { } while (0)
#define stat_inc_dirty_inode(sbi, type) do { } while (0)
#define stat_dec_dirty_inode(sbi, type) do { } while (0)
#define stat_inc_total_hit(sb) do { } while (0)
@@ -3236,6 +3323,7 @@ void f2fs_destroy_root_stats(void);
#define stat_inc_volatile_write(inode) do { } while (0)
#define stat_dec_volatile_write(inode) do { } while (0)
#define stat_update_max_volatile_write(inode) do { } while (0)
+#define stat_inc_meta_count(sbi, blkaddr) do { } while (0)
#define stat_inc_seg_type(sbi, curseg) do { } while (0)
#define stat_inc_block_count(sbi, curseg) do { } while (0)
#define stat_inc_inplace_blocks(sbi) do { } while (0)
@@ -3305,18 +3393,19 @@ void f2fs_leave_shrinker(struct f2fs_sb_info *sbi);
/*
* extent_cache.c
*/
-struct rb_entry *f2fs_lookup_rb_tree(struct rb_root *root,
+struct rb_entry *f2fs_lookup_rb_tree(struct rb_root_cached *root,
struct rb_entry *cached_re, unsigned int ofs);
struct rb_node **f2fs_lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi,
- struct rb_root *root, struct rb_node **parent,
- unsigned int ofs);
-struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root *root,
+ struct rb_root_cached *root,
+ struct rb_node **parent,
+ unsigned int ofs, bool *leftmost);
+struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root_cached *root,
struct rb_entry *cached_re, unsigned int ofs,
struct rb_entry **prev_entry, struct rb_entry **next_entry,
struct rb_node ***insert_p, struct rb_node **insert_parent,
- bool force);
+ bool force, bool *leftmost);
bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi,
- struct rb_root *root);
+ struct rb_root_cached *root);
unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink);
bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext);
void f2fs_drop_extent_tree(struct inode *inode);
@@ -3356,7 +3445,7 @@ static inline void f2fs_set_encrypted_inode(struct inode *inode)
{
#ifdef CONFIG_F2FS_FS_ENCRYPTION
file_set_encrypt(inode);
- inode->i_flags |= S_ENCRYPTED;
+ f2fs_set_inode_flags(inode);
#endif
}
@@ -3384,6 +3473,7 @@ F2FS_FEATURE_FUNCS(flexible_inline_xattr, FLEXIBLE_INLINE_XATTR);
F2FS_FEATURE_FUNCS(quota_ino, QUOTA_INO);
F2FS_FEATURE_FUNCS(inode_crtime, INODE_CRTIME);
F2FS_FEATURE_FUNCS(lost_found, LOST_FOUND);
+F2FS_FEATURE_FUNCS(sb_chksum, SB_CHKSUM);
#ifdef CONFIG_BLK_DEV_ZONED
static inline int get_blkz_type(struct f2fs_sb_info *sbi,
@@ -3399,11 +3489,20 @@ static inline int get_blkz_type(struct f2fs_sb_info *sbi,
}
#endif
-static inline bool f2fs_discard_en(struct f2fs_sb_info *sbi)
+static inline bool f2fs_hw_should_discard(struct f2fs_sb_info *sbi)
{
- struct request_queue *q = bdev_get_queue(sbi->sb->s_bdev);
+ return f2fs_sb_has_blkzoned(sbi->sb);
+}
- return blk_queue_discard(q) || f2fs_sb_has_blkzoned(sbi->sb);
+static inline bool f2fs_hw_support_discard(struct f2fs_sb_info *sbi)
+{
+ return blk_queue_discard(bdev_get_queue(sbi->sb->s_bdev));
+}
+
+static inline bool f2fs_realtime_discard_enable(struct f2fs_sb_info *sbi)
+{
+ return (test_opt(sbi, DISCARD) && f2fs_hw_support_discard(sbi)) ||
+ f2fs_hw_should_discard(sbi);
}
static inline void set_opt_mode(struct f2fs_sb_info *sbi, unsigned int mt)
@@ -3432,11 +3531,50 @@ static inline bool f2fs_may_encrypt(struct inode *inode)
#endif
}
-static inline bool f2fs_force_buffered_io(struct inode *inode, int rw)
+static inline int block_unaligned_IO(struct inode *inode,
+ struct kiocb *iocb, struct iov_iter *iter)
{
- return (f2fs_post_read_required(inode) ||
- (rw == WRITE && test_opt(F2FS_I_SB(inode), LFS)) ||
- F2FS_I_SB(inode)->s_ndevs);
+ unsigned int i_blkbits = READ_ONCE(inode->i_blkbits);
+ unsigned int blocksize_mask = (1 << i_blkbits) - 1;
+ loff_t offset = iocb->ki_pos;
+ unsigned long align = offset | iov_iter_alignment(iter);
+
+ return align & blocksize_mask;
+}
+
+static inline int allow_outplace_dio(struct inode *inode,
+ struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ int rw = iov_iter_rw(iter);
+
+ return (test_opt(sbi, LFS) && (rw == WRITE) &&
+ !block_unaligned_IO(inode, iocb, iter));
+}
+
+static inline bool f2fs_force_buffered_io(struct inode *inode,
+ struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ int rw = iov_iter_rw(iter);
+
+ if (f2fs_post_read_required(inode))
+ return true;
+ if (sbi->s_ndevs)
+ return true;
+ /*
+ * for blkzoned device, fallback direct IO to buffered IO, so
+ * all IOs can be serialized by log-structured write.
+ */
+ if (f2fs_sb_has_blkzoned(sbi->sb))
+ return true;
+ if (test_opt(sbi, LFS) && (rw == WRITE) &&
+ block_unaligned_IO(inode, iocb, iter))
+ return true;
+ if (is_sbi_flag_set(F2FS_I_SB(inode), SBI_CP_DISABLED))
+ return true;
+
+ return false;
}
#ifdef CONFIG_F2FS_FAULT_INJECTION
@@ -3447,3 +3585,16 @@ extern void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate,
#endif
#endif
+
+static inline bool is_journalled_quota(struct f2fs_sb_info *sbi)
+{
+#ifdef CONFIG_QUOTA
+ if (f2fs_sb_has_quota_ino(sbi->sb))
+ return true;
+ if (F2FS_OPTION(sbi).s_qf_names[USRQUOTA] ||
+ F2FS_OPTION(sbi).s_qf_names[GRPQUOTA] ||
+ F2FS_OPTION(sbi).s_qf_names[PRJQUOTA])
+ return true;
+#endif
+ return false;
+}
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 5474aaa274b9..88b124677189 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* fs/f2fs/file.c
*
* Copyright (c) 2012 Samsung Electronics Co., Ltd.
* http://www.samsung.com/
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/fs.h>