aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/v9fs.c10
-rw-r--r--fs/9p/v9fs.h1
-rw-r--r--fs/9p/vfs_super.c15
-rw-r--r--fs/affs/affs.h4
-rw-r--r--fs/affs/amigaffs.h144
-rw-r--r--fs/affs/dir.c2
-rw-r--r--fs/affs/file.c10
-rw-r--r--fs/affs/inode.c1
-rw-r--r--fs/affs/namei.c87
-rw-r--r--fs/afs/internal.h1
-rw-r--r--fs/afs/rxrpc.c12
-rw-r--r--fs/afs/super.c5
-rw-r--r--fs/afs/volume.c8
-rw-r--r--fs/autofs4/Kconfig2
-rw-r--r--fs/befs/linuxvfs.c15
-rw-r--r--fs/binfmt_misc.c2
-rw-r--r--fs/block_dev.c156
-rw-r--r--fs/btrfs/backref.c41
-rw-r--r--fs/btrfs/btrfs_inode.h7
-rw-r--r--fs/btrfs/compression.c18
-rw-r--r--fs/btrfs/ctree.c29
-rw-r--r--fs/btrfs/ctree.h35
-rw-r--r--fs/btrfs/delayed-inode.c46
-rw-r--r--fs/btrfs/delayed-inode.h6
-rw-r--r--fs/btrfs/delayed-ref.c8
-rw-r--r--fs/btrfs/delayed-ref.h8
-rw-r--r--fs/btrfs/dev-replace.c9
-rw-r--r--fs/btrfs/disk-io.c49
-rw-r--r--fs/btrfs/disk-io.h4
-rw-r--r--fs/btrfs/extent-tree.c35
-rw-r--r--fs/btrfs/extent_io.c59
-rw-r--r--fs/btrfs/extent_io.h8
-rw-r--r--fs/btrfs/extent_map.c10
-rw-r--r--fs/btrfs/extent_map.h3
-rw-r--r--fs/btrfs/file.c82
-rw-r--r--fs/btrfs/free-space-cache.c2
-rw-r--r--fs/btrfs/free-space-tree.c3
-rw-r--r--fs/btrfs/inode.c289
-rw-r--r--fs/btrfs/ioctl.c42
-rw-r--r--fs/btrfs/ordered-data.c20
-rw-r--r--fs/btrfs/ordered-data.h2
-rw-r--r--fs/btrfs/qgroup.c113
-rw-r--r--fs/btrfs/qgroup.h51
-rw-r--r--fs/btrfs/raid56.c38
-rw-r--r--fs/btrfs/reada.c37
-rw-r--r--fs/btrfs/root-tree.c3
-rw-r--r--fs/btrfs/scrub.c331
-rw-r--r--fs/btrfs/send.c50
-rw-r--r--fs/btrfs/super.c10
-rw-r--r--fs/btrfs/tests/btrfs-tests.c1
-rw-r--r--fs/btrfs/transaction.c48
-rw-r--r--fs/btrfs/transaction.h6
-rw-r--r--fs/btrfs/tree-log.c2
-rw-r--r--fs/btrfs/volumes.c854
-rw-r--r--fs/btrfs/volumes.h8
-rw-r--r--fs/buffer.c32
-rw-r--r--fs/ceph/addr.c16
-rw-r--r--fs/ceph/caps.c25
-rw-r--r--fs/ceph/debugfs.c23
-rw-r--r--fs/ceph/dir.c23
-rw-r--r--fs/ceph/file.c77
-rw-r--r--fs/ceph/inode.c39
-rw-r--r--fs/ceph/mds_client.c79
-rw-r--r--fs/ceph/mds_client.h15
-rw-r--r--fs/ceph/mdsmap.c44
-rw-r--r--fs/ceph/snap.c2
-rw-r--r--fs/ceph/super.c42
-rw-r--r--fs/ceph/super.h33
-rw-r--r--fs/ceph/xattr.c3
-rw-r--r--fs/char_dev.c86
-rw-r--r--fs/cifs/cifs_fs_sb.h1
-rw-r--r--fs/cifs/cifs_unicode.c6
-rw-r--r--fs/cifs/cifs_unicode.h5
-rw-r--r--fs/cifs/cifsencrypt.c4
-rw-r--r--fs/cifs/cifsfs.c23
-rw-r--r--fs/cifs/cifsglob.h20
-rw-r--r--fs/cifs/cifsproto.h3
-rw-r--r--fs/cifs/cifssmb.c13
-rw-r--r--fs/cifs/connect.c34
-rw-r--r--fs/cifs/file.c357
-rw-r--r--fs/cifs/inode.c28
-rw-r--r--fs/cifs/ioctl.c7
-rw-r--r--fs/cifs/misc.c136
-rw-r--r--fs/cifs/netmisc.c6
-rw-r--r--fs/cifs/smb1ops.c10
-rw-r--r--fs/cifs/smb2misc.c5
-rw-r--r--fs/cifs/smb2ops.c1
-rw-r--r--fs/cifs/smb2pdu.c15
-rw-r--r--fs/cifs/smb2transport.c30
-rw-r--r--fs/cifs/transport.c32
-rw-r--r--fs/coda/inode.c11
-rw-r--r--fs/compat.c1191
-rw-r--r--fs/compat_ioctl.c2
-rw-r--r--fs/crypto/fname.c90
-rw-r--r--fs/crypto/fscrypt_private.h13
-rw-r--r--fs/crypto/keyinfo.c3
-rw-r--r--fs/crypto/policy.c98
-rw-r--r--fs/dax.c337
-rw-r--r--fs/dcache.c4
-rw-r--r--fs/debugfs/inode.c2
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h1
-rw-r--r--fs/ecryptfs/main.c4
-rw-r--r--fs/eventpoll.c93
-rw-r--r--fs/exec.c1
-rw-r--r--fs/exofs/dir.c3
-rw-r--r--fs/exofs/exofs.h1
-rw-r--r--fs/exofs/super.c17
-rw-r--r--fs/ext2/ext2.h4
-rw-r--r--fs/ext2/inode.c31
-rw-r--r--fs/ext2/ioctl.c1
-rw-r--r--fs/ext2/super.c87
-rw-r--r--fs/ext4/Makefile10
-rw-r--r--fs/ext4/ext4.h14
-rw-r--r--fs/ext4/fsmap.c722
-rw-r--r--fs/ext4/fsmap.h69
-rw-r--r--fs/ext4/ialloc.c17
-rw-r--r--fs/ext4/inline.c2
-rw-r--r--fs/ext4/inode.c75
-rw-r--r--fs/ext4/ioctl.c94
-rw-r--r--fs/ext4/mballoc.c53
-rw-r--r--fs/ext4/mballoc.h17
-rw-r--r--fs/ext4/namei.c129
-rw-r--r--fs/ext4/page-io.c11
-rw-r--r--fs/ext4/super.c92
-rw-r--r--fs/ext4/sysfs.c8
-rw-r--r--fs/ext4/xattr.c63
-rw-r--r--fs/f2fs/checkpoint.c77
-rw-r--r--fs/f2fs/data.c200
-rw-r--r--fs/f2fs/debug.c62
-rw-r--r--fs/f2fs/dir.c71
-rw-r--r--fs/f2fs/extent_cache.c326
-rw-r--r--fs/f2fs/f2fs.h343
-rw-r--r--fs/f2fs/file.c177
-rw-r--r--fs/f2fs/gc.c93
-rw-r--r--fs/f2fs/hash.c7
-rw-r--r--fs/f2fs/inline.c38
-rw-r--r--fs/f2fs/inode.c23
-rw-r--r--fs/f2fs/namei.c60
-rw-r--r--fs/f2fs/node.c150
-rw-r--r--fs/f2fs/node.h31
-rw-r--r--fs/f2fs/recovery.c8
-rw-r--r--fs/f2fs/segment.c792
-rw-r--r--fs/f2fs/segment.h144
-rw-r--r--fs/f2fs/super.c48
-rw-r--r--fs/f2fs/trace.c4
-rw-r--r--fs/f2fs/xattr.c31
-rw-r--r--fs/f2fs/xattr.h8
-rw-r--r--fs/fcntl.c171
-rw-r--r--fs/fhandle.c13
-rw-r--r--fs/file.c2
-rw-r--r--fs/fuse/control.c2
-rw-r--r--fs/fuse/dev.c37
-rw-r--r--fs/fuse/file.c30
-rw-r--r--fs/fuse/fuse_i.h17
-rw-r--r--fs/fuse/inode.c51
-rw-r--r--fs/gfs2/bmap.c741
-rw-r--r--fs/gfs2/file.c6
-rw-r--r--fs/gfs2/glock.c81
-rw-r--r--fs/gfs2/incore.h8
-rw-r--r--fs/gfs2/inode.c4
-rw-r--r--fs/gfs2/ops_fstype.c8
-rw-r--r--fs/gfs2/rgrp.c7
-rw-r--r--fs/gfs2/rgrp.h7
-rw-r--r--fs/gfs2/super.c11
-rw-r--r--fs/hfs/extent.c4
-rw-r--r--fs/hfsplus/extents.c5
-rw-r--r--fs/inode.c8
-rw-r--r--fs/internal.h4
-rw-r--r--fs/iomap.c37
-rw-r--r--fs/jbd2/journal.c39
-rw-r--r--fs/jbd2/transaction.c12
-rw-r--r--fs/jfs/ioctl.c2
-rw-r--r--fs/jfs/jfs_imap.c1
-rw-r--r--fs/jfs/jfs_inode.c18
-rw-r--r--fs/jfs/jfs_inode.h1
-rw-r--r--fs/jfs/super.c79
-rw-r--r--fs/libfs.c2
-rw-r--r--fs/mount.h2
-rw-r--r--fs/namei.c22
-rw-r--r--fs/namespace.c3
-rw-r--r--fs/ncpfs/inode.c8
-rw-r--r--fs/ncpfs/ncp_fs_sb.h1
-rw-r--r--fs/nfs/client.c10
-rw-r--r--fs/nfs/direct.c27
-rw-r--r--fs/nfs/super.c11
-rw-r--r--fs/nfs/write.c13
-rw-r--r--fs/nfsd/blocklayout.c7
-rw-r--r--fs/nfsd/nfs3xdr.c13
-rw-r--r--fs/nfsd/nfs4proc.c2
-rw-r--r--fs/nfsd/nfsctl.c2
-rw-r--r--fs/nfsd/nfssvc.c36
-rw-r--r--fs/nfsd/nfsxdr.c10
-rw-r--r--fs/nfsd/vfs.c2
-rw-r--r--fs/nilfs2/super.c2
-rw-r--r--fs/notify/Makefile4
-rw-r--r--fs/notify/dnotify/dnotify.c25
-rw-r--r--fs/notify/fanotify/fanotify.c26
-rw-r--r--fs/notify/fanotify/fanotify.h1
-rw-r--r--fs/notify/fanotify/fanotify_user.c77
-rw-r--r--fs/notify/fdinfo.c16
-rw-r--r--fs/notify/fsnotify.c107
-rw-r--r--fs/notify/fsnotify.h48
-rw-r--r--fs/notify/group.c20
-rw-r--r--fs/notify/inode_mark.c199
-rw-r--r--fs/notify/inotify/inotify.h4
-rw-r--r--fs/notify/inotify/inotify_fsnotify.c18
-rw-r--r--fs/notify/inotify/inotify_user.c81
-rw-r--r--fs/notify/mark.c642
-rw-r--r--fs/notify/vfsmount_mark.c108
-rw-r--r--fs/nsfs.c5
-rw-r--r--fs/ocfs2/cluster/heartbeat.c8
-rw-r--r--fs/ocfs2/cluster/tcp.c32
-rw-r--r--fs/open.c29
-rw-r--r--fs/orangefs/devorangefs-req.c4
-rw-r--r--fs/orangefs/dir.c640
-rw-r--r--fs/orangefs/downcall.h16
-rw-r--r--fs/orangefs/file.c9
-rw-r--r--fs/orangefs/inode.c19
-rw-r--r--fs/orangefs/namei.c5
-rw-r--r--fs/orangefs/orangefs-bufmap.c4
-rw-r--r--fs/orangefs/orangefs-debugfs.c3
-rw-r--r--fs/orangefs/orangefs-dev-proto.h7
-rw-r--r--fs/orangefs/orangefs-kernel.h9
-rw-r--r--fs/orangefs/orangefs-utils.c98
-rw-r--r--fs/orangefs/protocol.h9
-rw-r--r--fs/orangefs/super.c28
-rw-r--r--fs/orangefs/waitqueue.c9
-rw-r--r--fs/orangefs/xattr.c26
-rw-r--r--fs/overlayfs/copy_up.c82
-rw-r--r--fs/overlayfs/dir.c37
-rw-r--r--fs/overlayfs/inode.c103
-rw-r--r--fs/overlayfs/namei.c141
-rw-r--r--fs/overlayfs/overlayfs.h41
-rw-r--r--fs/overlayfs/ovl_entry.h2
-rw-r--r--fs/overlayfs/super.c40
-rw-r--r--fs/overlayfs/util.c19
-rw-r--r--fs/proc/base.c20
-rw-r--r--fs/proc/generic.c1
-rw-r--r--fs/proc/inode.c2
-rw-r--r--fs/proc/namespaces.c1
-rw-r--r--fs/proc/proc_sysctl.c4
-rw-r--r--fs/proc/task_mmu.c8
-rw-r--r--fs/pstore/ftrace.c11
-rw-r--r--fs/pstore/inode.c147
-rw-r--r--fs/pstore/internal.h8
-rw-r--r--fs/pstore/platform.c301
-rw-r--r--fs/pstore/pmsg.c12
-rw-r--r--fs/pstore/ram.c130
-rw-r--r--fs/pstore/ram_core.c2
-rw-r--r--fs/quota/dquot.c31
-rw-r--r--fs/read_write.c75
-rw-r--r--fs/readdir.c165
-rw-r--r--fs/reiserfs/inode.c31
-rw-r--r--fs/reiserfs/ioctl.c1
-rw-r--r--fs/reiserfs/item_ops.c24
-rw-r--r--fs/reiserfs/journal.c2
-rw-r--r--fs/reiserfs/lbalance.c2
-rw-r--r--fs/reiserfs/reiserfs.h3
-rw-r--r--fs/reiserfs/super.c92
-rw-r--r--fs/select.c442
-rw-r--r--fs/seq_file.c16
-rw-r--r--fs/signalfd.c2
-rw-r--r--fs/splice.c20
-rw-r--r--fs/stat.c97
-rw-r--r--fs/statfs.c140
-rw-r--r--fs/super.c53
-rw-r--r--fs/tracefs/inode.c2
-rw-r--r--fs/ubifs/debug.c10
-rw-r--r--fs/ubifs/dir.c41
-rw-r--r--fs/ubifs/file.c12
-rw-r--r--fs/ubifs/ioctl.c2
-rw-r--r--fs/ubifs/misc.h10
-rw-r--r--fs/ubifs/sb.c14
-rw-r--r--fs/ubifs/super.c25
-rw-r--r--fs/ubifs/ubifs.h3
-rw-r--r--fs/ubifs/xattr.c6
-rw-r--r--fs/udf/file.c10
-rw-r--r--fs/udf/inode.c22
-rw-r--r--fs/udf/namei.c2
-rw-r--r--fs/ufs/ialloc.c6
-rw-r--r--fs/utimes.c66
-rw-r--r--fs/xattr.c27
-rw-r--r--fs/xfs/Makefile1
-rw-r--r--fs/xfs/kmem.c14
-rw-r--r--fs/xfs/kmem.h2
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c57
-rw-r--r--fs/xfs/libxfs/xfs_alloc.h12
-rw-r--r--fs/xfs/libxfs/xfs_alloc_btree.c172
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c354
-rw-r--r--fs/xfs/libxfs/xfs_bmap.h14
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.c43
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.h22
-rw-r--r--fs/xfs/libxfs/xfs_btree.c17
-rw-r--r--fs/xfs/libxfs/xfs_btree.h2
-rw-r--r--fs/xfs/libxfs/xfs_dquot_buf.c7
-rw-r--r--fs/xfs/libxfs/xfs_format.h11
-rw-r--r--fs/xfs/libxfs/xfs_fs.h13
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c2
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.c90
-rw-r--r--fs/xfs/libxfs/xfs_rmap.c56
-rw-r--r--fs/xfs/libxfs/xfs_rmap.h4
-rw-r--r--fs/xfs/libxfs/xfs_rtbitmap.c70
-rw-r--r--fs/xfs/libxfs/xfs_trans_space.h23
-rw-r--r--fs/xfs/xfs_aops.c18
-rw-r--r--fs/xfs/xfs_bmap_item.c6
-rw-r--r--fs/xfs/xfs_bmap_util.c22
-rw-r--r--fs/xfs/xfs_buf.c32
-rw-r--r--fs/xfs/xfs_buf.h2
-rw-r--r--fs/xfs/xfs_dir2_readdir.c15
-rw-r--r--fs/xfs/xfs_discard.c10
-rw-r--r--fs/xfs/xfs_extfree_item.c1
-rw-r--r--fs/xfs/xfs_fsmap.c940
-rw-r--r--fs/xfs/xfs_fsmap.h53
-rw-r--r--fs/xfs/xfs_icache.c58
-rw-r--r--fs/xfs/xfs_icache.h8
-rw-r--r--fs/xfs/xfs_inode.c9
-rw-r--r--fs/xfs/xfs_inode.h4
-rw-r--r--fs/xfs/xfs_inode_item.c29
-rw-r--r--fs/xfs/xfs_ioctl.c89
-rw-r--r--fs/xfs/xfs_ioctl32.c2
-rw-r--r--fs/xfs/xfs_iomap.c18
-rw-r--r--fs/xfs/xfs_linux.h85
-rw-r--r--fs/xfs/xfs_log.c4
-rw-r--r--fs/xfs/xfs_log_recover.c2
-rw-r--r--fs/xfs/xfs_mount.c4
-rw-r--r--fs/xfs/xfs_mount.h5
-rw-r--r--fs/xfs/xfs_qm.c11
-rw-r--r--fs/xfs/xfs_qm_syscalls.c3
-rw-r--r--fs/xfs/xfs_refcount_item.c1
-rw-r--r--fs/xfs/xfs_reflink.c39
-rw-r--r--fs/xfs/xfs_rmap_item.c1
-rw-r--r--fs/xfs/xfs_rtalloc.h22
-rw-r--r--fs/xfs/xfs_super.c8
-rw-r--r--fs/xfs/xfs_trace.c1
-rw-r--r--fs/xfs/xfs_trace.h144
-rw-r--r--fs/xfs/xfs_trans.c51
-rw-r--r--fs/xfs/xfs_trans.h3
-rw-r--r--fs/xfs/xfs_trans_ail.c71
-rw-r--r--fs/xfs/xfs_trans_priv.h15
339 files changed, 12413 insertions, 7020 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index a89f3cfe3c7d..c202930086ed 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -333,10 +333,6 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
goto err_names;
init_rwsem(&v9ses->rename_sem);
- rc = bdi_setup_and_register(&v9ses->bdi, "9p");
- if (rc)
- goto err_names;
-
v9ses->uid = INVALID_UID;
v9ses->dfltuid = V9FS_DEFUID;
v9ses->dfltgid = V9FS_DEFGID;
@@ -345,7 +341,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
if (IS_ERR(v9ses->clnt)) {
rc = PTR_ERR(v9ses->clnt);
p9_debug(P9_DEBUG_ERROR, "problem initializing 9p client\n");
- goto err_bdi;
+ goto err_names;
}
v9ses->flags = V9FS_ACCESS_USER;
@@ -415,8 +411,6 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
err_clnt:
p9_client_destroy(v9ses->clnt);
-err_bdi:
- bdi_destroy(&v9ses->bdi);
err_names:
kfree(v9ses->uname);
kfree(v9ses->aname);
@@ -445,8 +439,6 @@ void v9fs_session_close(struct v9fs_session_info *v9ses)
kfree(v9ses->uname);
kfree(v9ses->aname);
- bdi_destroy(&v9ses->bdi);
-
spin_lock(&v9fs_sessionlist_lock);
list_del(&v9ses->slist);
spin_unlock(&v9fs_sessionlist_lock);
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 443d12e02043..76eaf49abd3a 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -114,7 +114,6 @@ struct v9fs_session_info {
kuid_t uid; /* if ACCESS_SINGLE, the uid that has access */
struct p9_client *clnt; /* 9p client */
struct list_head slist; /* list of sessions registered with v9fs */
- struct backing_dev_info bdi;
struct rw_semaphore rename_sem;
};
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index de3ed8629196..a0965fb587a5 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -72,10 +72,12 @@ static int v9fs_set_super(struct super_block *s, void *data)
*
*/
-static void
+static int
v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
int flags, void *data)
{
+ int ret;
+
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_blocksize_bits = fls(v9ses->maxdata - 1);
sb->s_blocksize = 1 << sb->s_blocksize_bits;
@@ -85,7 +87,11 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
sb->s_xattr = v9fs_xattr_handlers;
} else
sb->s_op = &v9fs_super_ops;
- sb->s_bdi = &v9ses->bdi;
+
+ ret = super_setup_bdi(sb);
+ if (ret)
+ return ret;
+
if (v9ses->cache)
sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024)/PAGE_SIZE;
@@ -99,6 +105,7 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
#endif
save_mount_options(sb, data);
+ return 0;
}
/**
@@ -138,7 +145,9 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
retval = PTR_ERR(sb);
goto clunk_fid;
}
- v9fs_fill_super(sb, v9ses, flags, data);
+ retval = v9fs_fill_super(sb, v9ses, flags, data);
+ if (retval)
+ goto release_sb;
if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
sb->s_d_op = &v9fs_cached_dentry_operations;
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index 2f8bab390d13..773749be8290 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -7,7 +7,7 @@
#include <linux/types.h>
#include <linux/fs.h>
#include <linux/buffer_head.h>
-#include <linux/amigaffs.h>
+#include "amigaffs.h"
#include <linux/mutex.h>
#include <linux/workqueue.h>
@@ -173,7 +173,7 @@ extern int affs_link(struct dentry *olddentry, struct inode *dir,
struct dentry *dentry);
extern int affs_symlink(struct inode *dir, struct dentry *dentry,
const char *symname);
-extern int affs_rename(struct inode *old_dir, struct dentry *old_dentry,
+extern int affs_rename2(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry,
unsigned int flags);
diff --git a/fs/affs/amigaffs.h b/fs/affs/amigaffs.h
new file mode 100644
index 000000000000..43b41c06aa37
--- /dev/null
+++ b/fs/affs/amigaffs.h
@@ -0,0 +1,144 @@
+#ifndef AMIGAFFS_H
+#define AMIGAFFS_H
+
+#include <linux/types.h>
+#include <asm/byteorder.h>
+
+#define FS_OFS 0x444F5300
+#define FS_FFS 0x444F5301
+#define FS_INTLOFS 0x444F5302
+#define FS_INTLFFS 0x444F5303
+#define FS_DCOFS 0x444F5304
+#define FS_DCFFS 0x444F5305
+#define MUFS_FS 0x6d754653 /* 'muFS' */
+#define MUFS_OFS 0x6d754600 /* 'muF\0' */
+#define MUFS_FFS 0x6d754601 /* 'muF\1' */
+#define MUFS_INTLOFS 0x6d754602 /* 'muF\2' */
+#define MUFS_INTLFFS 0x6d754603 /* 'muF\3' */
+#define MUFS_DCOFS 0x6d754604 /* 'muF\4' */
+#define MUFS_DCFFS 0x6d754605 /* 'muF\5' */
+
+#define T_SHORT 2
+#define T_LIST 16
+#define T_DATA 8
+
+#define ST_LINKFILE -4
+#define ST_FILE -3
+#define ST_ROOT 1
+#define ST_USERDIR 2
+#define ST_SOFTLINK 3
+#define ST_LINKDIR 4
+
+#define AFFS_ROOT_BMAPS 25
+
+struct affs_date {
+ __be32 days;
+ __be32 mins;
+ __be32 ticks;
+};
+
+struct affs_short_date {
+ __be16 days;
+ __be16 mins;
+ __be16 ticks;
+};
+
+struct affs_root_head {
+ __be32 ptype;
+ __be32 spare1;
+ __be32 spare2;
+ __be32 hash_size;
+ __be32 spare3;
+ __be32 checksum;
+ __be32 hashtable[1];
+};
+
+struct affs_root_tail {
+ __be32 bm_flag;
+ __be32 bm_blk[AFFS_ROOT_BMAPS];
+ __be32 bm_ext;
+ struct affs_date root_change;
+ u8 disk_name[32];
+ __be32 spare1;
+ __be32 spare2;
+ struct affs_date disk_change;
+ struct affs_date disk_create;
+ __be32 spare3;
+ __be32 spare4;
+ __be32 dcache;
+ __be32 stype;
+};
+
+struct affs_head {
+ __be32 ptype;
+ __be32 key;
+ __be32 block_count;
+ __be32 spare1;
+ __be32 first_data;
+ __be32 checksum;
+ __be32 table[1];
+};
+
+struct affs_tail {
+ __be32 spare1;
+ __be16 uid;
+ __be16 gid;
+ __be32 protect;
+ __be32 size;
+ u8 comment[92];
+ struct affs_date change;
+ u8 name[32];
+ __be32 spare2;
+ __be32 original;
+ __be32 link_chain;
+ __be32 spare[5];
+ __be32 hash_chain;
+ __be32 parent;
+ __be32 extension;
+ __be32 stype;
+};
+
+struct slink_front
+{
+ __be32 ptype;
+ __be32 key;
+ __be32 spare1[3];
+ __be32 checksum;
+ u8 symname[1]; /* depends on block size */
+};
+
+struct affs_data_head
+{
+ __be32 ptype;
+ __be32 key;
+ __be32 sequence;
+ __be32 size;
+ __be32 next;
+ __be32 checksum;
+ u8 data[1]; /* depends on block size */
+};
+
+/* Permission bits */
+
+#define FIBF_OTR_READ 0x8000
+#define FIBF_OTR_WRITE 0x4000
+#define FIBF_OTR_EXECUTE 0x2000
+#define FIBF_OTR_DELETE 0x1000
+#define FIBF_GRP_READ 0x0800
+#define FIBF_GRP_WRITE 0x0400
+#define FIBF_GRP_EXECUTE 0x0200
+#define FIBF_GRP_DELETE 0x0100
+
+#define FIBF_HIDDEN 0x0080
+#define FIBF_SCRIPT 0x0040
+#define FIBF_PURE 0x0020 /* no use under linux */
+#define FIBF_ARCHIVED 0x0010 /* never set, always cleared on write */
+#define FIBF_NOREAD 0x0008 /* 0 means allowed */
+#define FIBF_NOWRITE 0x0004 /* 0 means allowed */
+#define FIBF_NOEXECUTE 0x0002 /* 0 means allowed, ignored under linux */
+#define FIBF_NODELETE 0x0001 /* 0 means allowed */
+
+#define FIBF_OWNER 0x000F /* Bits pertaining to owner */
+#define FIBF_MASK 0xEE0E /* Bits modified by Linux */
+
+#endif
diff --git a/fs/affs/dir.c b/fs/affs/dir.c
index f1e7294381c5..591ecd7f3063 100644
--- a/fs/affs/dir.c
+++ b/fs/affs/dir.c
@@ -35,7 +35,7 @@ const struct inode_operations affs_dir_inode_operations = {
.symlink = affs_symlink,
.mkdir = affs_mkdir,
.rmdir = affs_rmdir,
- .rename = affs_rename,
+ .rename = affs_rename2,
.setattr = affs_notify_change,
};
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 0deec9cc2362..196ee7f6fdc4 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -499,7 +499,7 @@ affs_getemptyblk_ino(struct inode *inode, int block)
}
static int
-affs_do_readpage_ofs(struct page *page, unsigned to)
+affs_do_readpage_ofs(struct page *page, unsigned to, int create)
{
struct inode *inode = page->mapping->host;
struct super_block *sb = inode->i_sb;
@@ -518,7 +518,7 @@ affs_do_readpage_ofs(struct page *page, unsigned to)
boff = tmp % bsize;
while (pos < to) {
- bh = affs_bread_ino(inode, bidx, 0);
+ bh = affs_bread_ino(inode, bidx, create);
if (IS_ERR(bh))
return PTR_ERR(bh);
tmp = min(bsize - boff, to - pos);
@@ -620,7 +620,7 @@ affs_readpage_ofs(struct file *file, struct page *page)
memset(page_address(page) + to, 0, PAGE_SIZE - to);
}
- err = affs_do_readpage_ofs(page, to);
+ err = affs_do_readpage_ofs(page, to, 0);
if (!err)
SetPageUptodate(page);
unlock_page(page);
@@ -657,7 +657,7 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping
return 0;
/* XXX: inefficient but safe in the face of short writes */
- err = affs_do_readpage_ofs(page, PAGE_SIZE);
+ err = affs_do_readpage_ofs(page, PAGE_SIZE, 1);
if (err) {
unlock_page(page);
put_page(page);
@@ -679,7 +679,7 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
int written;
from = pos & (PAGE_SIZE - 1);
- to = pos + len;
+ to = from + len;
/*
* XXX: not sure if this can handle short copies (len < copied), but
* we don't have to, because the page should always be uptodate here,
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index abcc59899229..fd4ef3c40e40 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -140,6 +140,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino)
inode->i_fop = &affs_file_operations;
break;
case ST_SOFTLINK:
+ inode->i_size = strlen((char *)AFFS_HEAD(bh)->table);
inode->i_mode |= S_IFLNK;
inode_nohighmem(inode);
inode->i_op = &affs_symlink_inode_operations;
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index 96dd1d09a273..46d3ace6761d 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -365,6 +365,7 @@ affs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
symname++;
}
*p = 0;
+ inode->i_size = i + 1;
mark_buffer_dirty_inode(bh, inode);
affs_brelse(bh);
mark_inode_dirty(inode);
@@ -393,21 +394,14 @@ affs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
return affs_add_entry(dir, inode, dentry, ST_LINKFILE);
}
-int
+static int
affs_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry,
- unsigned int flags)
+ struct inode *new_dir, struct dentry *new_dentry)
{
struct super_block *sb = old_dir->i_sb;
struct buffer_head *bh = NULL;
int retval;
- if (flags & ~RENAME_NOREPLACE)
- return -EINVAL;
-
- pr_debug("%s(old=%lu,\"%pd\" to new=%lu,\"%pd\")\n", __func__,
- old_dir->i_ino, old_dentry, new_dir->i_ino, new_dentry);
-
retval = affs_check_name(new_dentry->d_name.name,
new_dentry->d_name.len,
affs_nofilenametruncate(old_dentry));
@@ -447,6 +441,76 @@ done:
return retval;
}
+static int
+affs_xrename(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry)
+{
+
+ struct super_block *sb = old_dir->i_sb;
+ struct buffer_head *bh_old = NULL;
+ struct buffer_head *bh_new = NULL;
+ int retval;
+
+ bh_old = affs_bread(sb, d_inode(old_dentry)->i_ino);
+ if (!bh_old)
+ return -EIO;
+
+ bh_new = affs_bread(sb, d_inode(new_dentry)->i_ino);
+ if (!bh_new)
+ return -EIO;
+
+ /* Remove old header from its parent directory. */
+ affs_lock_dir(old_dir);
+ retval = affs_remove_hash(old_dir, bh_old);
+ affs_unlock_dir(old_dir);
+ if (retval)
+ goto done;
+
+ /* Remove new header from its parent directory. */
+ affs_lock_dir(new_dir);
+ retval = affs_remove_hash(new_dir, bh_new);
+ affs_unlock_dir(new_dir);
+ if (retval)
+ goto done;
+
+ /* Insert old into the new directory with the new name. */
+ affs_copy_name(AFFS_TAIL(sb, bh_old)->name, new_dentry);
+ affs_fix_checksum(sb, bh_old);
+ affs_lock_dir(new_dir);
+ retval = affs_insert_hash(new_dir, bh_old);
+ affs_unlock_dir(new_dir);
+
+ /* Insert new into the old directory with the old name. */
+ affs_copy_name(AFFS_TAIL(sb, bh_new)->name, old_dentry);
+ affs_fix_checksum(sb, bh_new);
+ affs_lock_dir(old_dir);
+ retval = affs_insert_hash(old_dir, bh_new);
+ affs_unlock_dir(old_dir);
+done:
+ mark_buffer_dirty_inode(bh_old, new_dir);
+ mark_buffer_dirty_inode(bh_new, old_dir);
+ affs_brelse(bh_old);
+ affs_brelse(bh_new);
+ return retval;
+}
+
+int affs_rename2(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
+{
+
+ if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
+ return -EINVAL;
+
+ pr_debug("%s(old=%lu,\"%pd\" to new=%lu,\"%pd\")\n", __func__,
+ old_dir->i_ino, old_dentry, new_dir->i_ino, new_dentry);
+
+ if (flags & RENAME_EXCHANGE)
+ return affs_xrename(old_dir, old_dentry, new_dir, new_dentry);
+
+ return affs_rename(old_dir, old_dentry, new_dir, new_dentry);
+}
+
static struct dentry *affs_get_parent(struct dentry *child)
{
struct inode *parent;
@@ -477,11 +541,6 @@ static struct inode *affs_nfs_get_inode(struct super_block *sb, u64 ino,
if (IS_ERR(inode))
return ERR_CAST(inode);
- if (generation && inode->i_generation != generation) {
- iput(inode);
- return ERR_PTR(-ESTALE);
- }
-
return inode;
}
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index a6901360fb81..393672997cc2 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -318,7 +318,6 @@ struct afs_volume {
unsigned short rjservers; /* number of servers discarded due to -ENOMEDIUM */
struct afs_server *servers[8]; /* servers on which volume resides (ordered) */
struct rw_semaphore server_sem; /* lock for accessing current server */
- struct backing_dev_info bdi;
};
/*
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 8f76b13d5549..d5990eb160bd 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -419,7 +419,7 @@ error_do_abort:
call->state = AFS_CALL_COMPLETE;
if (ret != -ECONNABORTED) {
rxrpc_kernel_abort_call(afs_socket, rxcall, RX_USER_ABORT,
- -ret, "KSD");
+ ret, "KSD");
} else {
abort_code = 0;
offset = 0;
@@ -478,12 +478,12 @@ static void afs_deliver_to_call(struct afs_call *call)
case -ENOTCONN:
abort_code = RX_CALL_DEAD;
rxrpc_kernel_abort_call(afs_socket, call->rxcall,
- abort_code, -ret, "KNC");
+ abort_code, ret, "KNC");
goto save_error;
case -ENOTSUPP:
abort_code = RXGEN_OPCODE;
rxrpc_kernel_abort_call(afs_socket, call->rxcall,
- abort_code, -ret, "KIV");
+ abort_code, ret, "KIV");
goto save_error;
case -ENODATA:
case -EBADMSG:
@@ -493,7 +493,7 @@ static void afs_deliver_to_call(struct afs_call *call)
if (call->state != AFS_CALL_AWAIT_REPLY)
abort_code = RXGEN_SS_UNMARSHAL;
rxrpc_kernel_abort_call(afs_socket, call->rxcall,
- abort_code, EBADMSG, "KUM");
+ abort_code, -EBADMSG, "KUM");
goto save_error;
}
}
@@ -754,7 +754,7 @@ void afs_send_empty_reply(struct afs_call *call)
case -ENOMEM:
_debug("oom");
rxrpc_kernel_abort_call(afs_socket, call->rxcall,
- RX_USER_ABORT, ENOMEM, "KOO");
+ RX_USER_ABORT, -ENOMEM, "KOO");
default:
_leave(" [error]");
return;
@@ -792,7 +792,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
if (n == -ENOMEM) {
_debug("oom");
rxrpc_kernel_abort_call(afs_socket, call->rxcall,
- RX_USER_ABORT, ENOMEM, "KOO");
+ RX_USER_ABORT, -ENOMEM, "KOO");
}
_leave(" [error]");
}
diff --git a/fs/afs/super.c b/fs/afs/super.c
index fbdb022b75a2..c79633e5cfd8 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -319,7 +319,10 @@ static int afs_fill_super(struct super_block *sb,
sb->s_blocksize_bits = PAGE_SHIFT;
sb->s_magic = AFS_FS_MAGIC;
sb->s_op = &afs_super_ops;
- sb->s_bdi = &as->volume->bdi;
+ ret = super_setup_bdi(sb);
+ if (ret)
+ return ret;
+ sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
strlcpy(sb->s_id, as->volume->vlocation->vldb.name, sizeof(sb->s_id));
/* allocate the root inode and dentry */
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index 546f9d01710b..db73d6dad02b 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -106,11 +106,6 @@ struct afs_volume *afs_volume_lookup(struct afs_mount_params *params)
volume->cell = params->cell;
volume->vid = vlocation->vldb.vid[params->type];
- volume->bdi.ra_pages = VM_MAX_READAHEAD*1024/PAGE_SIZE;
- ret = bdi_setup_and_register(&volume->bdi, "afs");
- if (ret)
- goto error_bdi;
-
init_rwsem(&volume->server_sem);
/* look up all the applicable server records */
@@ -156,8 +151,6 @@ error:
return ERR_PTR(ret);
error_discard:
- bdi_destroy(&volume->bdi);
-error_bdi:
up_write(&params->cell->vl_sem);
for (loop = volume->nservers - 1; loop >= 0; loop--)
@@ -207,7 +200,6 @@ void afs_put_volume(struct afs_volume *volume)
for (loop = volume->nservers - 1; loop >= 0; loop--)
afs_put_server(volume->servers[loop]);
- bdi_destroy(&volume->bdi);
kfree(volume);
_leave(" [destroyed]");
diff --git a/fs/autofs4/Kconfig b/fs/autofs4/Kconfig
index 1204d6384d39..44727bf18297 100644
--- a/fs/autofs4/Kconfig
+++ b/fs/autofs4/Kconfig
@@ -7,7 +7,7 @@ config AUTOFS4_FS
automounter (amd), which is a pure user space daemon.
To use the automounter you need the user-space tools from
- <ftp://ftp.kernel.org/pub/linux/daemons/autofs/v4/>; you also
+ <https://www.kernel.org/pub/linux/daemons/autofs/v4/>; you also
want to answer Y to "NFS file system support", below.
To compile this support as a module, choose M here: the module will be
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index c500e954debb..63e7c4760bfb 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -58,6 +58,7 @@ static struct dentry *befs_fh_to_dentry(struct super_block *sb,
struct fid *fid, int fh_len, int fh_type);
static struct dentry *befs_fh_to_parent(struct super_block *sb,
struct fid *fid, int fh_len, int fh_type);
+static struct dentry *befs_get_parent(struct dentry *child);
static const struct super_operations befs_sops = {
.alloc_inode = befs_alloc_inode, /* allocate a new inode */
@@ -93,6 +94,7 @@ static const struct address_space_operations befs_symlink_aops = {
static const struct export_operations befs_export_operations = {
.fh_to_dentry = befs_fh_to_dentry,
.fh_to_parent = befs_fh_to_parent,
+ .get_parent = befs_get_parent,
};
/*
@@ -667,6 +669,19 @@ static struct dentry *befs_fh_to_parent(struct super_block *sb,
befs_nfs_get_inode);
}
+static struct dentry *befs_get_parent(struct dentry *child)
+{
+ struct inode *parent;
+ struct befs_inode_info *befs_ino = BEFS_I(d_inode(child));
+
+ parent = befs_iget(child->d_sb,
+ (unsigned long)befs_ino->i_parent.start);
+ if (IS_ERR(parent))
+ return ERR_CAST(parent);
+
+ return d_obtain_alias(parent);
+}
+
enum {
Opt_uid, Opt_gid, Opt_charset, Opt_debug, Opt_err,
};
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index bee1a36bc2ec..f4718098ac31 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -818,7 +818,7 @@ static const struct super_operations s_ops = {
static int bm_fill_super(struct super_block *sb, void *data, int silent)
{
int err;
- static struct tree_descr bm_files[] = {
+ static const struct tree_descr bm_files[] = {
[2] = {"status", &bm_status_operations, S_IWUSR|S_IRUGO},
[3] = {"register", &bm_register_operations, S_IWUSR},
/* last one */ {""}
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 2eca00ec4370..2a305c1a2d88 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -18,6 +18,7 @@
#include <linux/module.h>
#include <linux/blkpg.h>
#include <linux/magic.h>
+#include <linux/dax.h>
#include <linux/buffer_head.h>
#include <linux/swap.h>
#include <linux/pagevec.h>
@@ -103,12 +104,11 @@ void invalidate_bdev(struct block_device *bdev)
{
struct address_space *mapping = bdev->bd_inode->i_mapping;
- if (mapping->nrpages == 0)
- return;
-
- invalidate_bh_lrus();
- lru_add_drain_all(); /* make sure all lru add caches are flushed */
- invalidate_mapping_pages(mapping, 0, -1);
+ if (mapping->nrpages) {
+ invalidate_bh_lrus();
+ lru_add_drain_all(); /* make sure all lru add caches are flushed */
+ invalidate_mapping_pages(mapping, 0, -1);
+ }
/* 99% of the time, we don't need to flush the cleancache on the bdev.
* But, for the strange corners, lets be cautious
*/
@@ -717,50 +717,18 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
}
EXPORT_SYMBOL_GPL(bdev_write_page);
-/**
- * bdev_direct_access() - Get the address for directly-accessibly memory
- * @bdev: The device containing the memory
- * @dax: control and output parameters for ->direct_access
- *
- * If a block device is made up of directly addressable memory, this function
- * will tell the caller the PFN and the address of the memory. The address
- * may be directly dereferenced within the kernel without the need to call
- * ioremap(), kmap() or similar. The PFN is suitable for inserting into
- * page tables.
- *
- * Return: negative errno if an error occurs, otherwise the number of bytes
- * accessible at this address.
- */
-long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax)
+int bdev_dax_pgoff(struct block_device *bdev, sector_t sector, size_t size,
+ pgoff_t *pgoff)
{
- sector_t sector = dax->sector;
- long avail, size = dax->size;
- const struct block_device_operations *ops = bdev->bd_disk->fops;
+ phys_addr_t phys_off = (get_start_sect(bdev) + sector) * 512;
- /*
- * The device driver is allowed to sleep, in order to make the
- * memory directly accessible.
- */
- might_sleep();
-
- if (size < 0)
- return size;
- if (!blk_queue_dax(bdev_get_queue(bdev)) || !ops->direct_access)
- return -EOPNOTSUPP;
- if ((sector + DIV_ROUND_UP(size, 512)) >
- part_nr_sects_read(bdev->bd_part))
- return -ERANGE;
- sector += get_start_sect(bdev);
- if (sector % (PAGE_SIZE / 512))
+ if (pgoff)
+ *pgoff = PHYS_PFN(phys_off);
+ if (phys_off % PAGE_SIZE || size % PAGE_SIZE)
return -EINVAL;
- avail = ops->direct_access(bdev, sector, &dax->addr, &dax->pfn, size);
- if (!avail)
- return -ERANGE;
- if (avail > 0 && avail & ~PAGE_MASK)
- return -ENXIO;
- return min(avail, size);
+ return 0;
}
-EXPORT_SYMBOL_GPL(bdev_direct_access);
+EXPORT_SYMBOL(bdev_dax_pgoff);
/**
* bdev_dax_supported() - Check if the device supports dax for filesystem
@@ -774,62 +742,46 @@ EXPORT_SYMBOL_GPL(bdev_direct_access);
*/
int bdev_dax_supported(struct super_block *sb, int blocksize)
{
- struct blk_dax_ctl dax = {
- .sector = 0,
- .size = PAGE_SIZE,
- };
- int err;
+ struct block_device *bdev = sb->s_bdev;
+ struct dax_device *dax_dev;
+ pgoff_t pgoff;
+ int err, id;
+ void *kaddr;
+ pfn_t pfn;
+ long len;
if (blocksize != PAGE_SIZE) {
vfs_msg(sb, KERN_ERR, "error: unsupported blocksize for dax");
return -EINVAL;
}
- err = bdev_direct_access(sb->s_bdev, &dax);
- if (err < 0) {
- switch (err) {
- case -EOPNOTSUPP:
- vfs_msg(sb, KERN_ERR,
- "error: device does not support dax");
- break;
- case -EINVAL:
- vfs_msg(sb, KERN_ERR,
- "error: unaligned partition for dax");
- break;
- default:
- vfs_msg(sb, KERN_ERR,
- "error: dax access failed (%d)", err);
- }
+ err = bdev_dax_pgoff(bdev, 0, PAGE_SIZE, &pgoff);
+ if (err) {
+ vfs_msg(sb, KERN_ERR, "error: unaligned partition for dax");
return err;
}
- return 0;
-}
-EXPORT_SYMBOL_GPL(bdev_dax_supported);
-
-/**
- * bdev_dax_capable() - Return if the raw device is capable for dax
- * @bdev: The device for raw block device access
- */
-bool bdev_dax_capable(struct block_device *bdev)
-{
- struct blk_dax_ctl dax = {
- .size = PAGE_SIZE,
- };
+ dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
+ if (!dax_dev) {
+ vfs_msg(sb, KERN_ERR, "error: device does not support dax");
+ return -EOPNOTSUPP;
+ }
- if (!IS_ENABLED(CONFIG_FS_DAX))
- return false;
+ id = dax_read_lock();
+ len = dax_direct_access(dax_dev, pgoff, 1, &kaddr, &pfn);
+ dax_read_unlock(id);
- dax.sector = 0;
- if (bdev_direct_access(bdev, &dax) < 0)
- return false;
+ put_dax(dax_dev);
- dax.sector = bdev->bd_part->nr_sects - (PAGE_SIZE / 512);
- if (bdev_direct_access(bdev, &dax) < 0)
- return false;
+ if (len < 1) {
+ vfs_msg(sb, KERN_ERR,
+ "error: dax access failed (%ld)", len);
+ return len < 0 ? len : -EIO;
+ }
- return true;
+ return 0;
}
+EXPORT_SYMBOL_GPL(bdev_dax_supported);
/*
* pseudo-fs
@@ -885,6 +837,8 @@ static void bdev_evict_inode(struct inode *inode)
spin_lock(&bdev_lock);
list_del_init(&bdev->bd_list);
spin_unlock(&bdev_lock);
+ /* Detach inode from wb early as bdi_put() may free bdi->wb */
+ inode_detach_wb(inode);
if (bdev->bd_bdi != &noop_backing_dev_info) {
bdi_put(bdev->bd_bdi);
bdev->bd_bdi = &noop_backing_dev_info;
@@ -1451,7 +1405,6 @@ int revalidate_disk(struct gendisk *disk)
if (disk->fops->revalidate_disk)
ret = disk->fops->revalidate_disk(disk);
- blk_integrity_revalidate(disk);
bdev = bdget_disk(disk, 0);
if (!bdev)
return ret;
@@ -1556,8 +1509,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
bdev->bd_disk = disk;
bdev->bd_queue = disk->queue;
bdev->bd_contains = bdev;
- if (bdev->bd_bdi == &noop_backing_dev_info)
- bdev->bd_bdi = bdi_get(disk->queue->backing_dev_info);
if (!partno) {
ret = -ENXIO;
@@ -1622,6 +1573,9 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
}
bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
}
+
+ if (bdev->bd_bdi == &noop_backing_dev_info)
+ bdev->bd_bdi = bdi_get(disk->queue->backing_dev_info);
} else {
if (bdev->bd_contains == bdev) {
ret = 0;
@@ -1653,8 +1607,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
bdev->bd_disk = NULL;
bdev->bd_part = NULL;
bdev->bd_queue = NULL;
- bdi_put(bdev->bd_bdi);
- bdev->bd_bdi = &noop_backing_dev_info;
if (bdev != bdev->bd_contains)
__blkdev_put(bdev->bd_contains, mode, 1);
bdev->bd_contains = NULL;
@@ -1876,12 +1828,6 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
kill_bdev(bdev);
bdev_write_inode(bdev);
- /*
- * Detaching bdev inode from its wb in __destroy_inode()
- * is too late: the queue which embeds its bdi (along with
- * root wb) can be gone as soon as we put_disk() below.
- */
- inode_detach_wb(bdev->bd_inode);
}
if (bdev->bd_contains == bdev) {
if (disk->fops->release)
@@ -2074,7 +2020,6 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
loff_t len)
{
struct block_device *bdev = I_BDEV(bdev_file_inode(file));
- struct request_queue *q = bdev_get_queue(bdev);
struct address_space *mapping;
loff_t end = start + len - 1;
loff_t isize;
@@ -2110,18 +2055,13 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
case FALLOC_FL_ZERO_RANGE:
case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE:
error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9,
- GFP_KERNEL, false);
+ GFP_KERNEL, BLKDEV_ZERO_NOUNMAP);
break;
case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE:
- /* Only punch if the device can do zeroing discard. */
- if (!blk_queue_discard(q) || !q->limits.discard_zeroes_data)
- return -EOPNOTSUPP;
- error = blkdev_issue_discard(bdev, start >> 9, len >> 9,
- GFP_KERNEL, 0);
+ error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9,
+ GFP_KERNEL, BLKDEV_ZERO_NOFALLBACK);
break;
case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE | FALLOC_FL_NO_HIDE_STALE:
- if (!blk_queue_discard(q))
- return -EOPNOTSUPP;
error = blkdev_issue_discard(bdev, start >> 9, len >> 9,
GFP_KERNEL, 0);
break;
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 7699e16784d3..24865da63d8f 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -26,6 +26,11 @@
#include "delayed-ref.h"
#include "locking.h"
+enum merge_mode {
+ MERGE_IDENTICAL_KEYS = 1,
+ MERGE_IDENTICAL_PARENTS,
+};
+
/* Just an arbitrary number so we can be sure this happened */
#define BACKREF_FOUND_SHARED 6
@@ -533,7 +538,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
* slot==nritems. In that case, go to the next leaf before we continue.
*/
if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
- if (time_seq == (u64)-1)
+ if (time_seq == SEQ_LAST)
ret = btrfs_next_leaf(root, path);
else
ret = btrfs_next_old_leaf(root, path, time_seq);
@@ -577,7 +582,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
eie = NULL;
}
next:
- if (time_seq == (u64)-1)
+ if (time_seq == SEQ_LAST)
ret = btrfs_next_item(root, path);
else
ret = btrfs_next_old_item(root, path, time_seq);
@@ -629,7 +634,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
if (path->search_commit_root)
root_level = btrfs_header_level(root->commit_root);
- else if (time_seq == (u64)-1)
+ else if (time_seq == SEQ_LAST)
root_level = btrfs_header_level(root->node);
else
root_level = btrfs_old_root_level(root, time_seq);
@@ -640,7 +645,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
}
path->lowest_level = level;
- if (time_seq == (u64)-1)
+ if (time_seq == SEQ_LAST)
ret = btrfs_search_slot(NULL, root, &ref->key_for_search, path,
0, 0);
else
@@ -809,14 +814,12 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info,
/*
* merge backrefs and adjust counts accordingly
*
- * mode = 1: merge identical keys, if key is set
- * FIXME: if we add more keys in __add_prelim_ref, we can merge more here.
- * additionally, we could even add a key range for the blocks we
- * looked into to merge even more (-> replace unresolved refs by those
- * having a parent).
- * mode = 2: merge identical parents
+ * FIXME: For MERGE_IDENTICAL_KEYS, if we add more keys in __add_prelim_ref
+ * then we can merge more here. Additionally, we could even add a key
+ * range for the blocks we looked into to merge even more (-> replace
+ * unresolved refs by those having a parent).
*/
-static void __merge_refs(struct list_head *head, int mode)
+static void __merge_refs(struct list_head *head, enum merge_mode mode)
{
struct __prelim_ref *pos1;
@@ -829,7 +832,7 @@ static void __merge_refs(struct list_head *head, int mode)
if (!ref_for_same_block(ref1, ref2))
continue;
- if (mode == 1) {
+ if (mode == MERGE_IDENTICAL_KEYS) {
if (!ref1->parent && ref2->parent)
swap(ref1, ref2);
} else {
@@ -1196,7 +1199,7 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
*
* NOTE: This can return values > 0
*
- * If time_seq is set to (u64)-1, it will not search delayed_refs, and behave
+ * If time_seq is set to SEQ_LAST, it will not search delayed_refs, and behave
* much like trans == NULL case, the difference only lies in it will not
* commit root.
* The special case is for qgroup to search roots in commit_transaction().
@@ -1243,7 +1246,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
path->skip_locking = 1;
}
- if (time_seq == (u64)-1)
+ if (time_seq == SEQ_LAST)
path->skip_locking = 1;
/*
@@ -1273,9 +1276,9 @@ again:
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
if (trans && likely(trans->type != __TRANS_DUMMY) &&
- time_seq != (u64)-1) {
+ time_seq != SEQ_LAST) {
#else
- if (trans && time_seq != (u64)-1) {
+ if (trans && time_seq != SEQ_LAST) {
#endif
/*
* look if there are updates for this ref queued and lock the
@@ -1286,7 +1289,7 @@ again:
head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
if (head) {
if (!mutex_trylock(&head->mutex)) {
- atomic_inc(&head->node.refs);
+ refcount_inc(&head->node.refs);
spin_unlock(&delayed_refs->lock);
btrfs_release_path(path);
@@ -1374,7 +1377,7 @@ again:
if (ret)
goto out;
- __merge_refs(&prefs, 1);
+ __merge_refs(&prefs, MERGE_IDENTICAL_KEYS);
ret = __resolve_indirect_refs(fs_info, path, time_seq, &prefs,
extent_item_pos, total_refs,
@@ -1382,7 +1385,7 @@ again:
if (ret)
goto out;
- __merge_refs(&prefs, 2);
+ __merge_refs(&prefs, MERGE_IDENTICAL_PARENTS);
while (!list_empty(&prefs)) {
ref = list_first_entry(&prefs, struct __prelim_ref, list);
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 0c6baaba0651..b8622e4d1744 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -125,6 +125,13 @@ struct btrfs_inode {
u64 delalloc_bytes;
/*
+ * Total number of bytes pending delalloc that fall within a file
+ * range that is either a hole or beyond EOF (and no prealloc extent
+ * exists in the range). This is always <= delalloc_bytes.
+ */
+ u64 new_delalloc_bytes;
+
+ /*
* total number of bytes pending defrag, used by stat to check whether
* it needs COW.
*/
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index c7721a6aa3bb..10e6b282d09d 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -44,7 +44,7 @@
struct compressed_bio {
/* number of bios pending for this compressed extent */
- atomic_t pending_bios;
+ refcount_t pending_bios;
/* the pages with the compressed data on them */
struct page **compressed_pages;
@@ -161,7 +161,7 @@ static void end_compressed_bio_read(struct bio *bio)
/* if there are more bios still pending for this compressed
* extent, just exit
*/
- if (!atomic_dec_and_test(&cb->pending_bios))
+ if (!refcount_dec_and_test(&cb->pending_bios))
goto out;
inode = cb->inode;
@@ -274,7 +274,7 @@ static void end_compressed_bio_write(struct bio *bio)
/* if there are more bios still pending for this compressed
* extent, just exit
*/
- if (!atomic_dec_and_test(&cb->pending_bios))
+ if (!refcount_dec_and_test(&cb->pending_bios))
goto out;
/* ok, we're the last bio for this extent, step one is to
@@ -342,7 +342,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
if (!cb)
return -ENOMEM;
- atomic_set(&cb->pending_bios, 0);
+ refcount_set(&cb->pending_bios, 0);
cb->errors = 0;
cb->inode = inode;
cb->start = start;
@@ -363,7 +363,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
bio->bi_private = cb;
bio->bi_end_io = end_compressed_bio_write;
- atomic_inc(&cb->pending_bios);
+ refcount_set(&cb->pending_bios, 1);
/* create and submit bios for the compressed pages */
bytes_left = compressed_len;
@@ -388,7 +388,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
* we inc the count. Otherwise, the cb might get
* freed before we're done setting it up
*/
- atomic_inc(&cb->pending_bios);
+ refcount_inc(&cb->pending_bios);
ret = btrfs_bio_wq_end_io(fs_info, bio,
BTRFS_WQ_ENDIO_DATA);
BUG_ON(ret); /* -ENOMEM */
@@ -607,7 +607,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
if (!cb)
goto out;
- atomic_set(&cb->pending_bios, 0);
+ refcount_set(&cb->pending_bios, 0);
cb->errors = 0;
cb->inode = inode;
cb->mirror_num = mirror_num;
@@ -656,7 +656,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
bio_set_op_attrs (comp_bio, REQ_OP_READ, 0);
comp_bio->bi_private = cb;
comp_bio->bi_end_io = end_compressed_bio_read;
- atomic_inc(&cb->pending_bios);
+ refcount_set(&cb->pending_bios, 1);
for (pg_index = 0; pg_index < nr_pages; pg_index++) {
page = cb->compressed_pages[pg_index];
@@ -685,7 +685,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
* we inc the count. Otherwise, the cb might get
* freed before we're done setting it up
*/
- atomic_inc(&cb->pending_bios);
+ refcount_inc(&cb->pending_bios);
if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
ret = btrfs_lookup_bio_sums(inode, comp_bio,
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 7dc8844037e0..a3a75f1de002 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -567,7 +567,7 @@ tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,
static noinline int
tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
struct extent_buffer *eb, int dst_slot, int src_slot,
- int nr_items, gfp_t flags)
+ int nr_items)
{
struct tree_mod_elem *tm = NULL;
struct tree_mod_elem **tm_list = NULL;
@@ -578,11 +578,11 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
if (!tree_mod_need_log(fs_info, eb))
return 0;
- tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), flags);
+ tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
if (!tm_list)
return -ENOMEM;
- tm = kzalloc(sizeof(*tm), flags);
+ tm = kzalloc(sizeof(*tm), GFP_NOFS);
if (!tm) {
ret = -ENOMEM;
goto free_tms;
@@ -596,7 +596,7 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
- MOD_LOG_KEY_REMOVE_WHILE_MOVING, flags);
+ MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS);
if (!tm_list[i]) {
ret = -ENOMEM;
goto free_tms;
@@ -663,7 +663,7 @@ __tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
static noinline int
tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
struct extent_buffer *old_root,
- struct extent_buffer *new_root, gfp_t flags,
+ struct extent_buffer *new_root,
int log_removal)
{
struct tree_mod_elem *tm = NULL;
@@ -678,14 +678,14 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
if (log_removal && btrfs_header_level(old_root) > 0) {
nritems = btrfs_header_nritems(old_root);
tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
- flags);
+ GFP_NOFS);
if (!tm_list) {
ret = -ENOMEM;
goto free_tms;
}
for (i = 0; i < nritems; i++) {
tm_list[i] = alloc_tree_mod_elem(old_root, i,
- MOD_LOG_KEY_REMOVE_WHILE_FREEING, flags);
+ MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
if (!tm_list[i]) {
ret = -ENOMEM;
goto free_tms;
@@ -693,7 +693,7 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
}
}
- tm = kzalloc(sizeof(*tm), flags);
+ tm = kzalloc(sizeof(*tm), GFP_NOFS);
if (!tm) {
ret = -ENOMEM;
goto free_tms;
@@ -873,7 +873,7 @@ tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
{
int ret;
ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset,
- nr_items, GFP_NOFS);
+ nr_items);
BUG_ON(ret < 0);
}
@@ -943,7 +943,7 @@ tree_mod_log_set_root_pointer(struct btrfs_root *root,
{
int ret;
ret = tree_mod_log_insert_root(root->fs_info, root->node,
- new_root_node, GFP_NOFS, log_removal);
+ new_root_node, log_removal);
BUG_ON(ret < 0);
}
@@ -5392,13 +5392,10 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
goto out;
}
- tmp_buf = kmalloc(fs_info->nodesize, GFP_KERNEL | __GFP_NOWARN);
+ tmp_buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
if (!tmp_buf) {
- tmp_buf = vmalloc(fs_info->nodesize);
- if (!tmp_buf) {
- ret = -ENOMEM;
- goto out;
- }
+ ret = -ENOMEM;
+ goto out;
}
left_path->search_commit_root = 1;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index c4115901d906..643c70d2b2e6 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -39,6 +39,7 @@
#include <linux/security.h>
#include <linux/sizes.h>
#include <linux/dynamic_debug.h>
+#include <linux/refcount.h>
#include "extent_io.h"
#include "extent_map.h"
#include "async-thread.h"
@@ -518,7 +519,7 @@ struct btrfs_caching_control {
struct btrfs_work work;
struct btrfs_block_group_cache *block_group;
u64 progress;
- atomic_t count;
+ refcount_t count;
};
/* Once caching_thread() finds this much free space, it will wake up waiters. */
@@ -538,6 +539,14 @@ struct btrfs_io_ctl {
unsigned check_crcs:1;
};
+/*
+ * Tree to record all locked full stripes of a RAID5/6 block group
+ */
+struct btrfs_full_stripe_locks_tree {
+ struct rb_root root;
+ struct mutex lock;
+};
+
struct btrfs_block_group_cache {
struct btrfs_key key;
struct btrfs_block_group_item item;
@@ -648,6 +657,9 @@ struct btrfs_block_group_cache {
* Protected by free_space_lock.
*/
int needs_free_space;
+
+ /* Record locked full stripes for RAID5/6 block group */
+ struct btrfs_full_stripe_locks_tree full_stripe_locks_root;
};
/* delayed seq elem */
@@ -658,6 +670,8 @@ struct seq_list {
#define SEQ_LIST_INIT(name) { .list = LIST_HEAD_INIT((name).list), .seq = 0 }
+#define SEQ_LAST ((u64)-1)
+
enum btrfs_orphan_cleanup_state {
ORPHAN_CLEANUP_STARTED = 1,
ORPHAN_CLEANUP_DONE = 2,
@@ -702,6 +716,11 @@ struct btrfs_delayed_root;
#define BTRFS_FS_BTREE_ERR 11
#define BTRFS_FS_LOG1_ERR 12
#define BTRFS_FS_LOG2_ERR 13
+/*
+ * Indicate that a whole-filesystem exclusive operation is running
+ * (device replace, resize, device add/delete, balance)
+ */
+#define BTRFS_FS_EXCL_OP 14
struct btrfs_fs_info {
u8 fsid[BTRFS_FSID_SIZE];
@@ -810,7 +829,6 @@ struct btrfs_fs_info {
struct btrfs_super_block *super_for_commit;
struct super_block *sb;
struct inode *btree_inode;
- struct backing_dev_info bdi;
struct mutex tree_log_mutex;
struct mutex transaction_kthread_mutex;
struct mutex cleaner_mutex;
@@ -1067,8 +1085,6 @@ struct btrfs_fs_info {
/* device replace state */
struct btrfs_dev_replace dev_replace;
- atomic_t mutually_exclusive_operation_running;
-
struct percpu_counter bio_counter;
wait_queue_head_t replace_wait;
@@ -1221,7 +1237,7 @@ struct btrfs_root {
dev_t anon_dev;
spinlock_t root_item_lock;
- atomic_t refs;
+ refcount_t refs;
struct mutex delalloc_mutex;
spinlock_t delalloc_lock;
@@ -3647,6 +3663,12 @@ int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info,
struct btrfs_device *dev);
int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
struct btrfs_scrub_progress *progress);
+static inline void btrfs_init_full_stripe_locks_tree(
+ struct btrfs_full_stripe_locks_tree *locks_root)
+{
+ locks_root->root = RB_ROOT;
+ mutex_init(&locks_root->lock);
+}
/* dev-replace.c */
void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info);
@@ -3671,8 +3693,7 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
struct btrfs_key *start, struct btrfs_key *end);
int btrfs_reada_wait(void *handle);
void btrfs_reada_detach(void *handle);
-int btree_readahead_hook(struct btrfs_fs_info *fs_info,
- struct extent_buffer *eb, int err);
+int btree_readahead_hook(struct extent_buffer *eb, int err);
static inline int is_fstree(u64 rootid)
{
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 1aff676f0e5b..8ae409b5a61d 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -52,7 +52,7 @@ static inline void btrfs_init_delayed_node(
{
delayed_node->root = root;
delayed_node->inode_id = inode_id;
- atomic_set(&delayed_node->refs, 0);
+ refcount_set(&delayed_node->refs, 0);
delayed_node->ins_root = RB_ROOT;
delayed_node->del_root = RB_ROOT;
mutex_init(&delayed_node->mutex);
@@ -81,7 +81,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
node = READ_ONCE(btrfs_inode->delayed_node);
if (node) {
- atomic_inc(&node->refs);
+ refcount_inc(&node->refs);
return node;
}
@@ -89,14 +89,14 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
node = radix_tree_lookup(&root->delayed_nodes_tree, ino);
if (node) {
if (btrfs_inode->delayed_node) {
- atomic_inc(&node->refs); /* can be accessed */
+ refcount_inc(&node->refs); /* can be accessed */
BUG_ON(btrfs_inode->delayed_node != node);
spin_unlock(&root->inode_lock);
return node;
}
btrfs_inode->delayed_node = node;
/* can be accessed and cached in the inode */
- atomic_add(2, &node->refs);
+ refcount_add(2, &node->refs);
spin_unlock(&root->inode_lock);
return node;
}
@@ -125,7 +125,7 @@ again:
btrfs_init_delayed_node(node, root, ino);
/* cached in the btrfs inode and can be accessed */
- atomic_add(2, &node->refs);
+ refcount_set(&node->refs, 2);
ret = radix_tree_preload(GFP_NOFS);
if (ret) {
@@ -166,7 +166,7 @@ static void btrfs_queue_delayed_node(struct btrfs_delayed_root *root,
} else {
list_add_tail(&node->n_list, &root->node_list);
list_add_tail(&node->p_list, &root->prepare_list);
- atomic_inc(&node->refs); /* inserted into list */
+ refcount_inc(&node->refs); /* inserted into list */
root->nodes++;
set_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags);
}
@@ -180,7 +180,7 @@ static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root,
spin_lock(&root->lock);
if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) {
root->nodes--;
- atomic_dec(&node->refs); /* not in the list */
+ refcount_dec(&node->refs); /* not in the list */
list_del_init(&node->n_list);
if (!list_empty(&node->p_list))
list_del_init(&node->p_list);
@@ -201,7 +201,7 @@ static struct btrfs_delayed_node *btrfs_first_delayed_node(
p = delayed_root->node_list.next;
node = list_entry(p, struct btrfs_delayed_node, n_list);
- atomic_inc(&node->refs);
+ refcount_inc(&node->refs);
out:
spin_unlock(&delayed_root->lock);
@@ -228,7 +228,7 @@ static struct btrfs_delayed_node *btrfs_next_delayed_node(
p = node->n_list.next;
next = list_entry(p, struct btrfs_delayed_node, n_list);
- atomic_inc(&next->refs);
+ refcount_inc(&next->refs);
out:
spin_unlock(&delayed_root->lock);
@@ -253,11 +253,11 @@ static void __btrfs_release_delayed_node(
btrfs_dequeue_delayed_node(delayed_root, delayed_node);
mutex_unlock(&delayed_node->mutex);
- if (atomic_dec_and_test(&delayed_node->refs)) {
+ if (refcount_dec_and_test(&delayed_node->refs)) {
bool free = false;
struct btrfs_root *root = delayed_node->root;
spin_lock(&root->inode_lock);
- if (atomic_read(&delayed_node->refs) == 0) {
+ if (refcount_read(&delayed_node->refs) == 0) {
radix_tree_delete(&root->delayed_nodes_tree,
delayed_node->inode_id);
free = true;
@@ -286,7 +286,7 @@ static struct btrfs_delayed_node *btrfs_first_prepared_delayed_node(
p = delayed_root->prepare_list.next;
list_del_init(p);
node = list_entry(p, struct btrfs_delayed_node, p_list);
- atomic_inc(&node->refs);
+ refcount_inc(&node->refs);
out:
spin_unlock(&delayed_root->lock);
@@ -308,7 +308,7 @@ static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u32 data_len)
item->ins_or_del = 0;
item->bytes_reserved = 0;
item->delayed_node = NULL;
- atomic_set(&item->refs, 1);
+ refcount_set(&item->refs, 1);
}
return item;
}
@@ -483,7 +483,7 @@ static void btrfs_release_delayed_item(struct btrfs_delayed_item *item)
{
if (item) {
__btrfs_remove_delayed_item(item);
- if (atomic_dec_and_test(&item->refs))
+ if (refcount_dec_and_test(&item->refs))
kfree(item);
}
}
@@ -1600,14 +1600,14 @@ bool btrfs_readdir_get_delayed_items(struct inode *inode,
mutex_lock(&delayed_node->mutex);
item = __btrfs_first_delayed_insertion_item(delayed_node);
while (item) {
- atomic_inc(&item->refs);
+ refcount_inc(&item->refs);
list_add_tail(&item->readdir_list, ins_list);
item = __btrfs_next_delayed_item(item);
}
item = __btrfs_first_delayed_deletion_item(delayed_node);
while (item) {
- atomic_inc(&item->refs);
+ refcount_inc(&item->refs);
list_add_tail(&item->readdir_list, del_list);
item = __btrfs_next_delayed_item(item);
}
@@ -1621,7 +1621,7 @@ bool btrfs_readdir_get_delayed_items(struct inode *inode,
* insert/delete delayed items in this period. So we also needn't
* requeue or dequeue this delayed node.
*/
- atomic_dec(&delayed_node->refs);
+ refcount_dec(&delayed_node->refs);
return true;
}
@@ -1634,13 +1634,13 @@ void btrfs_readdir_put_delayed_items(struct inode *inode,
list_for_each_entry_safe(curr, next, ins_list, readdir_list) {
list_del(&curr->readdir_list);
- if (atomic_dec_and_test(&curr->refs))
+ if (refcount_dec_and_test(&curr->refs))
kfree(curr);
}
list_for_each_entry_safe(curr, next, del_list, readdir_list) {
list_del(&curr->readdir_list);
- if (atomic_dec_and_test(&curr->refs))
+ if (refcount_dec_and_test(&curr->refs))
kfree(curr);
}
@@ -1667,7 +1667,7 @@ int btrfs_should_delete_dir_index(struct list_head *del_list,
list_del(&curr->readdir_list);
ret = (curr->key.offset == index);
- if (atomic_dec_and_test(&curr->refs))
+ if (refcount_dec_and_test(&curr->refs))
kfree(curr);
if (ret)
@@ -1705,7 +1705,7 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
list_del(&curr->readdir_list);
if (curr->key.offset < ctx->pos) {
- if (atomic_dec_and_test(&curr->refs))
+ if (refcount_dec_and_test(&curr->refs))
kfree(curr);
continue;
}
@@ -1722,7 +1722,7 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
over = !dir_emit(ctx, name, name_len,
location.objectid, d_type);
- if (atomic_dec_and_test(&curr->refs))
+ if (refcount_dec_and_test(&curr->refs))
kfree(curr);
if (over)
@@ -1963,7 +1963,7 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
inode_id = delayed_nodes[n - 1]->inode_id + 1;
for (i = 0; i < n; i++)
- atomic_inc(&delayed_nodes[i]->refs);
+ refcount_inc(&delayed_nodes[i]->refs);
spin_unlock(&root->inode_lock);
for (i = 0; i < n; i++) {
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index 40327cc3b99a..c4189d495934 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -26,7 +26,7 @@
#include <linux/list.h>
#include <linux/wait.h>
#include <linux/atomic.h>
-
+#include <linux/refcount.h>
#include "ctree.h"
/* types of the delayed item */
@@ -67,7 +67,7 @@ struct btrfs_delayed_node {
struct rb_root del_root;
struct mutex mutex;
struct btrfs_inode_item inode_item;
- atomic_t refs;
+ refcount_t refs;
u64 index_cnt;
unsigned long flags;
int count;
@@ -80,7 +80,7 @@ struct btrfs_delayed_item {
struct list_head readdir_list; /* used for readdir items */
u64 bytes_reserved;
struct btrfs_delayed_node *delayed_node;
- atomic_t refs;
+ refcount_t refs;
int ins_or_del;
u32 data_len;
char data[0];
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 6eb80952efb3..be70d90dfee5 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -164,7 +164,7 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
if (mutex_trylock(&head->mutex))
return 0;
- atomic_inc(&head->node.refs);
+ refcount_inc(&head->node.refs);
spin_unlock(&delayed_refs->lock);
mutex_lock(&head->mutex);
@@ -590,7 +590,7 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
delayed_refs = &trans->transaction->delayed_refs;
/* first set the basic ref node struct up */
- atomic_set(&ref->refs, 1);
+ refcount_set(&ref->refs, 1);
ref->bytenr = bytenr;
ref->num_bytes = num_bytes;
ref->ref_mod = count_mod;
@@ -682,7 +682,7 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
delayed_refs = &trans->transaction->delayed_refs;
/* first set the basic ref node struct up */
- atomic_set(&ref->refs, 1);
+ refcount_set(&ref->refs, 1);
ref->bytenr = bytenr;
ref->num_bytes = num_bytes;
ref->ref_mod = 1;
@@ -739,7 +739,7 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info,
seq = atomic64_read(&fs_info->tree_mod_seq);
/* first set the basic ref node struct up */
- atomic_set(&ref->refs, 1);
+ refcount_set(&ref->refs, 1);
ref->bytenr = bytenr;
ref->num_bytes = num_bytes;
ref->ref_mod = 1;
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 0e537f98f1a1..c0264ff01b53 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -18,6 +18,8 @@
#ifndef __DELAYED_REF__
#define __DELAYED_REF__
+#include <linux/refcount.h>
+
/* these are the possible values of struct btrfs_delayed_ref_node->action */
#define BTRFS_ADD_DELAYED_REF 1 /* add one backref to the tree */
#define BTRFS_DROP_DELAYED_REF 2 /* delete one backref from the tree */
@@ -53,7 +55,7 @@ struct btrfs_delayed_ref_node {
u64 seq;
/* ref count on this data structure */
- atomic_t refs;
+ refcount_t refs;
/*
* how many refs is this entry adding or deleting. For
@@ -220,8 +222,8 @@ btrfs_free_delayed_extent_op(struct btrfs_delayed_extent_op *op)
static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
{
- WARN_ON(atomic_read(&ref->refs) == 0);
- if (atomic_dec_and_test(&ref->refs)) {
+ WARN_ON(refcount_read(&ref->refs) == 0);
+ if (refcount_dec_and_test(&ref->refs)) {
WARN_ON(ref->in_tree);
switch (ref->type) {
case BTRFS_TREE_BLOCK_REF_KEY:
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index e653921f05d9..5fe1ca8abc70 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -546,8 +546,10 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
mutex_unlock(&fs_info->chunk_mutex);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
mutex_unlock(&uuid_mutex);
+ btrfs_rm_dev_replace_blocked(fs_info);
if (tgt_device)
btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
+ btrfs_rm_dev_replace_unblocked(fs_info);
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
return scrub_ret;
@@ -665,7 +667,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
srcdev = dev_replace->srcdev;
- args->status.progress_1000 = div_u64(dev_replace->cursor_left,
+ args->status.progress_1000 = div64_u64(dev_replace->cursor_left,
div_u64(btrfs_device_get_total_bytes(srcdev), 1000));
break;
}
@@ -784,8 +786,7 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
}
btrfs_dev_replace_unlock(dev_replace, 1);
- WARN_ON(atomic_xchg(
- &fs_info->mutually_exclusive_operation_running, 1));
+ WARN_ON(test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags));
task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl");
return PTR_ERR_OR_ZERO(task);
}
@@ -814,7 +815,7 @@ static int btrfs_dev_replace_kthread(void *data)
(unsigned int)progress);
}
btrfs_dev_replace_continue_on_mount(fs_info);
- atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+ clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
return 0;
}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index eb1ee7b6f532..8685d67185d0 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -762,7 +762,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
err:
if (reads_done &&
test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
- btree_readahead_hook(fs_info, eb, ret);
+ btree_readahead_hook(eb, ret);
if (ret) {
/*
@@ -787,7 +787,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror)
eb->read_mirror = failed_mirror;
atomic_dec(&eb->io_pages);
if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
- btree_readahead_hook(eb->fs_info, eb, -EIO);
+ btree_readahead_hook(eb, -EIO);
return -EIO; /* we fixed nothing */
}
@@ -1340,7 +1340,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
atomic_set(&root->log_writers, 0);
atomic_set(&root->log_batch, 0);
atomic_set(&root->orphan_inodes, 0);
- atomic_set(&root->refs, 1);
+ refcount_set(&root->refs, 1);
atomic_set(&root->will_be_snapshoted, 0);
atomic64_set(&root->qgroup_meta_rsv, 0);
root->log_transid = 0;
@@ -1808,21 +1808,6 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
return ret;
}
-static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
-{
- int err;
-
- err = bdi_setup_and_register(bdi, "btrfs");
- if (err)
- return err;
-
- bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
- bdi->congested_fn = btrfs_congested_fn;
- bdi->congested_data = info;
- bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK;
- return 0;
-}
-
/*
* called by the kthread helper functions to finally call the bio end_io
* functions. This is where read checksum verification actually happens
@@ -2601,16 +2586,10 @@ int open_ctree(struct super_block *sb,
goto fail;
}
- ret = setup_bdi(fs_info, &fs_info->bdi);
- if (ret) {
- err = ret;
- goto fail_srcu;
- }
-
ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
if (ret) {
err = ret;
- goto fail_bdi;
+ goto fail_srcu;
}
fs_info->dirty_metadata_batch = PAGE_SIZE *
(1 + ilog2(nr_cpu_ids));
@@ -2718,7 +2697,6 @@ int open_ctree(struct super_block *sb,
sb->s_blocksize = 4096;
sb->s_blocksize_bits = blksize_bits(4096);
- sb->s_bdi = &fs_info->bdi;
btrfs_init_btree_inode(fs_info);
@@ -2915,9 +2893,12 @@ int open_ctree(struct super_block *sb,
goto fail_sb_buffer;
}
- fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
- fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
- SZ_4M / PAGE_SIZE);
+ sb->s_bdi->congested_fn = btrfs_congested_fn;
+ sb->s_bdi->congested_data = fs_info;
+ sb->s_bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK;
+ sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
+ sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super);
+ sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE);
sb->s_blocksize = sectorsize;
sb->s_blocksize_bits = blksize_bits(sectorsize);
@@ -3285,8 +3266,6 @@ fail_delalloc_bytes:
percpu_counter_destroy(&fs_info->delalloc_bytes);
fail_dirty_metadata_bytes:
percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
-fail_bdi:
- bdi_destroy(&fs_info->bdi);
fail_srcu:
cleanup_srcu_struct(&fs_info->subvol_srcu);
fail:
@@ -3518,10 +3497,11 @@ static void btrfs_end_empty_barrier(struct bio *bio)
*/
static int write_dev_flush(struct btrfs_device *device, int wait)
{
+ struct request_queue *q = bdev_get_queue(device->bdev);
struct bio *bio;
int ret = 0;
- if (device->nobarriers)
+ if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags))
return 0;
if (wait) {
@@ -4007,7 +3987,6 @@ void close_ctree(struct btrfs_fs_info *fs_info)
percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
percpu_counter_destroy(&fs_info->delalloc_bytes);
percpu_counter_destroy(&fs_info->bio_counter);
- bdi_destroy(&fs_info->bdi);
cleanup_srcu_struct(&fs_info->subvol_srcu);
btrfs_free_stripe_hash_table(fs_info);
@@ -4343,7 +4322,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
head = rb_entry(node, struct btrfs_delayed_ref_head,
href_node);
if (!mutex_trylock(&head->mutex)) {
- atomic_inc(&head->node.refs);
+ refcount_inc(&head->node.refs);
spin_unlock(&delayed_refs->lock);
mutex_lock(&head->mutex);
@@ -4615,7 +4594,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
t = list_first_entry(&fs_info->trans_list,
struct btrfs_transaction, list);
if (t->state >= TRANS_STATE_COMMIT_START) {
- atomic_inc(&t->use_count);
+ refcount_inc(&t->use_count);
spin_unlock(&fs_info->trans_lock);
btrfs_wait_for_commit(fs_info, t->transid);
btrfs_put_transaction(t);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 2e0ec29bfd69..21f1ceb85b76 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -101,14 +101,14 @@ struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info);
*/
static inline struct btrfs_root *btrfs_grab_fs_root(struct btrfs_root *root)
{
- if (atomic_inc_not_zero(&root->refs))
+ if (refcount_inc_not_zero(&root->refs))
return root;
return NULL;
}
static inline void btrfs_put_fs_root(struct btrfs_root *root)
{
- if (atomic_dec_and_test(&root->refs))
+ if (refcount_dec_and_test(&root->refs))
kfree(root);
}
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index be5477676cc8..e390451c72e6 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -131,6 +131,16 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
if (atomic_dec_and_test(&cache->count)) {
WARN_ON(cache->pinned > 0);
WARN_ON(cache->reserved > 0);
+
+ /*
+ * If not empty, someone is still holding mutex of
+ * full_stripe_lock, which can only be released by caller.
+ * And it will definitely cause use-after-free when caller
+ * tries to release full stripe lock.
+ *
+ * No better way to resolve, but only to warn.
+ */
+ WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
kfree(cache->free_space_ctl);
kfree(cache);
}
@@ -316,14 +326,14 @@ get_caching_control(struct btrfs_block_group_cache *cache)
}
ctl = cache->caching_ctl;
- atomic_inc(&ctl->count);
+ refcount_inc(&ctl->count);
spin_unlock(&cache->lock);
return ctl;
}
static void put_caching_control(struct btrfs_caching_control *ctl)
{
- if (atomic_dec_and_test(&ctl->count))
+ if (refcount_dec_and_test(&ctl->count))
kfree(ctl);
}
@@ -599,7 +609,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
init_waitqueue_head(&caching_ctl->wait);
caching_ctl->block_group = cache;
caching_ctl->progress = cache->key.objectid;
- atomic_set(&caching_ctl->count, 1);
+ refcount_set(&caching_ctl->count, 1);
btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
caching_thread, NULL, NULL);
@@ -620,7 +630,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
struct btrfs_caching_control *ctl;
ctl = cache->caching_ctl;
- atomic_inc(&ctl->count);
+ refcount_inc(&ctl->count);
prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
spin_unlock(&cache->lock);
@@ -707,7 +717,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
}
down_write(&fs_info->commit_root_sem);
- atomic_inc(&caching_ctl->count);
+ refcount_inc(&caching_ctl->count);
list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
up_write(&fs_info->commit_root_sem);
@@ -892,7 +902,7 @@ search_again:
head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
if (head) {
if (!mutex_trylock(&head->mutex)) {
- atomic_inc(&head->node.refs);
+ refcount_inc(&head->node.refs);
spin_unlock(&delayed_refs->lock);
btrfs_release_path(path);
@@ -2980,7 +2990,7 @@ again:
struct btrfs_delayed_ref_node *ref;
ref = &head->node;
- atomic_inc(&ref->refs);
+ refcount_inc(&ref->refs);
spin_unlock(&delayed_refs->lock);
/*
@@ -3003,7 +3013,6 @@ again:
goto again;
}
out:
- assert_qgroups_uptodate(trans);
trans->can_flush_pending_bgs = can_flush_pending_bgs;
return 0;
}
@@ -3057,7 +3066,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
}
if (!mutex_trylock(&head->mutex)) {
- atomic_inc(&head->node.refs);
+ refcount_inc(&head->node.refs);
spin_unlock(&delayed_refs->lock);
btrfs_release_path(path);
@@ -3443,7 +3452,8 @@ again:
/*
* don't bother trying to write stuff out _if_
* a) we're not cached,
- * b) we're with nospace_cache mount option.
+ * b) we're with nospace_cache mount option,
+ * c) we're with v2 space_cache (FREE_SPACE_TREE).
*/
dcs = BTRFS_DC_WRITTEN;
spin_unlock(&block_group->lock);
@@ -9917,6 +9927,7 @@ btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info,
btrfs_init_free_space_ctl(cache);
atomic_set(&cache->trimming, 0);
mutex_init(&cache->free_space_lock);
+ btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
return cache;
}
@@ -10416,7 +10427,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
&fs_info->caching_block_groups, list)
if (ctl->block_group == block_group) {
caching_ctl = ctl;
- atomic_inc(&caching_ctl->count);
+ refcount_inc(&caching_ctl->count);
break;
}
}
@@ -10850,7 +10861,7 @@ static int btrfs_trim_free_extents(struct btrfs_device *device,
spin_lock(&fs_info->trans_lock);
trans = fs_info->running_transaction;
if (trans)
- atomic_inc(&trans->use_count);
+ refcount_inc(&trans->use_count);
spin_unlock(&fs_info->trans_lock);
ret = find_free_dev_extent_start(trans, device, minlen, start,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 27fdb250b446..d8da3edf2ac3 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -68,7 +68,7 @@ void btrfs_leak_debug_check(void)
pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
state->start, state->end, state->state,
extent_state_in_tree(state),
- atomic_read(&state->refs));
+ refcount_read(&state->refs));
list_del(&state->leak_list);
kmem_cache_free(extent_state_cache, state);
}
@@ -238,7 +238,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask)
state->failrec = NULL;
RB_CLEAR_NODE(&state->rb_node);
btrfs_leak_debug_add(&state->leak_list, &states);
- atomic_set(&state->refs, 1);
+ refcount_set(&state->refs, 1);
init_waitqueue_head(&state->wq);
trace_alloc_extent_state(state, mask, _RET_IP_);
return state;
@@ -248,7 +248,7 @@ void free_extent_state(struct extent_state *state)
{
if (!state)
return;
- if (atomic_dec_and_test(&state->refs)) {
+ if (refcount_dec_and_test(&state->refs)) {
WARN_ON(extent_state_in_tree(state));
btrfs_leak_debug_del(&state->leak_list);
trace_free_extent_state(state, _RET_IP_);
@@ -641,7 +641,7 @@ again:
if (cached && extent_state_in_tree(cached) &&
cached->start <= start && cached->end > start) {
if (clear)
- atomic_dec(&cached->refs);
+ refcount_dec(&cached->refs);
state = cached;
goto hit_next;
}
@@ -793,7 +793,7 @@ process_node:
if (state->state & bits) {
start = state->start;
- atomic_inc(&state->refs);
+ refcount_inc(&state->refs);
wait_on_state(tree, state);
free_extent_state(state);
goto again;
@@ -834,7 +834,7 @@ static void cache_state_if_flags(struct extent_state *state,
if (cached_ptr && !(*cached_ptr)) {
if (!flags || (state->state & flags)) {
*cached_ptr = state;
- atomic_inc(&state->refs);
+ refcount_inc(&state->refs);
}
}
}
@@ -1538,7 +1538,7 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
if (!found) {
*start = state->start;
*cached_state = state;
- atomic_inc(&state->refs);
+ refcount_inc(&state->refs);
}
found++;
*end = state->end;
@@ -2004,16 +2004,11 @@ int repair_io_failure(struct btrfs_inode *inode, u64 start, u64 length,
u64 map_length = 0;
u64 sector;
struct btrfs_bio *bbio = NULL;
- struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
int ret;
ASSERT(!(fs_info->sb->s_flags & MS_RDONLY));
BUG_ON(!mirror_num);
- /* we can't repair anything in raid56 yet */
- if (btrfs_is_parity_mirror(map_tree, logical, length, mirror_num))
- return 0;
-
bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
if (!bio)
return -EIO;
@@ -2026,17 +2021,35 @@ int repair_io_failure(struct btrfs_inode *inode, u64 start, u64 length,
* read repair operation.
*/
btrfs_bio_counter_inc_blocked(fs_info);
- ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
- &map_length, &bbio, mirror_num);
- if (ret) {
- btrfs_bio_counter_dec(fs_info);
- bio_put(bio);
- return -EIO;
+ if (btrfs_is_parity_mirror(fs_info, logical, length, mirror_num)) {
+ /*
+ * Note that we don't use BTRFS_MAP_WRITE because it's supposed
+ * to update all raid stripes, but here we just want to correct
+ * bad stripe, thus BTRFS_MAP_READ is abused to only get the bad
+ * stripe's dev and sector.
+ */
+ ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
+ &map_length, &bbio, 0);
+ if (ret) {
+ btrfs_bio_counter_dec(fs_info);
+ bio_put(bio);
+ return -EIO;
+ }
+ ASSERT(bbio->mirror_num == 1);
+ } else {
+ ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
+ &map_length, &bbio, mirror_num);
+ if (ret) {
+ btrfs_bio_counter_dec(fs_info);
+ bio_put(bio);
+ return -EIO;
+ }
+ BUG_ON(mirror_num != bbio->mirror_num);
}
- BUG_ON(mirror_num != bbio->mirror_num);
- sector = bbio->stripes[mirror_num-1].physical >> 9;
+
+ sector = bbio->stripes[bbio->mirror_num - 1].physical >> 9;
bio->bi_iter.bi_sector = sector;
- dev = bbio->stripes[mirror_num-1].dev;
+ dev = bbio->stripes[bbio->mirror_num - 1].dev;
btrfs_put_bbio(bbio);
if (!dev || !dev->bdev || !dev->writeable) {
btrfs_bio_counter_dec(fs_info);
@@ -2859,7 +2872,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
em = *em_cached;
if (extent_map_in_tree(em) && start >= em->start &&
start < extent_map_end(em)) {
- atomic_inc(&em->refs);
+ refcount_inc(&em->refs);
return em;
}
@@ -2870,7 +2883,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
em = get_extent(BTRFS_I(inode), page, pg_offset, start, len, 0);
if (em_cached && !IS_ERR_OR_NULL(em)) {
BUG_ON(*em_cached);
- atomic_inc(&em->refs);
+ refcount_inc(&em->refs);
*em_cached = em;
}
return em;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 3e4fad4a909d..1eafa2f0ede3 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -2,6 +2,7 @@
#define __EXTENTIO__
#include <linux/rbtree.h>
+#include <linux/refcount.h>
#include "ulist.h"
/* bits for the extent state */
@@ -14,14 +15,17 @@
#define EXTENT_DEFRAG (1U << 6)
#define EXTENT_BOUNDARY (1U << 9)
#define EXTENT_NODATASUM (1U << 10)
-#define EXTENT_DO_ACCOUNTING (1U << 11)
+#define EXTENT_CLEAR_META_RESV (1U << 11)
#define EXTENT_FIRST_DELALLOC (1U << 12)
#define EXTENT_NEED_WAIT (1U << 13)
#define EXTENT_DAMAGED (1U << 14)
#define EXTENT_NORESERVE (1U << 15)
#define EXTENT_QGROUP_RESERVED (1U << 16)
#define EXTENT_CLEAR_DATA_RESV (1U << 17)
+#define EXTENT_DELALLOC_NEW (1U << 18)
#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
+#define EXTENT_DO_ACCOUNTING (EXTENT_CLEAR_META_RESV | \
+ EXTENT_CLEAR_DATA_RESV)
#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
/*
@@ -143,7 +147,7 @@ struct extent_state {
/* ADD NEW ELEMENTS AFTER THIS */
wait_queue_head_t wq;
- atomic_t refs;
+ refcount_t refs;
unsigned state;
struct io_failure_record *failrec;
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 26f9ac719d20..69850155870c 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -55,7 +55,7 @@ struct extent_map *alloc_extent_map(void)
em->flags = 0;
em->compress_type = BTRFS_COMPRESS_NONE;
em->generation = 0;
- atomic_set(&em->refs, 1);
+ refcount_set(&em->refs, 1);
INIT_LIST_HEAD(&em->list);
return em;
}
@@ -71,8 +71,8 @@ void free_extent_map(struct extent_map *em)
{
if (!em)
return;
- WARN_ON(atomic_read(&em->refs) == 0);
- if (atomic_dec_and_test(&em->refs)) {
+ WARN_ON(refcount_read(&em->refs) == 0);
+ if (refcount_dec_and_test(&em->refs)) {
WARN_ON(extent_map_in_tree(em));
WARN_ON(!list_empty(&em->list));
if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags))
@@ -322,7 +322,7 @@ static inline void setup_extent_mapping(struct extent_map_tree *tree,
struct extent_map *em,
int modified)
{
- atomic_inc(&em->refs);
+ refcount_inc(&em->refs);
em->mod_start = em->start;
em->mod_len = em->len;
@@ -381,7 +381,7 @@ __lookup_extent_mapping(struct extent_map_tree *tree,
if (strict && !(end > em->start && start < extent_map_end(em)))
return NULL;
- atomic_inc(&em->refs);
+ refcount_inc(&em->refs);
return em;
}
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index eb8b8fae036b..a67b2def5413 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -2,6 +2,7 @@
#define __EXTENTMAP__
#include <linux/rbtree.h>
+#include <linux/refcount.h>
#define EXTENT_MAP_LAST_BYTE ((u64)-4)
#define EXTENT_MAP_HOLE ((u64)-3)
@@ -41,7 +42,7 @@ struct extent_map {
*/
struct map_lookup *map_lookup;
};
- atomic_t refs;
+ refcount_t refs;
unsigned int compress_type;
struct list_head list;
};
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 520cb7230b2d..da1096eb1a40 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1404,6 +1404,47 @@ fail:
}
+static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
+ const u64 start,
+ const u64 len,
+ struct extent_state **cached_state)
+{
+ u64 search_start = start;
+ const u64 end = start + len - 1;
+
+ while (search_start < end) {
+ const u64 search_len = end - search_start + 1;
+ struct extent_map *em;
+ u64 em_len;
+ int ret = 0;
+
+ em = btrfs_get_extent(inode, NULL, 0, search_start,
+ search_len, 0);
+ if (IS_ERR(em))
+ return PTR_ERR(em);
+
+ if (em->block_start != EXTENT_MAP_HOLE)
+ goto next;
+
+ em_len = em->len;
+ if (em->start < search_start)
+ em_len -= search_start - em->start;
+ if (em_len > search_len)
+ em_len = search_len;
+
+ ret = set_extent_bit(&inode->io_tree, search_start,
+ search_start + em_len - 1,
+ EXTENT_DELALLOC_NEW,
+ NULL, cached_state, GFP_NOFS);
+next:
+ search_start = extent_map_end(em);
+ free_extent_map(em);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
/*
* This function locks the extent and properly waits for data=ordered extents
* to finish before allowing the pages to be modified if need.
@@ -1432,8 +1473,11 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
+ round_up(pos + write_bytes - start_pos,
fs_info->sectorsize) - 1;
- if (start_pos < inode->vfs_inode.i_size) {
+ if (start_pos < inode->vfs_inode.i_size ||
+ (inode->flags & BTRFS_INODE_PREALLOC)) {
struct btrfs_ordered_extent *ordered;
+ unsigned int clear_bits;
+
lock_extent_bits(&inode->io_tree, start_pos, last_pos,
cached_state);
ordered = btrfs_lookup_ordered_range(inode, start_pos,
@@ -1454,11 +1498,19 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
}
if (ordered)
btrfs_put_ordered_extent(ordered);
-
+ ret = btrfs_find_new_delalloc_bytes(inode, start_pos,
+ last_pos - start_pos + 1,
+ cached_state);
+ clear_bits = EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG;
+ if (ret)
+ clear_bits |= EXTENT_DELALLOC_NEW | EXTENT_LOCKED;
clear_extent_bit(&inode->io_tree, start_pos,
- last_pos, EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
- 0, 0, cached_state, GFP_NOFS);
+ last_pos, clear_bits,
+ (clear_bits & EXTENT_LOCKED) ? 1 : 0,
+ 0, cached_state, GFP_NOFS);
+ if (ret)
+ return ret;
*lockstart = start_pos;
*lockend = last_pos;
ret = 1;
@@ -2342,13 +2394,8 @@ static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len)
int ret = 0;
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, *start, *len, 0);
- if (IS_ERR_OR_NULL(em)) {
- if (!em)
- ret = -ENOMEM;
- else
- ret = PTR_ERR(em);
- return ret;
- }
+ if (IS_ERR(em))
+ return PTR_ERR(em);
/* Hole or vacuum extent(only exists in no-hole mode) */
if (em->block_start == EXTENT_MAP_HOLE) {
@@ -2835,11 +2882,8 @@ static long btrfs_fallocate(struct file *file, int mode,
while (1) {
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset,
alloc_end - cur_offset, 0);
- if (IS_ERR_OR_NULL(em)) {
- if (!em)
- ret = -ENOMEM;
- else
- ret = PTR_ERR(em);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
break;
}
last_byte = min(extent_map_end(em), alloc_end);
@@ -2856,8 +2900,10 @@ static long btrfs_fallocate(struct file *file, int mode,
}
ret = btrfs_qgroup_reserve_data(inode, cur_offset,
last_byte - cur_offset);
- if (ret < 0)
+ if (ret < 0) {
+ free_extent_map(em);
break;
+ }
} else {
/*
* Do not need to reserve unwritten extent for this
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index da6841efac26..c5e6180cdb8c 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -355,7 +355,7 @@ static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear)
io_ctl->orig = io_ctl->cur;
io_ctl->size = PAGE_SIZE;
if (clear)
- memset(io_ctl->cur, 0, PAGE_SIZE);
+ clear_page(io_ctl->cur);
}
static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index dd7fb22a955a..fc0bd8406758 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -167,8 +167,7 @@ static u8 *alloc_bitmap(u32 bitmap_size)
if (mem)
return mem;
- return __vmalloc(bitmap_size, GFP_NOFS | __GFP_HIGHMEM | __GFP_ZERO,
- PAGE_KERNEL);
+ return __vmalloc(bitmap_size, GFP_NOFS | __GFP_ZERO, PAGE_KERNEL);
}
int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5e71f1ea3391..17cbe9306faf 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -115,6 +115,31 @@ static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
u64 ram_bytes, int compress_type,
int type);
+static void __endio_write_update_ordered(struct inode *inode,
+ const u64 offset, const u64 bytes,
+ const bool uptodate);
+
+/*
+ * Cleanup all submitted ordered extents in specified range to handle errors
+ * from the fill_dellaloc() callback.
+ *
+ * NOTE: caller must ensure that when an error happens, it can not call
+ * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
+ * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata
+ * to be released, which we want to happen only when finishing the ordered
+ * extent (btrfs_finish_ordered_io()). Also note that the caller of the
+ * fill_delalloc() callback already does proper cleanup for the first page of
+ * the range, that is, it invokes the callback writepage_end_io_hook() for the
+ * range of the first page.
+ */
+static inline void btrfs_cleanup_ordered_extents(struct inode *inode,
+ const u64 offset,
+ const u64 bytes)
+{
+ return __endio_write_update_ordered(inode, offset + PAGE_SIZE,
+ bytes - PAGE_SIZE, false);
+}
+
static int btrfs_dirty_inode(struct inode *inode);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
@@ -547,7 +572,7 @@ cont:
}
if (ret <= 0) {
unsigned long clear_flags = EXTENT_DELALLOC |
- EXTENT_DEFRAG;
+ EXTENT_DELALLOC_NEW | EXTENT_DEFRAG;
unsigned long page_error_op;
clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0;
@@ -565,8 +590,10 @@ cont:
PAGE_SET_WRITEBACK |
page_error_op |
PAGE_END_WRITEBACK);
- btrfs_free_reserved_data_space_noquota(inode, start,
- end - start + 1);
+ if (ret == 0)
+ btrfs_free_reserved_data_space_noquota(inode,
+ start,
+ end - start + 1);
goto free_pages_out;
}
}
@@ -852,6 +879,7 @@ out_free:
async_extent->start +
async_extent->ram_size - 1,
NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
+ EXTENT_DELALLOC_NEW |
EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
@@ -918,10 +946,13 @@ static noinline int cow_file_range(struct inode *inode,
u64 num_bytes;
unsigned long ram_size;
u64 disk_num_bytes;
- u64 cur_alloc_size;
+ u64 cur_alloc_size = 0;
u64 blocksize = fs_info->sectorsize;
struct btrfs_key ins;
struct extent_map *em;
+ unsigned clear_bits;
+ unsigned long page_ops;
+ bool extent_reserved = false;
int ret = 0;
if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
@@ -944,6 +975,7 @@ static noinline int cow_file_range(struct inode *inode,
extent_clear_unlock_delalloc(inode, start, end,
delalloc_end, NULL,
EXTENT_LOCKED | EXTENT_DELALLOC |
+ EXTENT_DELALLOC_NEW |
EXTENT_DEFRAG, PAGE_UNLOCK |
PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
PAGE_END_WRITEBACK);
@@ -966,14 +998,14 @@ static noinline int cow_file_range(struct inode *inode,
start + num_bytes - 1, 0);
while (disk_num_bytes > 0) {
- unsigned long op;
-
cur_alloc_size = disk_num_bytes;
ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
fs_info->sectorsize, 0, alloc_hint,
&ins, 1, 1);
if (ret < 0)
goto out_unlock;
+ cur_alloc_size = ins.offset;
+ extent_reserved = true;
ram_size = ins.offset;
em = create_io_em(inode, start, ins.offset, /* len */
@@ -988,7 +1020,6 @@ static noinline int cow_file_range(struct inode *inode,
goto out_reserve;
free_extent_map(em);
- cur_alloc_size = ins.offset;
ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
ram_size, cur_alloc_size, 0);
if (ret)
@@ -998,15 +1029,24 @@ static noinline int cow_file_range(struct inode *inode,
BTRFS_DATA_RELOC_TREE_OBJECTID) {
ret = btrfs_reloc_clone_csums(inode, start,
cur_alloc_size);
+ /*
+ * Only drop cache here, and process as normal.
+ *
+ * We must not allow extent_clear_unlock_delalloc()
+ * at out_unlock label to free meta of this ordered
+ * extent, as its meta should be freed by
+ * btrfs_finish_ordered_io().
+ *
+ * So we must continue until @start is increased to
+ * skip current ordered extent.
+ */
if (ret)
- goto out_drop_extent_cache;
+ btrfs_drop_extent_cache(BTRFS_I(inode), start,
+ start + ram_size - 1, 0);
}
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
- if (disk_num_bytes < cur_alloc_size)
- break;
-
/* we're not doing compressed IO, don't unlock the first
* page (which the caller expects to stay locked), don't
* clear any dirty bits and don't set any writeback bits
@@ -1014,18 +1054,30 @@ static noinline int cow_file_range(struct inode *inode,
* Do set the Private2 bit so we know this page was properly
* setup for writepage
*/
- op = unlock ? PAGE_UNLOCK : 0;
- op |= PAGE_SET_PRIVATE2;
+ page_ops = unlock ? PAGE_UNLOCK : 0;
+ page_ops |= PAGE_SET_PRIVATE2;
extent_clear_unlock_delalloc(inode, start,
start + ram_size - 1,
delalloc_end, locked_page,
EXTENT_LOCKED | EXTENT_DELALLOC,
- op);
- disk_num_bytes -= cur_alloc_size;
+ page_ops);
+ if (disk_num_bytes < cur_alloc_size)
+ disk_num_bytes = 0;
+ else
+ disk_num_bytes -= cur_alloc_size;
num_bytes -= cur_alloc_size;
alloc_hint = ins.objectid + ins.offset;
start += cur_alloc_size;
+ extent_reserved = false;
+
+ /*
+ * btrfs_reloc_clone_csums() error, since start is increased
+ * extent_clear_unlock_delalloc() at out_unlock label won't
+ * free metadata of current ordered extent, we're OK to exit.
+ */
+ if (ret)
+ goto out_unlock;
}
out:
return ret;
@@ -1036,12 +1088,35 @@ out_reserve:
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
out_unlock:
+ clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
+ EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
+ page_ops = PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
+ PAGE_END_WRITEBACK;
+ /*
+ * If we reserved an extent for our delalloc range (or a subrange) and
+ * failed to create the respective ordered extent, then it means that
+ * when we reserved the extent we decremented the extent's size from
+ * the data space_info's bytes_may_use counter and incremented the
+ * space_info's bytes_reserved counter by the same amount. We must make
+ * sure extent_clear_unlock_delalloc() does not try to decrement again
+ * the data space_info's bytes_may_use counter, therefore we do not pass
+ * it the flag EXTENT_CLEAR_DATA_RESV.
+ */
+ if (extent_reserved) {
+ extent_clear_unlock_delalloc(inode, start,
+ start + cur_alloc_size,
+ start + cur_alloc_size,
+ locked_page,
+ clear_bits,
+ page_ops);
+ start += cur_alloc_size;
+ if (start >= end)
+ goto out;
+ }
extent_clear_unlock_delalloc(inode, start, end, delalloc_end,
locked_page,
- EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
- EXTENT_DELALLOC | EXTENT_DEFRAG,
- PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
- PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK);
+ clear_bits | EXTENT_CLEAR_DATA_RESV,
+ page_ops);
goto out;
}
@@ -1414,15 +1489,14 @@ out_check:
BUG_ON(ret); /* -ENOMEM */
if (root->root_key.objectid ==
- BTRFS_DATA_RELOC_TREE_OBJECTID) {
+ BTRFS_DATA_RELOC_TREE_OBJECTID)
+ /*
+ * Error handled later, as we must prevent
+ * extent_clear_unlock_delalloc() in error handler
+ * from freeing metadata of created ordered extent.
+ */
ret = btrfs_reloc_clone_csums(inode, cur_offset,
num_bytes);
- if (ret) {
- if (!nolock && nocow)
- btrfs_end_write_no_snapshoting(root);
- goto error;
- }
- }
extent_clear_unlock_delalloc(inode, cur_offset,
cur_offset + num_bytes - 1, end,
@@ -1434,6 +1508,14 @@ out_check:
if (!nolock && nocow)
btrfs_end_write_no_snapshoting(root);
cur_offset = extent_end;
+
+ /*
+ * btrfs_reloc_clone_csums() error, now we're OK to call error
+ * handler, as metadata for created ordered extent will only
+ * be freed by btrfs_finish_ordered_io().
+ */
+ if (ret)
+ goto error;
if (cur_offset > end)
break;
}
@@ -1509,6 +1591,8 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
ret = cow_file_range_async(inode, locked_page, start, end,
page_started, nr_written);
}
+ if (ret)
+ btrfs_cleanup_ordered_extents(inode, start, end - start + 1);
return ret;
}
@@ -1693,6 +1777,14 @@ static void btrfs_set_bit_hook(struct inode *inode,
btrfs_add_delalloc_inodes(root, inode);
spin_unlock(&BTRFS_I(inode)->lock);
}
+
+ if (!(state->state & EXTENT_DELALLOC_NEW) &&
+ (*bits & EXTENT_DELALLOC_NEW)) {
+ spin_lock(&BTRFS_I(inode)->lock);
+ BTRFS_I(inode)->new_delalloc_bytes += state->end + 1 -
+ state->start;
+ spin_unlock(&BTRFS_I(inode)->lock);
+ }
}
/*
@@ -1722,7 +1814,7 @@ static void btrfs_clear_bit_hook(struct btrfs_inode *inode,
if (*bits & EXTENT_FIRST_DELALLOC) {
*bits &= ~EXTENT_FIRST_DELALLOC;
- } else if (!(*bits & EXTENT_DO_ACCOUNTING)) {
+ } else if (!(*bits & EXTENT_CLEAR_META_RESV)) {
spin_lock(&inode->lock);
inode->outstanding_extents -= num_extents;
spin_unlock(&inode->lock);
@@ -1733,7 +1825,7 @@ static void btrfs_clear_bit_hook(struct btrfs_inode *inode,
* don't need to call dellalloc_release_metadata if there is an
* error.
*/
- if (*bits & EXTENT_DO_ACCOUNTING &&
+ if (*bits & EXTENT_CLEAR_META_RESV &&
root != fs_info->tree_root)
btrfs_delalloc_release_metadata(inode, len);
@@ -1741,10 +1833,9 @@ static void btrfs_clear_bit_hook(struct btrfs_inode *inode,
if (btrfs_is_testing(fs_info))
return;
- if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
- && do_list && !(state->state & EXTENT_NORESERVE)
- && (*bits & (EXTENT_DO_ACCOUNTING |
- EXTENT_CLEAR_DATA_RESV)))
+ if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID &&
+ do_list && !(state->state & EXTENT_NORESERVE) &&
+ (*bits & EXTENT_CLEAR_DATA_RESV))
btrfs_free_reserved_data_space_noquota(
&inode->vfs_inode,
state->start, len);
@@ -1759,6 +1850,14 @@ static void btrfs_clear_bit_hook(struct btrfs_inode *inode,
btrfs_del_delalloc_inode(root, inode);
spin_unlock(&inode->lock);
}
+
+ if ((state->state & EXTENT_DELALLOC_NEW) &&
+ (*bits & EXTENT_DELALLOC_NEW)) {
+ spin_lock(&inode->lock);
+ ASSERT(inode->new_delalloc_bytes >= len);
+ inode->new_delalloc_bytes -= len;
+ spin_unlock(&inode->lock);
+ }
}
/*
@@ -2791,6 +2890,13 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
u64 logical_len = ordered_extent->len;
bool nolock;
bool truncated = false;
+ bool range_locked = false;
+ bool clear_new_delalloc_bytes = false;
+
+ if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
+ !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
+ !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags))
+ clear_new_delalloc_bytes = true;
nolock = btrfs_is_free_space_inode(BTRFS_I(inode));
@@ -2839,6 +2945,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
goto out;
}
+ range_locked = true;
lock_extent_bits(io_tree, ordered_extent->file_offset,
ordered_extent->file_offset + ordered_extent->len - 1,
&cached_state);
@@ -2864,7 +2971,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
trans = NULL;
- goto out_unlock;
+ goto out;
}
trans->block_rsv = &fs_info->delalloc_block_rsv;
@@ -2896,7 +3003,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
trans->transid);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
- goto out_unlock;
+ goto out;
}
add_pending_csums(trans, inode, &ordered_extent->list);
@@ -2905,14 +3012,26 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
ret = btrfs_update_inode_fallback(trans, root, inode);
if (ret) { /* -ENOMEM or corruption */
btrfs_abort_transaction(trans, ret);
- goto out_unlock;
+ goto out;
}
ret = 0;
-out_unlock:
- unlock_extent_cached(io_tree, ordered_extent->file_offset,
- ordered_extent->file_offset +
- ordered_extent->len - 1, &cached_state, GFP_NOFS);
out:
+ if (range_locked || clear_new_delalloc_bytes) {
+ unsigned int clear_bits = 0;
+
+ if (range_locked)
+ clear_bits |= EXTENT_LOCKED;
+ if (clear_new_delalloc_bytes)
+ clear_bits |= EXTENT_DELALLOC_NEW;
+ clear_extent_bit(&BTRFS_I(inode)->io_tree,
+ ordered_extent->file_offset,
+ ordered_extent->file_offset +
+ ordered_extent->len - 1,
+ clear_bits,
+ (clear_bits & EXTENT_LOCKED) ? 1 : 0,
+ 0, &cached_state, GFP_NOFS);
+ }
+
if (root != fs_info->tree_root)
btrfs_delalloc_release_metadata(BTRFS_I(inode),
ordered_extent->len);
@@ -4401,9 +4520,17 @@ search_again:
if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
item_end +=
btrfs_file_extent_num_bytes(leaf, fi);
+
+ trace_btrfs_truncate_show_fi_regular(
+ BTRFS_I(inode), leaf, fi,
+ found_key.offset);
} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
item_end += btrfs_file_extent_inline_len(leaf,
path->slots[0], fi);
+
+ trace_btrfs_truncate_show_fi_inline(
+ BTRFS_I(inode), leaf, fi, path->slots[0],
+ found_key.offset);
}
item_end--;
}
@@ -4603,13 +4730,6 @@ error:
btrfs_free_path(path);
- if (err == 0) {
- /* only inline file may have last_size != new_size */
- if (new_size >= fs_info->sectorsize ||
- new_size > fs_info->max_inline)
- ASSERT(last_size == new_size);
- }
-
if (be_nice && bytes_deleted > SZ_32M) {
unsigned long updates = trans->delayed_ref_updates;
if (updates) {
@@ -6735,7 +6855,6 @@ static noinline int uncompress_inline(struct btrfs_path *path,
*
* This also copies inline extents directly into the page.
*/
-
struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
struct page *page,
size_t pg_offset, u64 start, u64 len,
@@ -6835,11 +6954,18 @@ again:
found_type == BTRFS_FILE_EXTENT_PREALLOC) {
extent_end = extent_start +
btrfs_file_extent_num_bytes(leaf, item);
+
+ trace_btrfs_get_extent_show_fi_regular(inode, leaf, item,
+ extent_start);
} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
size_t size;
size = btrfs_file_extent_inline_len(leaf, path->slots[0], item);
extent_end = ALIGN(extent_start + size,
fs_info->sectorsize);
+
+ trace_btrfs_get_extent_show_fi_inline(inode, leaf, item,
+ path->slots[0],
+ extent_start);
}
next:
if (start >= extent_end) {
@@ -7037,19 +7163,17 @@ struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
em = btrfs_get_extent(inode, page, pg_offset, start, len, create);
if (IS_ERR(em))
return em;
- if (em) {
- /*
- * if our em maps to
- * - a hole or
- * - a pre-alloc extent,
- * there might actually be delalloc bytes behind it.
- */
- if (em->block_start != EXTENT_MAP_HOLE &&
- !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
- return em;
- else
- hole_em = em;
- }
+ /*
+ * If our em maps to:
+ * - a hole or
+ * - a pre-alloc extent,
+ * there might actually be delalloc bytes behind it.
+ */
+ if (em->block_start != EXTENT_MAP_HOLE &&
+ !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ return em;
+ else
+ hole_em = em;
/* check to see if we've wrapped (len == -1 or similar) */
end = start + len;
@@ -8127,17 +8251,26 @@ static void btrfs_endio_direct_read(struct bio *bio)
bio_put(bio);
}
-static void btrfs_endio_direct_write_update_ordered(struct inode *inode,
- const u64 offset,
- const u64 bytes,
- const int uptodate)
+static void __endio_write_update_ordered(struct inode *inode,
+ const u64 offset, const u64 bytes,
+ const bool uptodate)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ordered_extent *ordered = NULL;
+ struct btrfs_workqueue *wq;
+ btrfs_work_func_t func;
u64 ordered_offset = offset;
u64 ordered_bytes = bytes;
int ret;
+ if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
+ wq = fs_info->endio_freespace_worker;
+ func = btrfs_freespace_write_helper;
+ } else {
+ wq = fs_info->endio_write_workers;
+ func = btrfs_endio_write_helper;
+ }
+
again:
ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
&ordered_offset,
@@ -8146,9 +8279,8 @@ again:
if (!ret)
goto out_test;
- btrfs_init_work(&ordered->work, btrfs_endio_write_helper,
- finish_ordered_fn, NULL, NULL);
- btrfs_queue_work(fs_info->endio_write_workers, &ordered->work);
+ btrfs_init_work(&ordered->work, func, finish_ordered_fn, NULL, NULL);
+ btrfs_queue_work(wq, &ordered->work);
out_test:
/*
* our bio might span multiple ordered extents. If we haven't
@@ -8166,10 +8298,8 @@ static void btrfs_endio_direct_write(struct bio *bio)
struct btrfs_dio_private *dip = bio->bi_private;
struct bio *dio_bio = dip->dio_bio;
- btrfs_endio_direct_write_update_ordered(dip->inode,
- dip->logical_offset,
- dip->bytes,
- !bio->bi_error);
+ __endio_write_update_ordered(dip->inode, dip->logical_offset,
+ dip->bytes, !bio->bi_error);
kfree(dip);
@@ -8530,10 +8660,10 @@ free_ordered:
io_bio = NULL;
} else {
if (write)
- btrfs_endio_direct_write_update_ordered(inode,
+ __endio_write_update_ordered(inode,
file_offset,
dio_bio->bi_iter.bi_size,
- 0);
+ false);
else
unlock_extent(&BTRFS_I(inode)->io_tree, file_offset,
file_offset + dio_bio->bi_iter.bi_size - 1);
@@ -8668,11 +8798,11 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
*/
if (dio_data.unsubmitted_oe_range_start <
dio_data.unsubmitted_oe_range_end)
- btrfs_endio_direct_write_update_ordered(inode,
+ __endio_write_update_ordered(inode,
dio_data.unsubmitted_oe_range_start,
dio_data.unsubmitted_oe_range_end -
dio_data.unsubmitted_oe_range_start,
- 0);
+ false);
} else if (ret >= 0 && (size_t)ret < count)
btrfs_delalloc_release_space(inode, offset,
count - (size_t)ret);
@@ -8819,6 +8949,7 @@ again:
if (!inode_evicting)
clear_extent_bit(tree, start, end,
EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DELALLOC_NEW |
EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
EXTENT_DEFRAG, 1, 0, &cached_state,
GFP_NOFS);
@@ -8876,8 +9007,8 @@ again:
if (!inode_evicting) {
clear_extent_bit(tree, page_start, page_end,
EXTENT_LOCKED | EXTENT_DIRTY |
- EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
- EXTENT_DEFRAG, 1, 1,
+ EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
+ EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
&cached_state, GFP_NOFS);
__btrfs_releasepage(page, GFP_NOFS);
@@ -9248,6 +9379,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->last_sub_trans = 0;
ei->logged_trans = 0;
ei->delalloc_bytes = 0;
+ ei->new_delalloc_bytes = 0;
ei->defrag_bytes = 0;
ei->disk_i_size = 0;
ei->flags = 0;
@@ -9313,6 +9445,7 @@ void btrfs_destroy_inode(struct inode *inode)
WARN_ON(BTRFS_I(inode)->outstanding_extents);
WARN_ON(BTRFS_I(inode)->reserved_extents);
WARN_ON(BTRFS_I(inode)->delalloc_bytes);
+ WARN_ON(BTRFS_I(inode)->new_delalloc_bytes);
WARN_ON(BTRFS_I(inode)->csum_bytes);
WARN_ON(BTRFS_I(inode)->defrag_bytes);
@@ -9436,7 +9569,7 @@ static int btrfs_getattr(const struct path *path, struct kstat *stat,
stat->dev = BTRFS_I(inode)->root->anon_dev;
spin_lock(&BTRFS_I(inode)->lock);
- delalloc_bytes = BTRFS_I(inode)->delalloc_bytes;
+ delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes;
spin_unlock(&BTRFS_I(inode)->lock);
stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) +
ALIGN(delalloc_bytes, blocksize)) >> 9;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index dabfc7ac48a6..e176375f374f 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1504,7 +1504,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
if (ret)
return ret;
- if (atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) {
+ if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
mnt_drop_write_file(file);
return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
}
@@ -1619,7 +1619,7 @@ out_free:
kfree(vol_args);
out:
mutex_unlock(&fs_info->volume_mutex);
- atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+ clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
mnt_drop_write_file(file);
return ret;
}
@@ -2661,7 +2661,7 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1))
+ if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags))
return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
mutex_lock(&fs_info->volume_mutex);
@@ -2680,7 +2680,7 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg)
kfree(vol_args);
out:
mutex_unlock(&fs_info->volume_mutex);
- atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+ clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
return ret;
}
@@ -2708,7 +2708,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
if (vol_args->flags & ~BTRFS_VOL_ARG_V2_FLAGS_SUPPORTED)
return -EOPNOTSUPP;
- if (atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) {
+ if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
goto out;
}
@@ -2721,7 +2721,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
ret = btrfs_rm_device(fs_info, vol_args->name, 0);
}
mutex_unlock(&fs_info->volume_mutex);
- atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+ clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
if (!ret) {
if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID)
@@ -2752,7 +2752,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
if (ret)
return ret;
- if (atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) {
+ if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
goto out_drop_write;
}
@@ -2772,7 +2772,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
btrfs_info(fs_info, "disk deleted %s", vol_args->name);
kfree(vol_args);
out:
- atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+ clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
out_drop_write:
mnt_drop_write_file(file);
@@ -3539,12 +3539,9 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
u64 last_dest_end = destoff;
ret = -ENOMEM;
- buf = kmalloc(fs_info->nodesize, GFP_KERNEL | __GFP_NOWARN);
- if (!buf) {
- buf = vmalloc(fs_info->nodesize);
- if (!buf)
- return ret;
- }
+ buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
+ if (!buf)
+ return ret;
path = btrfs_alloc_path();
if (!path) {
@@ -4442,13 +4439,11 @@ static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info,
ret = -EROFS;
goto out;
}
- if (atomic_xchg(
- &fs_info->mutually_exclusive_operation_running, 1)) {
+ if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
} else {
ret = btrfs_dev_replace_by_ioctl(fs_info, p);
- atomic_set(
- &fs_info->mutually_exclusive_operation_running, 0);
+ clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
}
break;
case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS:
@@ -4643,7 +4638,7 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
return ret;
again:
- if (!atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) {
+ if (!test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
mutex_lock(&fs_info->volume_mutex);
mutex_lock(&fs_info->balance_mutex);
need_unlock = true;
@@ -4689,7 +4684,7 @@ again:
}
locked:
- BUG_ON(!atomic_read(&fs_info->mutually_exclusive_operation_running));
+ BUG_ON(!test_bit(BTRFS_FS_EXCL_OP, &fs_info->flags));
if (arg) {
bargs = memdup_user(arg, sizeof(*bargs));
@@ -4745,11 +4740,10 @@ locked:
do_balance:
/*
- * Ownership of bctl and mutually_exclusive_operation_running
+ * Ownership of bctl and filesystem flag BTRFS_FS_EXCL_OP
* goes to to btrfs_balance. bctl is freed in __cancel_balance,
* or, if restriper was paused all the way until unmount, in
- * free_fs_info. mutually_exclusive_operation_running is
- * cleared in __cancel_balance.
+ * free_fs_info. The flag is cleared in __cancel_balance.
*/
need_unlock = false;
@@ -4769,7 +4763,7 @@ out_unlock:
mutex_unlock(&fs_info->balance_mutex);
mutex_unlock(&fs_info->volume_mutex);
if (need_unlock)
- atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+ clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
out:
mnt_drop_write_file(file);
return ret;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 9a46878ba60f..7b40e2e7292a 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -212,7 +212,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
set_bit(BTRFS_ORDERED_DIRECT, &entry->flags);
/* one ref for the tree */
- atomic_set(&entry->refs, 1);
+ refcount_set(&entry->refs, 1);
init_waitqueue_head(&entry->wait);
INIT_LIST_HEAD(&entry->list);
INIT_LIST_HEAD(&entry->root_extent_list);
@@ -358,7 +358,7 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode,
out:
if (!ret && cached && entry) {
*cached = entry;
- atomic_inc(&entry->refs);
+ refcount_inc(&entry->refs);
}
spin_unlock_irqrestore(&tree->lock, flags);
return ret == 0;
@@ -425,7 +425,7 @@ have_entry:
out:
if (!ret && cached && entry) {
*cached = entry;
- atomic_inc(&entry->refs);
+ refcount_inc(&entry->refs);
}
spin_unlock_irqrestore(&tree->lock, flags);
return ret == 0;
@@ -456,7 +456,7 @@ void btrfs_get_logged_extents(struct btrfs_inode *inode,
if (test_and_set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
continue;
list_add(&ordered->log_list, logged_list);
- atomic_inc(&ordered->refs);
+ refcount_inc(&ordered->refs);
}
spin_unlock_irq(&tree->lock);
}
@@ -565,7 +565,7 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
trace_btrfs_ordered_extent_put(entry->inode, entry);
- if (atomic_dec_and_test(&entry->refs)) {
+ if (refcount_dec_and_test(&entry->refs)) {
ASSERT(list_empty(&entry->log_list));
ASSERT(list_empty(&entry->trans_list));
ASSERT(list_empty(&entry->root_extent_list));
@@ -623,7 +623,7 @@ void btrfs_remove_ordered_extent(struct inode *inode,
spin_lock(&fs_info->trans_lock);
trans = fs_info->running_transaction;
if (trans)
- atomic_inc(&trans->use_count);
+ refcount_inc(&trans->use_count);
spin_unlock(&fs_info->trans_lock);
ASSERT(trans);
@@ -690,7 +690,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr,
list_move_tail(&ordered->root_extent_list,
&root->ordered_extents);
- atomic_inc(&ordered->refs);
+ refcount_inc(&ordered->refs);
spin_unlock(&root->ordered_extent_lock);
btrfs_init_work(&ordered->flush_work,
@@ -870,7 +870,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
if (!offset_in_entry(entry, file_offset))
entry = NULL;
if (entry)
- atomic_inc(&entry->refs);
+ refcount_inc(&entry->refs);
out:
spin_unlock_irq(&tree->lock);
return entry;
@@ -911,7 +911,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
}
out:
if (entry)
- atomic_inc(&entry->refs);
+ refcount_inc(&entry->refs);
spin_unlock_irq(&tree->lock);
return entry;
}
@@ -948,7 +948,7 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
goto out;
entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
- atomic_inc(&entry->refs);
+ refcount_inc(&entry->refs);
out:
spin_unlock_irq(&tree->lock);
return entry;
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 195c93b67fe0..e0c1d5b8d859 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -113,7 +113,7 @@ struct btrfs_ordered_extent {
int compress_type;
/* reference count */
- atomic_t refs;
+ refcount_t refs;
/* the inode we belong to */
struct inode *inode;
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index a59801dc2a34..deffbeb74a0b 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -47,50 +47,6 @@
* - check all ioctl parameters
*/
-/*
- * one struct for each qgroup, organized in fs_info->qgroup_tree.
- */
-struct btrfs_qgroup {
- u64 qgroupid;
-
- /*
- * state
- */
- u64 rfer; /* referenced */
- u64 rfer_cmpr; /* referenced compressed */
- u64 excl; /* exclusive */
- u64 excl_cmpr; /* exclusive compressed */
-
- /*
- * limits
- */
- u64 lim_flags; /* which limits are set */
- u64 max_rfer;
- u64 max_excl;
- u64 rsv_rfer;
- u64 rsv_excl;
-
- /*
- * reservation tracking
- */
- u64 reserved;
-
- /*
- * lists
- */
- struct list_head groups; /* groups this group is member of */
- struct list_head members; /* groups that are members of this group */
- struct list_head dirty; /* dirty groups */
- struct rb_node node; /* tree of qgroups */
-
- /*
- * temp variables for accounting operations
- * Refer to qgroup_shared_accounting() for details.
- */
- u64 old_refcnt;
- u64 new_refcnt;
-};
-
static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq,
int mod)
{
@@ -1042,9 +998,12 @@ static void report_reserved_underflow(struct btrfs_fs_info *fs_info,
struct btrfs_qgroup *qgroup,
u64 num_bytes)
{
- btrfs_warn(fs_info,
+#ifdef CONFIG_BTRFS_DEBUG
+ WARN_ON(qgroup->reserved < num_bytes);
+ btrfs_debug(fs_info,
"qgroup %llu reserved space underflow, have: %llu, to free: %llu",
qgroup->qgroupid, qgroup->reserved, num_bytes);
+#endif
qgroup->reserved = 0;
}
/*
@@ -1075,7 +1034,8 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
qgroup->excl += sign * num_bytes;
qgroup->excl_cmpr += sign * num_bytes;
if (sign > 0) {
- if (WARN_ON(qgroup->reserved < num_bytes))
+ trace_qgroup_update_reserve(fs_info, qgroup, -(s64)num_bytes);
+ if (qgroup->reserved < num_bytes)
report_reserved_underflow(fs_info, qgroup, num_bytes);
else
qgroup->reserved -= num_bytes;
@@ -1100,7 +1060,9 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
WARN_ON(sign < 0 && qgroup->excl < num_bytes);
qgroup->excl += sign * num_bytes;
if (sign > 0) {
- if (WARN_ON(qgroup->reserved < num_bytes))
+ trace_qgroup_update_reserve(fs_info, qgroup,
+ -(s64)num_bytes);
+ if (qgroup->reserved < num_bytes)
report_reserved_underflow(fs_info, qgroup,
num_bytes);
else
@@ -2055,12 +2017,12 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans,
if (!ret) {
/*
- * Use (u64)-1 as time_seq to do special search, which
+ * Use SEQ_LAST as time_seq to do special search, which
* doesn't lock tree or delayed_refs and search current
* root. It's safe inside commit_transaction().
*/
ret = btrfs_find_all_roots(trans, fs_info,
- record->bytenr, (u64)-1, &new_roots);
+ record->bytenr, SEQ_LAST, &new_roots);
if (ret < 0)
goto cleanup;
if (qgroup_to_skip)
@@ -2367,6 +2329,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce)
struct btrfs_fs_info *fs_info = root->fs_info;
u64 ref_root = root->root_key.objectid;
int ret = 0;
+ int retried = 0;
struct ulist_node *unode;
struct ulist_iterator uiter;
@@ -2375,7 +2338,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce)
if (num_bytes == 0)
return 0;
-
+retry:
spin_lock(&fs_info->qgroup_lock);
quota_root = fs_info->quota_root;
if (!quota_root)
@@ -2402,6 +2365,27 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce)
qg = unode_aux_to_qgroup(unode);
if (enforce && !qgroup_check_limits(qg, num_bytes)) {
+ /*
+ * Commit the tree and retry, since we may have
+ * deletions which would free up space.
+ */
+ if (!retried && qg->reserved > 0) {
+ struct btrfs_trans_handle *trans;
+
+ spin_unlock(&fs_info->qgroup_lock);
+ ret = btrfs_start_delalloc_inodes(root, 0);
+ if (ret)
+ return ret;
+ btrfs_wait_ordered_extents(root, -1, 0, (u64)-1);
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+ ret = btrfs_commit_transaction(trans);
+ if (ret)
+ return ret;
+ retried++;
+ goto retry;
+ }
ret = -EDQUOT;
goto out;
}
@@ -2424,6 +2408,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce)
qg = unode_aux_to_qgroup(unode);
+ trace_qgroup_update_reserve(fs_info, qg, num_bytes);
qg->reserved += num_bytes;
}
@@ -2469,7 +2454,8 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
qg = unode_aux_to_qgroup(unode);
- if (WARN_ON(qg->reserved < num_bytes))
+ trace_qgroup_update_reserve(fs_info, qg, -(s64)num_bytes);
+ if (qg->reserved < num_bytes)
report_reserved_underflow(fs_info, qg, num_bytes);
else
qg->reserved -= num_bytes;
@@ -2487,18 +2473,6 @@ out:
spin_unlock(&fs_info->qgroup_lock);
}
-void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
-{
- if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq)
- return;
- btrfs_err(trans->fs_info,
- "qgroups not uptodate in trans handle %p: list is%s empty, seq is %#x.%x",
- trans, list_empty(&trans->qgroup_ref_list) ? "" : " not",
- (u32)(trans->delayed_ref_elem.seq >> 32),
- (u32)trans->delayed_ref_elem.seq);
- BUG();
-}
-
/*
* returns < 0 on error, 0 when more leafs are to be scanned.
* returns 1 when done.
@@ -2886,14 +2860,14 @@ static int __btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len,
if (ret < 0)
goto out;
- if (free) {
- btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info,
- BTRFS_I(inode)->root->objectid,
- changeset.bytes_changed);
+ if (free)
trace_op = QGROUP_FREE;
- }
trace_btrfs_qgroup_release_data(inode, start, len,
changeset.bytes_changed, trace_op);
+ if (free)
+ btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info,
+ BTRFS_I(inode)->root->objectid,
+ changeset.bytes_changed);
out:
ulist_release(&changeset.range_changed);
return ret;
@@ -2945,6 +2919,7 @@ int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
return 0;
BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
+ trace_qgroup_meta_reserve(root, (s64)num_bytes);
ret = qgroup_reserve(root, num_bytes, enforce);
if (ret < 0)
return ret;
@@ -2964,6 +2939,7 @@ void btrfs_qgroup_free_meta_all(struct btrfs_root *root)
reserved = atomic64_xchg(&root->qgroup_meta_rsv, 0);
if (reserved == 0)
return;
+ trace_qgroup_meta_reserve(root, -(s64)reserved);
btrfs_qgroup_free_refroot(fs_info, root->objectid, reserved);
}
@@ -2978,6 +2954,7 @@ void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes)
BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
WARN_ON(atomic64_read(&root->qgroup_meta_rsv) < num_bytes);
atomic64_sub(num_bytes, &root->qgroup_meta_rsv);
+ trace_qgroup_meta_reserve(root, -(s64)num_bytes);
btrfs_qgroup_free_refroot(fs_info, root->objectid, num_bytes);
}
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 26932a8a1993..fe04d3f295c6 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -62,6 +62,50 @@ struct btrfs_qgroup_extent_record {
};
/*
+ * one struct for each qgroup, organized in fs_info->qgroup_tree.
+ */
+struct btrfs_qgroup {
+ u64 qgroupid;
+
+ /*
+ * state
+ */
+ u64 rfer; /* referenced */
+ u64 rfer_cmpr; /* referenced compressed */
+ u64 excl; /* exclusive */
+ u64 excl_cmpr; /* exclusive compressed */
+
+ /*
+ * limits
+ */
+ u64 lim_flags; /* which limits are set */
+ u64 max_rfer;
+ u64 max_excl;
+ u64 rsv_rfer;
+ u64 rsv_excl;
+
+ /*
+ * reservation tracking
+ */
+ u64 reserved;
+
+ /*
+ * lists
+ */
+ struct list_head groups; /* groups this group is member of */
+ struct list_head members; /* groups that are members of this group */
+ struct list_head dirty; /* dirty groups */
+ struct rb_node node; /* tree of qgroups */
+
+ /*
+ * temp variables for accounting operations
+ * Refer to qgroup_shared_accounting() for details.
+ */
+ u64 old_refcnt;
+ u64 new_refcnt;
+};
+
+/*
* For qgroup event trace points only
*/
#define QGROUP_RESERVE (1<<0)
@@ -186,17 +230,12 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
struct btrfs_qgroup_inherit *inherit);
void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
u64 ref_root, u64 num_bytes);
-/*
- * TODO: Add proper trace point for it, as btrfs_qgroup_free() is
- * called by everywhere, can't provide good trace for delayed ref case.
- */
static inline void btrfs_qgroup_free_delayed_ref(struct btrfs_fs_info *fs_info,
u64 ref_root, u64 num_bytes)
{
- btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes);
trace_btrfs_qgroup_free_delayed_ref(fs_info, ref_root, num_bytes);
+ btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes);
}
-void assert_qgroups_uptodate(struct btrfs_trans_handle *trans);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 1571bf26dc07..d8ea0eb76325 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -149,7 +149,7 @@ struct btrfs_raid_bio {
int generic_bio_cnt;
- atomic_t refs;
+ refcount_t refs;
atomic_t stripes_pending;
@@ -389,7 +389,7 @@ static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
if (bio_list_empty(&rbio->bio_list)) {
if (!list_empty(&rbio->hash_list)) {
list_del_init(&rbio->hash_list);
- atomic_dec(&rbio->refs);
+ refcount_dec(&rbio->refs);
BUG_ON(!list_empty(&rbio->plug_list));
}
}
@@ -480,7 +480,7 @@ static void cache_rbio(struct btrfs_raid_bio *rbio)
/* bump our ref if we were not in the list before */
if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
- atomic_inc(&rbio->refs);
+ refcount_inc(&rbio->refs);
if (!list_empty(&rbio->stripe_cache)){
list_move(&rbio->stripe_cache, &table->stripe_cache);
@@ -689,7 +689,7 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
test_bit(RBIO_CACHE_BIT, &cur->flags) &&
!test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
list_del_init(&cur->hash_list);
- atomic_dec(&cur->refs);
+ refcount_dec(&cur->refs);
steal_rbio(cur, rbio);
cache_drop = cur;
@@ -738,7 +738,7 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
}
}
lockit:
- atomic_inc(&rbio->refs);
+ refcount_inc(&rbio->refs);
list_add(&rbio->hash_list, &h->hash_list);
out:
spin_unlock_irqrestore(&h->lock, flags);
@@ -784,7 +784,7 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
}
list_del_init(&rbio->hash_list);
- atomic_dec(&rbio->refs);
+ refcount_dec(&rbio->refs);
/*
* we use the plug list to hold all the rbios
@@ -801,7 +801,7 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
list_del_init(&rbio->plug_list);
list_add(&next->hash_list, &h->hash_list);
- atomic_inc(&next->refs);
+ refcount_inc(&next->refs);
spin_unlock(&rbio->bio_list_lock);
spin_unlock_irqrestore(&h->lock, flags);
@@ -843,8 +843,7 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio)
{
int i;
- WARN_ON(atomic_read(&rbio->refs) < 0);
- if (!atomic_dec_and_test(&rbio->refs))
+ if (!refcount_dec_and_test(&rbio->refs))
return;
WARN_ON(!list_empty(&rbio->stripe_cache));
@@ -997,7 +996,7 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
rbio->stripe_npages = stripe_npages;
rbio->faila = -1;
rbio->failb = -1;
- atomic_set(&rbio->refs, 1);
+ refcount_set(&rbio->refs, 1);
atomic_set(&rbio->error, 0);
atomic_set(&rbio->stripes_pending, 0);
@@ -2118,6 +2117,11 @@ int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio,
struct btrfs_raid_bio *rbio;
int ret;
+ if (generic_io) {
+ ASSERT(bbio->mirror_num == mirror_num);
+ btrfs_io_bio(bio)->mirror_num = mirror_num;
+ }
+
rbio = alloc_rbio(fs_info, bbio, stripe_len);
if (IS_ERR(rbio)) {
if (generic_io)
@@ -2194,6 +2198,8 @@ static void read_rebuild_work(struct btrfs_work *work)
/*
* The following code is used to scrub/replace the parity stripe
*
+ * Caller must have already increased bio_counter for getting @bbio.
+ *
* Note: We need make sure all the pages that add into the scrub/replace
* raid bio are correct and not be changed during the scrub/replace. That
* is those pages just hold metadata or file data with checksum.
@@ -2231,6 +2237,12 @@ raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
ASSERT(rbio->stripe_npages == stripe_nsectors);
bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);
+ /*
+ * We have already increased bio_counter when getting bbio, record it
+ * so we can free it at rbio_orig_end_io().
+ */
+ rbio->generic_bio_cnt = 1;
+
return rbio;
}
@@ -2673,6 +2685,12 @@ raid56_alloc_missing_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
return NULL;
}
+ /*
+ * When we get bbio, we have already increased bio_counter, record it
+ * so we can free it at rbio_orig_end_io()
+ */
+ rbio->generic_bio_cnt = 1;
+
return rbio;
}
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index e88bca87f5d2..a17e775a4a89 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -209,9 +209,9 @@ cleanup:
return;
}
-int btree_readahead_hook(struct btrfs_fs_info *fs_info,
- struct extent_buffer *eb, int err)
+int btree_readahead_hook(struct extent_buffer *eb, int err)
{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
int ret = 0;
struct reada_extent *re;
@@ -235,10 +235,10 @@ start_machine:
return ret;
}
-static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info,
- struct btrfs_device *dev, u64 logical,
+static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical,
struct btrfs_bio *bbio)
{
+ struct btrfs_fs_info *fs_info = dev->fs_info;
int ret;
struct reada_zone *zone;
struct btrfs_block_group_cache *cache = NULL;
@@ -270,6 +270,12 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info,
if (!zone)
return NULL;
+ ret = radix_tree_preload(GFP_KERNEL);
+ if (ret) {
+ kfree(zone);
+ return NULL;
+ }
+
zone->start = start;
zone->end = end;
INIT_LIST_HEAD(&zone->list);
@@ -299,6 +305,7 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info,
zone = NULL;
}
spin_unlock(&fs_info->reada_lock);
+ radix_tree_preload_end();
return zone;
}
@@ -313,7 +320,6 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
struct btrfs_bio *bbio = NULL;
struct btrfs_device *dev;
struct btrfs_device *prev_dev;
- u32 blocksize;
u64 length;
int real_stripes;
int nzones = 0;
@@ -334,7 +340,6 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
if (!re)
return NULL;
- blocksize = fs_info->nodesize;
re->logical = logical;
re->top = *top;
INIT_LIST_HEAD(&re->extctl);
@@ -344,10 +349,10 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
/*
* map block
*/
- length = blocksize;
+ length = fs_info->nodesize;
ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
&length, &bbio, 0);
- if (ret || !bbio || length < blocksize)
+ if (ret || !bbio || length < fs_info->nodesize)
goto error;
if (bbio->num_stripes > BTRFS_MAX_MIRRORS) {
@@ -367,7 +372,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
if (!dev->bdev)
continue;
- zone = reada_find_zone(fs_info, dev, logical, bbio);
+ zone = reada_find_zone(dev, logical, bbio);
if (!zone)
continue;
@@ -386,6 +391,10 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
goto error;
}
+ ret = radix_tree_preload(GFP_KERNEL);
+ if (ret)
+ goto error;
+
/* insert extent in reada_tree + all per-device trees, all or nothing */
btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
spin_lock(&fs_info->reada_lock);
@@ -395,13 +404,16 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
re_exist->refcnt++;
spin_unlock(&fs_info->reada_lock);
btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
+ radix_tree_preload_end();
goto error;
}
if (ret) {
spin_unlock(&fs_info->reada_lock);
btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
+ radix_tree_preload_end();
goto error;
}
+ radix_tree_preload_end();
prev_dev = NULL;
dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(
&fs_info->dev_replace);
@@ -639,9 +651,9 @@ static int reada_pick_zone(struct btrfs_device *dev)
return 1;
}
-static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
- struct btrfs_device *dev)
+static int reada_start_machine_dev(struct btrfs_device *dev)
{
+ struct btrfs_fs_info *fs_info = dev->fs_info;
struct reada_extent *re = NULL;
int mirror_num = 0;
struct extent_buffer *eb = NULL;
@@ -754,8 +766,7 @@ static void __reada_start_machine(struct btrfs_fs_info *fs_info)
list_for_each_entry(device, &fs_devices->devices, dev_list) {
if (atomic_read(&device->reada_in_flight) <
MAX_IN_FLIGHT)
- enqueued += reada_start_machine_dev(fs_info,
- device);
+ enqueued += reada_start_machine_dev(device);
}
mutex_unlock(&fs_devices->device_list_mutex);
total += enqueued;
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index a08224eab8b4..7d6bc308bf43 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -501,8 +501,9 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
struct btrfs_root_item *item = &root->root_item;
- struct timespec ct = current_fs_time(root->fs_info->sb);
+ struct timespec ct;
+ ktime_get_real_ts(&ct);
spin_lock(&root->root_item_lock);
btrfs_set_root_ctransid(item, trans->transid);
btrfs_set_stack_timespec_sec(&item->ctime, ct.tv_sec);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index b0251eb1239f..c7b45eb2403d 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -64,7 +64,7 @@ struct scrub_ctx;
#define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */
struct scrub_recover {
- atomic_t refs;
+ refcount_t refs;
struct btrfs_bio *bbio;
u64 map_length;
};
@@ -112,7 +112,7 @@ struct scrub_block {
struct scrub_page *pagev[SCRUB_MAX_PAGES_PER_BLOCK];
int page_count;
atomic_t outstanding_pages;
- atomic_t refs; /* free mem on transition to zero */
+ refcount_t refs; /* free mem on transition to zero */
struct scrub_ctx *sctx;
struct scrub_parity *sparity;
struct {
@@ -140,9 +140,9 @@ struct scrub_parity {
int nsectors;
- int stripe_len;
+ u64 stripe_len;
- atomic_t refs;
+ refcount_t refs;
struct list_head spages;
@@ -202,7 +202,7 @@ struct scrub_ctx {
* doesn't free the scrub context before or while the workers are
* doing the wakeup() call.
*/
- atomic_t refs;
+ refcount_t refs;
};
struct scrub_fixup_nodatasum {
@@ -240,6 +240,13 @@ struct scrub_warning {
struct btrfs_device *dev;
};
+struct full_stripe_lock {
+ struct rb_node node;
+ u64 logical;
+ u64 refs;
+ struct mutex mutex;
+};
+
static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
@@ -305,7 +312,7 @@ static void scrub_put_ctx(struct scrub_ctx *sctx);
static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
{
- atomic_inc(&sctx->refs);
+ refcount_inc(&sctx->refs);
atomic_inc(&sctx->bios_in_flight);
}
@@ -349,6 +356,222 @@ static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
}
/*
+ * Insert new full stripe lock into full stripe locks tree
+ *
+ * Return pointer to existing or newly inserted full_stripe_lock structure if
+ * everything works well.
+ * Return ERR_PTR(-ENOMEM) if we failed to allocate memory
+ *
+ * NOTE: caller must hold full_stripe_locks_root->lock before calling this
+ * function
+ */
+static struct full_stripe_lock *insert_full_stripe_lock(
+ struct btrfs_full_stripe_locks_tree *locks_root,
+ u64 fstripe_logical)
+{
+ struct rb_node **p;
+ struct rb_node *parent = NULL;
+ struct full_stripe_lock *entry;
+ struct full_stripe_lock *ret;
+
+ WARN_ON(!mutex_is_locked(&locks_root->lock));
+
+ p = &locks_root->root.rb_node;
+ while (*p) {
+ parent = *p;
+ entry = rb_entry(parent, struct full_stripe_lock, node);
+ if (fstripe_logical < entry->logical) {
+ p = &(*p)->rb_left;
+ } else if (fstripe_logical > entry->logical) {
+ p = &(*p)->rb_right;
+ } else {
+ entry->refs++;
+ return entry;
+ }
+ }
+
+ /* Insert new lock */
+ ret = kmalloc(sizeof(*ret), GFP_KERNEL);
+ if (!ret)
+ return ERR_PTR(-ENOMEM);
+ ret->logical = fstripe_logical;
+ ret->refs = 1;
+ mutex_init(&ret->mutex);
+
+ rb_link_node(&ret->node, parent, p);
+ rb_insert_color(&ret->node, &locks_root->root);
+ return ret;
+}
+
+/*
+ * Search for a full stripe lock of a block group
+ *
+ * Return pointer to existing full stripe lock if found
+ * Return NULL if not found
+ */
+static struct full_stripe_lock *search_full_stripe_lock(
+ struct btrfs_full_stripe_locks_tree *locks_root,
+ u64 fstripe_logical)
+{
+ struct rb_node *node;
+ struct full_stripe_lock *entry;
+
+ WARN_ON(!mutex_is_locked(&locks_root->lock));
+
+ node = locks_root->root.rb_node;
+ while (node) {
+ entry = rb_entry(node, struct full_stripe_lock, node);
+ if (fstripe_logical < entry->logical)
+ node = node->rb_left;
+ else if (fstripe_logical > entry->logical)
+ node = node->rb_right;
+ else
+ return entry;
+ }
+ return NULL;
+}
+
+/*
+ * Helper to get full stripe logical from a normal bytenr.
+ *
+ * Caller must ensure @cache is a RAID56 block group.
+ */
+static u64 get_full_stripe_logical(struct btrfs_block_group_cache *cache,
+ u64 bytenr)
+{
+ u64 ret;
+
+ /*
+ * Due to chunk item size limit, full stripe length should not be
+ * larger than U32_MAX. Just a sanity check here.
+ */
+ WARN_ON_ONCE(cache->full_stripe_len >= U32_MAX);
+
+ /*
+ * round_down() can only handle power of 2, while RAID56 full
+ * stripe length can be 64KiB * n, so we need to manually round down.
+ */
+ ret = div64_u64(bytenr - cache->key.objectid, cache->full_stripe_len) *
+ cache->full_stripe_len + cache->key.objectid;
+ return ret;
+}
+
+/*
+ * Lock a full stripe to avoid concurrency of recovery and read
+ *
+ * It's only used for profiles with parities (RAID5/6), for other profiles it
+ * does nothing.
+ *
+ * Return 0 if we locked full stripe covering @bytenr, with a mutex held.
+ * So caller must call unlock_full_stripe() at the same context.
+ *
+ * Return <0 if encounters error.
+ */
+static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
+ bool *locked_ret)
+{
+ struct btrfs_block_group_cache *bg_cache;
+ struct btrfs_full_stripe_locks_tree *locks_root;
+ struct full_stripe_lock *existing;
+ u64 fstripe_start;
+ int ret = 0;
+
+ *locked_ret = false;
+ bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
+ if (!bg_cache) {
+ ASSERT(0);
+ return -ENOENT;
+ }
+
+ /* Profiles not based on parity don't need full stripe lock */
+ if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
+ goto out;
+ locks_root = &bg_cache->full_stripe_locks_root;
+
+ fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
+
+ /* Now insert the full stripe lock */
+ mutex_lock(&locks_root->lock);
+ existing = insert_full_stripe_lock(locks_root, fstripe_start);
+ mutex_unlock(&locks_root->lock);
+ if (IS_ERR(existing)) {
+ ret = PTR_ERR(existing);
+ goto out;
+ }
+ mutex_lock(&existing->mutex);
+ *locked_ret = true;
+out:
+ btrfs_put_block_group(bg_cache);
+ return ret;
+}
+
+/*
+ * Unlock a full stripe.
+ *
+ * NOTE: Caller must ensure it's the same context calling corresponding
+ * lock_full_stripe().
+ *
+ * Return 0 if we unlock full stripe without problem.
+ * Return <0 for error
+ */
+static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
+ bool locked)
+{
+ struct btrfs_block_group_cache *bg_cache;
+ struct btrfs_full_stripe_locks_tree *locks_root;
+ struct full_stripe_lock *fstripe_lock;
+ u64 fstripe_start;
+ bool freeit = false;
+ int ret = 0;
+
+ /* If we didn't acquire full stripe lock, no need to continue */
+ if (!locked)
+ return 0;
+
+ bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
+ if (!bg_cache) {
+ ASSERT(0);
+ return -ENOENT;
+ }
+ if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
+ goto out;
+
+ locks_root = &bg_cache->full_stripe_locks_root;
+ fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
+
+ mutex_lock(&locks_root->lock);
+ fstripe_lock = search_full_stripe_lock(locks_root, fstripe_start);
+ /* Unpaired unlock_full_stripe() detected */
+ if (!fstripe_lock) {
+ WARN_ON(1);
+ ret = -ENOENT;
+ mutex_unlock(&locks_root->lock);
+ goto out;
+ }
+
+ if (fstripe_lock->refs == 0) {
+ WARN_ON(1);
+ btrfs_warn(fs_info, "full stripe lock at %llu refcount underflow",
+ fstripe_lock->logical);
+ } else {
+ fstripe_lock->refs--;
+ }
+
+ if (fstripe_lock->refs == 0) {
+ rb_erase(&fstripe_lock->node, &locks_root->root);
+ freeit = true;
+ }
+ mutex_unlock(&locks_root->lock);
+
+ mutex_unlock(&fstripe_lock->mutex);
+ if (freeit)
+ kfree(fstripe_lock);
+out:
+ btrfs_put_block_group(bg_cache);
+ return ret;
+}
+
+/*
* used for workers that require transaction commits (i.e., for the
* NOCOW case)
*/
@@ -356,7 +579,7 @@ static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
{
struct btrfs_fs_info *fs_info = sctx->fs_info;
- atomic_inc(&sctx->refs);
+ refcount_inc(&sctx->refs);
/*
* increment scrubs_running to prevent cancel requests from
* completing as long as a worker is running. we must also
@@ -447,7 +670,7 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
static void scrub_put_ctx(struct scrub_ctx *sctx)
{
- if (atomic_dec_and_test(&sctx->refs))
+ if (refcount_dec_and_test(&sctx->refs))
scrub_free_ctx(sctx);
}
@@ -462,7 +685,7 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
if (!sctx)
goto nomem;
- atomic_set(&sctx->refs, 1);
+ refcount_set(&sctx->refs, 1);
sctx->is_dev_replace = is_dev_replace;
sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
sctx->curr = -1;
@@ -857,12 +1080,14 @@ out:
static inline void scrub_get_recover(struct scrub_recover *recover)
{
- atomic_inc(&recover->refs);
+ refcount_inc(&recover->refs);
}
-static inline void scrub_put_recover(struct scrub_recover *recover)
+static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
+ struct scrub_recover *recover)
{
- if (atomic_dec_and_test(&recover->refs)) {
+ if (refcount_dec_and_test(&recover->refs)) {
+ btrfs_bio_counter_dec(fs_info);
btrfs_put_bbio(recover->bbio);
kfree(recover);
}
@@ -892,6 +1117,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
int mirror_index;
int page_num;
int success;
+ bool full_stripe_locked;
static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
@@ -917,6 +1143,24 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
have_csum = sblock_to_check->pagev[0]->have_csum;
dev = sblock_to_check->pagev[0]->dev;
+ /*
+ * For RAID5/6, race can happen for a different device scrub thread.
+ * For data corruption, Parity and Data threads will both try
+ * to recovery the data.
+ * Race can lead to doubly added csum error, or even unrecoverable
+ * error.
+ */
+ ret = lock_full_stripe(fs_info, logical, &full_stripe_locked);
+ if (ret < 0) {
+ spin_lock(&sctx->stat_lock);
+ if (ret == -ENOMEM)
+ sctx->stat.malloc_errors++;
+ sctx->stat.read_errors++;
+ sctx->stat.uncorrectable_errors++;
+ spin_unlock(&sctx->stat_lock);
+ return ret;
+ }
+
if (sctx->is_dev_replace && !is_metadata && !have_csum) {
sblocks_for_recheck = NULL;
goto nodatasum_case;
@@ -1241,7 +1485,7 @@ out:
sblock->pagev[page_index]->sblock = NULL;
recover = sblock->pagev[page_index]->recover;
if (recover) {
- scrub_put_recover(recover);
+ scrub_put_recover(fs_info, recover);
sblock->pagev[page_index]->recover =
NULL;
}
@@ -1251,6 +1495,9 @@ out:
kfree(sblocks_for_recheck);
}
+ ret = unlock_full_stripe(fs_info, logical, full_stripe_locked);
+ if (ret < 0)
+ return ret;
return 0;
}
@@ -1330,20 +1577,23 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
* with a length of PAGE_SIZE, each returned stripe
* represents one mirror
*/
+ btrfs_bio_counter_inc_blocked(fs_info);
ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
- logical, &mapped_length, &bbio, 0, 1);
+ logical, &mapped_length, &bbio);
if (ret || !bbio || mapped_length < sublen) {
btrfs_put_bbio(bbio);
+ btrfs_bio_counter_dec(fs_info);
return -EIO;
}
recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
if (!recover) {
btrfs_put_bbio(bbio);
+ btrfs_bio_counter_dec(fs_info);
return -ENOMEM;
}
- atomic_set(&recover->refs, 1);
+ refcount_set(&recover->refs, 1);
recover->bbio = bbio;
recover->map_length = mapped_length;
@@ -1365,7 +1615,7 @@ leave_nomem:
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
spin_unlock(&sctx->stat_lock);
- scrub_put_recover(recover);
+ scrub_put_recover(fs_info, recover);
return -ENOMEM;
}
scrub_page_get(page);
@@ -1407,7 +1657,7 @@ leave_nomem:
scrub_get_recover(recover);
page->recover = recover;
}
- scrub_put_recover(recover);
+ scrub_put_recover(fs_info, recover);
length -= sublen;
logical += sublen;
page_index++;
@@ -1497,14 +1747,18 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
bio_add_page(bio, page->page, PAGE_SIZE, 0);
if (!retry_failed_mirror && scrub_is_page_on_raid56(page)) {
- if (scrub_submit_raid56_bio_wait(fs_info, bio, page))
+ if (scrub_submit_raid56_bio_wait(fs_info, bio, page)) {
+ page->io_error = 1;
sblock->no_io_error_seen = 0;
+ }
} else {
bio->bi_iter.bi_sector = page->physical >> 9;
bio_set_op_attrs(bio, REQ_OP_READ, 0);
- if (btrfsic_submit_bio_wait(bio))
+ if (btrfsic_submit_bio_wait(bio)) {
+ page->io_error = 1;
sblock->no_io_error_seen = 0;
+ }
}
bio_put(bio);
@@ -1634,7 +1888,7 @@ static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
if (spage->io_error) {
void *mapped_buffer = kmap_atomic(spage->page);
- memset(mapped_buffer, 0, PAGE_SIZE);
+ clear_page(mapped_buffer);
flush_dcache_page(spage->page);
kunmap_atomic(mapped_buffer);
}
@@ -1998,12 +2252,12 @@ static int scrub_checksum_super(struct scrub_block *sblock)
static void scrub_block_get(struct scrub_block *sblock)
{
- atomic_inc(&sblock->refs);
+ refcount_inc(&sblock->refs);
}
static void scrub_block_put(struct scrub_block *sblock)
{
- if (atomic_dec_and_test(&sblock->refs)) {
+ if (refcount_dec_and_test(&sblock->refs)) {
int i;
if (sblock->sparity)
@@ -2187,8 +2441,9 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
int ret;
int i;
+ btrfs_bio_counter_inc_blocked(fs_info);
ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
- &length, &bbio, 0, 1);
+ &length, &bbio);
if (ret || !bbio || !bbio->raid_map)
goto bbio_out;
@@ -2231,6 +2486,7 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
rbio_out:
bio_put(bio);
bbio_out:
+ btrfs_bio_counter_dec(fs_info);
btrfs_put_bbio(bbio);
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
@@ -2255,7 +2511,7 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
/* one ref inside this function, plus one for each page added to
* a bio later on */
- atomic_set(&sblock->refs, 1);
+ refcount_set(&sblock->refs, 1);
sblock->sctx = sctx;
sblock->no_io_error_seen = 1;
@@ -2385,7 +2641,7 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
unsigned long *bitmap,
u64 start, u64 len)
{
- u32 offset;
+ u64 offset;
int nsectors;
int sectorsize = sparity->sctx->fs_info->sectorsize;
@@ -2395,8 +2651,8 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
}
start -= sparity->logic_start;
- start = div_u64_rem(start, sparity->stripe_len, &offset);
- offset /= sectorsize;
+ start = div64_u64_rem(start, sparity->stripe_len, &offset);
+ offset = div_u64(offset, sectorsize);
nsectors = (int)len / sectorsize;
if (offset + nsectors <= sparity->nsectors) {
@@ -2555,7 +2811,7 @@ static int scrub_pages_for_parity(struct scrub_parity *sparity,
/* one ref inside this function, plus one for each page added to
* a bio later on */
- atomic_set(&sblock->refs, 1);
+ refcount_set(&sblock->refs, 1);
sblock->sctx = sctx;
sblock->no_io_error_seen = 1;
sblock->sparity = sparity;
@@ -2694,7 +2950,7 @@ static int get_raid56_logic_offset(u64 physical, int num,
for (i = 0; i < nr_data_stripes(map); i++) {
*offset = last_offset + i * map->stripe_len;
- stripe_nr = div_u64(*offset, map->stripe_len);
+ stripe_nr = div64_u64(*offset, map->stripe_len);
stripe_nr = div_u64(stripe_nr, nr_data_stripes(map));
/* Work out the disk rotation on this stripe-set */
@@ -2765,7 +3021,6 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
struct btrfs_fs_info *fs_info = sctx->fs_info;
struct bio *bio;
struct btrfs_raid_bio *rbio;
- struct scrub_page *spage;
struct btrfs_bio *bbio = NULL;
u64 length;
int ret;
@@ -2775,8 +3030,10 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
goto out;
length = sparity->logic_end - sparity->logic_start;
+
+ btrfs_bio_counter_inc_blocked(fs_info);
ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start,
- &length, &bbio, 0, 1);
+ &length, &bbio);
if (ret || !bbio || !bbio->raid_map)
goto bbio_out;
@@ -2795,9 +3052,6 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
if (!rbio)
goto rbio_out;
- list_for_each_entry(spage, &sparity->spages, list)
- raid56_add_scrub_pages(rbio, spage->page, spage->logical);
-
scrub_pending_bio_inc(sctx);
raid56_parity_submit_scrub_rbio(rbio);
return;
@@ -2805,6 +3059,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
rbio_out:
bio_put(bio);
bbio_out:
+ btrfs_bio_counter_dec(fs_info);
btrfs_put_bbio(bbio);
bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
sparity->nsectors);
@@ -2822,12 +3077,12 @@ static inline int scrub_calc_parity_bitmap_len(int nsectors)
static void scrub_parity_get(struct scrub_parity *sparity)
{
- atomic_inc(&sparity->refs);
+ refcount_inc(&sparity->refs);
}
static void scrub_parity_put(struct scrub_parity *sparity)
{
- if (!atomic_dec_and_test(&sparity->refs))
+ if (!refcount_dec_and_test(&sparity->refs))
return;
scrub_parity_check_and_repair(sparity);
@@ -2879,7 +3134,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
sparity->scrub_dev = sdev;
sparity->logic_start = logic_start;
sparity->logic_end = logic_end;
- atomic_set(&sparity->refs, 1);
+ refcount_set(&sparity->refs, 1);
INIT_LIST_HEAD(&sparity->spages);
sparity->dbitmap = sparity->bitmap;
sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
@@ -3098,7 +3353,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
physical = map->stripes[num].physical;
offset = 0;
- nstripes = div_u64(length, map->stripe_len);
+ nstripes = div64_u64(length, map->stripe_len);
if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
offset = map->stripe_len * num;
increment = map->stripe_len * map->num_stripes;
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index a60d5bfb8a49..fc496a6f842a 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -5184,13 +5184,19 @@ static int is_extent_unchanged(struct send_ctx *sctx,
while (key.offset < ekey->offset + left_len) {
ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
right_type = btrfs_file_extent_type(eb, ei);
- if (right_type != BTRFS_FILE_EXTENT_REG) {
+ if (right_type != BTRFS_FILE_EXTENT_REG &&
+ right_type != BTRFS_FILE_EXTENT_INLINE) {
ret = 0;
goto out;
}
right_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
- right_len = btrfs_file_extent_num_bytes(eb, ei);
+ if (right_type == BTRFS_FILE_EXTENT_INLINE) {
+ right_len = btrfs_file_extent_inline_len(eb, slot, ei);
+ right_len = PAGE_ALIGN(right_len);
+ } else {
+ right_len = btrfs_file_extent_num_bytes(eb, ei);
+ }
right_offset = btrfs_file_extent_offset(eb, ei);
right_gen = btrfs_file_extent_generation(eb, ei);
@@ -5204,6 +5210,19 @@ static int is_extent_unchanged(struct send_ctx *sctx,
goto out;
}
+ /*
+ * We just wanted to see if when we have an inline extent, what
+ * follows it is a regular extent (wanted to check the above
+ * condition for inline extents too). This should normally not
+ * happen but it's possible for example when we have an inline
+ * compressed extent representing data with a size matching
+ * the page size (currently the same as sector size).
+ */
+ if (right_type == BTRFS_FILE_EXTENT_INLINE) {
+ ret = 0;
+ goto out;
+ }
+
left_offset_fixed = left_offset;
if (key.offset < ekey->offset) {
/* Fix the right offset for 2a and 7. */
@@ -6360,22 +6379,16 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
sctx->clone_roots_cnt = arg->clone_sources_count;
sctx->send_max_size = BTRFS_SEND_BUF_SIZE;
- sctx->send_buf = kmalloc(sctx->send_max_size, GFP_KERNEL | __GFP_NOWARN);
+ sctx->send_buf = kvmalloc(sctx->send_max_size, GFP_KERNEL);
if (!sctx->send_buf) {
- sctx->send_buf = vmalloc(sctx->send_max_size);
- if (!sctx->send_buf) {
- ret = -ENOMEM;
- goto out;
- }
+ ret = -ENOMEM;
+ goto out;
}
- sctx->read_buf = kmalloc(BTRFS_SEND_READ_SIZE, GFP_KERNEL | __GFP_NOWARN);
+ sctx->read_buf = kvmalloc(BTRFS_SEND_READ_SIZE, GFP_KERNEL);
if (!sctx->read_buf) {
- sctx->read_buf = vmalloc(BTRFS_SEND_READ_SIZE);
- if (!sctx->read_buf) {
- ret = -ENOMEM;
- goto out;
- }
+ ret = -ENOMEM;
+ goto out;
}
sctx->pending_dir_moves = RB_ROOT;
@@ -6396,13 +6409,10 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
alloc_size = arg->clone_sources_count * sizeof(*arg->clone_sources);
if (arg->clone_sources_count) {
- clone_sources_tmp = kmalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN);
+ clone_sources_tmp = kvmalloc(alloc_size, GFP_KERNEL);
if (!clone_sources_tmp) {
- clone_sources_tmp = vmalloc(alloc_size);
- if (!clone_sources_tmp) {
- ret = -ENOMEM;
- goto out;
- }
+ ret = -ENOMEM;
+ goto out;
}
ret = copy_from_user(clone_sources_tmp, arg->clone_sources,
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 9530a333d302..4f1cdd5058f1 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1136,6 +1136,13 @@ static int btrfs_fill_super(struct super_block *sb,
#endif
sb->s_flags |= MS_I_VERSION;
sb->s_iflags |= SB_I_CGROUPWB;
+
+ err = super_setup_bdi(sb);
+ if (err) {
+ btrfs_err(fs_info, "super_setup_bdi failed");
+ return err;
+ }
+
err = open_ctree(sb, fs_devices, (char *)data);
if (err) {
btrfs_err(fs_info, "open_ctree failed");
@@ -1788,8 +1795,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
}
if (fs_info->fs_devices->missing_devices >
- fs_info->num_tolerated_disk_barrier_failures &&
- !(*flags & MS_RDONLY)) {
+ fs_info->num_tolerated_disk_barrier_failures) {
btrfs_warn(fs_info,
"too many missing devices, writeable remount is not allowed");
ret = -EACCES;
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index ea272432c930..b18ab8f327a5 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -237,7 +237,6 @@ void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans)
{
memset(trans, 0, sizeof(*trans));
trans->transid = 1;
- INIT_LIST_HEAD(&trans->qgroup_ref_list);
trans->type = __TRANS_DUMMY;
}
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 61b807de3e16..2168654c90a1 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -60,8 +60,8 @@ static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = {
void btrfs_put_transaction(struct btrfs_transaction *transaction)
{
- WARN_ON(atomic_read(&transaction->use_count) == 0);
- if (atomic_dec_and_test(&transaction->use_count)) {
+ WARN_ON(refcount_read(&transaction->use_count) == 0);
+ if (refcount_dec_and_test(&transaction->use_count)) {
BUG_ON(!list_empty(&transaction->list));
WARN_ON(!RB_EMPTY_ROOT(&transaction->delayed_refs.href_root));
if (transaction->delayed_refs.pending_csums)
@@ -207,7 +207,7 @@ loop:
spin_unlock(&fs_info->trans_lock);
return -EBUSY;
}
- atomic_inc(&cur_trans->use_count);
+ refcount_inc(&cur_trans->use_count);
atomic_inc(&cur_trans->num_writers);
extwriter_counter_inc(cur_trans, type);
spin_unlock(&fs_info->trans_lock);
@@ -257,7 +257,7 @@ loop:
* One for this trans handle, one so it will live on until we
* commit the transaction.
*/
- atomic_set(&cur_trans->use_count, 2);
+ refcount_set(&cur_trans->use_count, 2);
atomic_set(&cur_trans->pending_ordered, 0);
cur_trans->flags = 0;
cur_trans->start_time = get_seconds();
@@ -432,7 +432,7 @@ static void wait_current_trans(struct btrfs_fs_info *fs_info)
spin_lock(&fs_info->trans_lock);
cur_trans = fs_info->running_transaction;
if (cur_trans && is_transaction_blocked(cur_trans)) {
- atomic_inc(&cur_trans->use_count);
+ refcount_inc(&cur_trans->use_count);
spin_unlock(&fs_info->trans_lock);
wait_event(fs_info->transaction_wait,
@@ -572,7 +572,6 @@ again:
h->type = type;
h->can_flush_pending_bgs = true;
- INIT_LIST_HEAD(&h->qgroup_ref_list);
INIT_LIST_HEAD(&h->new_bgs);
smp_mb();
@@ -744,7 +743,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid)
list_for_each_entry(t, &fs_info->trans_list, list) {
if (t->transid == transid) {
cur_trans = t;
- atomic_inc(&cur_trans->use_count);
+ refcount_inc(&cur_trans->use_count);
ret = 0;
break;
}
@@ -773,7 +772,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid)
if (t->state == TRANS_STATE_COMPLETED)
break;
cur_trans = t;
- atomic_inc(&cur_trans->use_count);
+ refcount_inc(&cur_trans->use_count);
break;
}
}
@@ -917,7 +916,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
wake_up_process(info->transaction_kthread);
err = -EIO;
}
- assert_qgroups_uptodate(trans);
kmem_cache_free(btrfs_trans_handle_cachep, trans);
if (must_run_delayed_refs) {
@@ -1839,7 +1837,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
/* take transaction reference */
cur_trans = trans->transaction;
- atomic_inc(&cur_trans->use_count);
+ refcount_inc(&cur_trans->use_count);
btrfs_end_transaction(trans);
@@ -2015,7 +2013,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
spin_lock(&fs_info->trans_lock);
if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
spin_unlock(&fs_info->trans_lock);
- atomic_inc(&cur_trans->use_count);
+ refcount_inc(&cur_trans->use_count);
ret = btrfs_end_transaction(trans);
wait_for_commit(cur_trans);
@@ -2035,7 +2033,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
prev_trans = list_entry(cur_trans->list.prev,
struct btrfs_transaction, list);
if (prev_trans->state != TRANS_STATE_COMPLETED) {
- atomic_inc(&prev_trans->use_count);
+ refcount_inc(&prev_trans->use_count);
spin_unlock(&fs_info->trans_lock);
wait_for_commit(prev_trans);
@@ -2130,13 +2128,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
goto scrub_continue;
}
- /* Reocrd old roots for later qgroup accounting */
- ret = btrfs_qgroup_prepare_account_extents(trans, fs_info);
- if (ret) {
- mutex_unlock(&fs_info->reloc_mutex);
- goto scrub_continue;
- }
-
/*
* make sure none of the code above managed to slip in a
* delayed item
@@ -2179,6 +2170,24 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
btrfs_free_log_root_tree(trans, fs_info);
/*
+ * commit_fs_roots() can call btrfs_save_ino_cache(), which generates
+ * new delayed refs. Must handle them or qgroup can be wrong.
+ */
+ ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1);
+ if (ret) {
+ mutex_unlock(&fs_info->tree_log_mutex);
+ mutex_unlock(&fs_info->reloc_mutex);
+ goto scrub_continue;
+ }
+
+ ret = btrfs_qgroup_prepare_account_extents(trans, fs_info);
+ if (ret) {
+ mutex_unlock(&fs_info->tree_log_mutex);
+ mutex_unlock(&fs_info->reloc_mutex);
+ goto scrub_continue;
+ }
+
+ /*
* Since fs roots are all committed, we can get a quite accurate
* new_roots. So let's do quota accounting.
*/
@@ -2223,7 +2232,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
switch_commit_roots(cur_trans, fs_info);
- assert_qgroups_uptodate(trans);
ASSERT(list_empty(&cur_trans->dirty_bgs));
ASSERT(list_empty(&cur_trans->io_bgs));
update_super_roots(fs_info);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 5dfb5590fff6..c55e44560103 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -18,6 +18,8 @@
#ifndef __BTRFS_TRANSACTION__
#define __BTRFS_TRANSACTION__
+
+#include <linux/refcount.h>
#include "btrfs_inode.h"
#include "delayed-ref.h"
#include "ctree.h"
@@ -49,7 +51,7 @@ struct btrfs_transaction {
* transaction can end
*/
atomic_t num_writers;
- atomic_t use_count;
+ refcount_t use_count;
atomic_t pending_ordered;
unsigned long flags;
@@ -125,8 +127,6 @@ struct btrfs_trans_handle {
unsigned int type;
struct btrfs_root *root;
struct btrfs_fs_info *fs_info;
- struct seq_list delayed_ref_elem;
- struct list_head qgroup_ref_list;
struct list_head new_bgs;
};
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index a59674c3e69e..ccfe9fe7754a 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4196,7 +4196,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
if (em->generation <= test_gen)
continue;
/* Need a ref to keep it from getting evicted from cache */
- atomic_inc(&em->refs);
+ refcount_inc(&em->refs);
set_bit(EXTENT_FLAG_LOGGING, &em->flags);
list_add_tail(&em->list, &extents);
num++;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index ab8a66d852f9..017b67daa3bb 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -139,6 +139,11 @@ static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
+static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
+ enum btrfs_map_op op,
+ u64 logical, u64 *length,
+ struct btrfs_bio **bbio_ret,
+ int mirror_num, int need_raid_map);
DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
@@ -1008,14 +1013,13 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
q = bdev_get_queue(bdev);
if (blk_queue_discard(q))
device->can_discard = 1;
+ if (!blk_queue_nonrot(q))
+ fs_devices->rotating = 1;
device->bdev = bdev;
device->in_fs_metadata = 0;
device->mode = flags;
- if (!blk_queue_nonrot(bdev_get_queue(bdev)))
- fs_devices->rotating = 1;
-
fs_devices->open_devices++;
if (device->writeable &&
device->devid != BTRFS_DEV_REPLACE_DEVID) {
@@ -2417,7 +2421,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
fs_info->free_chunk_space += device->total_bytes;
spin_unlock(&fs_info->free_chunk_lock);
- if (!blk_queue_nonrot(bdev_get_queue(bdev)))
+ if (!blk_queue_nonrot(q))
fs_info->fs_devices->rotating = 1;
tmp = btrfs_super_total_bytes(fs_info->super_copy);
@@ -2795,10 +2799,38 @@ static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info,
return ret;
}
+static struct extent_map *get_chunk_map(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length)
+{
+ struct extent_map_tree *em_tree;
+ struct extent_map *em;
+
+ em_tree = &fs_info->mapping_tree.map_tree;
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, logical, length);
+ read_unlock(&em_tree->lock);
+
+ if (!em) {
+ btrfs_crit(fs_info, "unable to find logical %llu length %llu",
+ logical, length);
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (em->start > logical || em->start + em->len < logical) {
+ btrfs_crit(fs_info,
+ "found a bad mapping, wanted %llu-%llu, found %llu-%llu",
+ logical, length, em->start, em->start + em->len);
+ free_extent_map(em);
+ return ERR_PTR(-EINVAL);
+ }
+
+ /* callers are responsible for dropping em's ref. */
+ return em;
+}
+
int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
- struct extent_map_tree *em_tree;
struct extent_map *em;
struct map_lookup *map;
u64 dev_extent_len = 0;
@@ -2806,23 +2838,15 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
int i, ret = 0;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
- em_tree = &fs_info->mapping_tree.map_tree;
-
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, chunk_offset, 1);
- read_unlock(&em_tree->lock);
-
- if (!em || em->start > chunk_offset ||
- em->start + em->len < chunk_offset) {
+ em = get_chunk_map(fs_info, chunk_offset, 1);
+ if (IS_ERR(em)) {
/*
* This is a logic error, but we don't want to just rely on the
* user having built with ASSERT enabled, so if ASSERT doesn't
* do anything we still error out.
*/
ASSERT(0);
- if (em)
- free_extent_map(em);
- return -EINVAL;
+ return PTR_ERR(em);
}
map = em->map_lookup;
mutex_lock(&fs_info->chunk_mutex);
@@ -3736,7 +3760,7 @@ static void __cancel_balance(struct btrfs_fs_info *fs_info)
if (ret)
btrfs_handle_fs_error(fs_info, ret, NULL);
- atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+ clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
}
/* Non-zero return value signifies invalidity */
@@ -3755,6 +3779,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
struct btrfs_ioctl_balance_args *bargs)
{
struct btrfs_fs_info *fs_info = bctl->fs_info;
+ u64 meta_target, data_target;
u64 allowed;
int mixed = 0;
int ret;
@@ -3851,11 +3876,16 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
}
} while (read_seqretry(&fs_info->profiles_lock, seq));
- if (btrfs_get_num_tolerated_disk_barrier_failures(bctl->meta.target) <
- btrfs_get_num_tolerated_disk_barrier_failures(bctl->data.target)) {
+ /* if we're not converting, the target field is uninitialized */
+ meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
+ bctl->meta.target : fs_info->avail_metadata_alloc_bits;
+ data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
+ bctl->data.target : fs_info->avail_data_alloc_bits;
+ if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <
+ btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {
btrfs_warn(fs_info,
"metadata profile 0x%llx has lower redundancy than data profile 0x%llx",
- bctl->meta.target, bctl->data.target);
+ meta_target, data_target);
}
if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
@@ -3910,7 +3940,7 @@ out:
__cancel_balance(fs_info);
else {
kfree(bctl);
- atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+ clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
}
return ret;
}
@@ -4000,7 +4030,7 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
btrfs_balance_sys(leaf, item, &disk_bargs);
btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
- WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1));
+ WARN_ON(test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags));
mutex_lock(&fs_info->volume_mutex);
mutex_lock(&fs_info->balance_mutex);
@@ -4785,7 +4815,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
stripe_size = div_u64(stripe_size, dev_stripes);
/* align to BTRFS_STRIPE_LEN */
- stripe_size = div_u64(stripe_size, raid_stripe_len);
+ stripe_size = div64_u64(stripe_size, raid_stripe_len);
stripe_size *= raid_stripe_len;
map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
@@ -4833,7 +4863,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
ret = add_extent_mapping(em_tree, em, 0);
if (!ret) {
list_add_tail(&em->list, &trans->transaction->pending_chunks);
- atomic_inc(&em->refs);
+ refcount_inc(&em->refs);
}
write_unlock(&em_tree->lock);
if (ret) {
@@ -4888,7 +4918,6 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
struct btrfs_device *device;
struct btrfs_chunk *chunk;
struct btrfs_stripe *stripe;
- struct extent_map_tree *em_tree;
struct extent_map *em;
struct map_lookup *map;
size_t item_size;
@@ -4897,24 +4926,9 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
int i = 0;
int ret = 0;
- em_tree = &fs_info->mapping_tree.map_tree;
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, chunk_offset, chunk_size);
- read_unlock(&em_tree->lock);
-
- if (!em) {
- btrfs_crit(fs_info, "unable to find logical %Lu len %Lu",
- chunk_offset, chunk_size);
- return -EINVAL;
- }
-
- if (em->start != chunk_offset || em->len != chunk_size) {
- btrfs_crit(fs_info,
- "found a bad mapping, wanted %Lu-%Lu, found %Lu-%Lu",
- chunk_offset, chunk_size, em->start, em->len);
- free_extent_map(em);
- return -EINVAL;
- }
+ em = get_chunk_map(fs_info, chunk_offset, chunk_size);
+ if (IS_ERR(em))
+ return PTR_ERR(em);
map = em->map_lookup;
item_size = btrfs_chunk_item_size(map->num_stripes);
@@ -5055,15 +5069,12 @@ int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
struct extent_map *em;
struct map_lookup *map;
- struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
int readonly = 0;
int miss_ndevs = 0;
int i;
- read_lock(&map_tree->map_tree.lock);
- em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
- read_unlock(&map_tree->map_tree.lock);
- if (!em)
+ em = get_chunk_map(fs_info, chunk_offset, 1);
+ if (IS_ERR(em))
return 1;
map = em->map_lookup;
@@ -5117,34 +5128,19 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
{
- struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
struct extent_map *em;
struct map_lookup *map;
- struct extent_map_tree *em_tree = &map_tree->map_tree;
int ret;
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, logical, len);
- read_unlock(&em_tree->lock);
-
- /*
- * We could return errors for these cases, but that could get ugly and
- * we'd probably do the same thing which is just not do anything else
- * and exit, so return 1 so the callers don't try to use other copies.
- */
- if (!em) {
- btrfs_crit(fs_info, "No mapping for %Lu-%Lu", logical,
- logical+len);
- return 1;
- }
-
- if (em->start > logical || em->start + em->len < logical) {
- btrfs_crit(fs_info, "Invalid mapping for %Lu-%Lu, got %Lu-%Lu",
- logical, logical+len, em->start,
- em->start + em->len);
- free_extent_map(em);
+ em = get_chunk_map(fs_info, logical, len);
+ if (IS_ERR(em))
+ /*
+ * We could return errors for these cases, but that could get
+ * ugly and we'd probably do the same thing which is just not do
+ * anything else and exit, so return 1 so the callers don't try
+ * to use other copies.
+ */
return 1;
- }
map = em->map_lookup;
if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
@@ -5160,7 +5156,8 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
free_extent_map(em);
btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
- if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))
+ if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) &&
+ fs_info->dev_replace.tgtdev)
ret++;
btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
@@ -5173,15 +5170,11 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
{
struct extent_map *em;
struct map_lookup *map;
- struct extent_map_tree *em_tree = &map_tree->map_tree;
unsigned long len = fs_info->sectorsize;
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, logical, len);
- read_unlock(&em_tree->lock);
- BUG_ON(!em);
+ em = get_chunk_map(fs_info, logical, len);
+ WARN_ON(IS_ERR(em));
- BUG_ON(em->start > logical || em->start + em->len < logical);
map = em->map_lookup;
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
len = map->stripe_len * nr_data_stripes(map);
@@ -5189,20 +5182,16 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
return len;
}
-int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
+int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info,
u64 logical, u64 len, int mirror_num)
{
struct extent_map *em;
struct map_lookup *map;
- struct extent_map_tree *em_tree = &map_tree->map_tree;
int ret = 0;
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, logical, len);
- read_unlock(&em_tree->lock);
- BUG_ON(!em);
+ em = get_chunk_map(fs_info, logical, len);
+ WARN_ON(IS_ERR(em));
- BUG_ON(em->start > logical || em->start + em->len < logical);
map = em->map_lookup;
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
ret = 1;
@@ -5295,25 +5284,353 @@ static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
GFP_NOFS|__GFP_NOFAIL);
atomic_set(&bbio->error, 0);
- atomic_set(&bbio->refs, 1);
+ refcount_set(&bbio->refs, 1);
return bbio;
}
void btrfs_get_bbio(struct btrfs_bio *bbio)
{
- WARN_ON(!atomic_read(&bbio->refs));
- atomic_inc(&bbio->refs);
+ WARN_ON(!refcount_read(&bbio->refs));
+ refcount_inc(&bbio->refs);
}
void btrfs_put_bbio(struct btrfs_bio *bbio)
{
if (!bbio)
return;
- if (atomic_dec_and_test(&bbio->refs))
+ if (refcount_dec_and_test(&bbio->refs))
kfree(bbio);
}
+/* can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */
+/*
+ * Please note that, discard won't be sent to target device of device
+ * replace.
+ */
+static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length,
+ struct btrfs_bio **bbio_ret)
+{
+ struct extent_map *em;
+ struct map_lookup *map;
+ struct btrfs_bio *bbio;
+ u64 offset;
+ u64 stripe_nr;
+ u64 stripe_nr_end;
+ u64 stripe_end_offset;
+ u64 stripe_cnt;
+ u64 stripe_len;
+ u64 stripe_offset;
+ u64 num_stripes;
+ u32 stripe_index;
+ u32 factor = 0;
+ u32 sub_stripes = 0;
+ u64 stripes_per_dev = 0;
+ u32 remaining_stripes = 0;
+ u32 last_stripe = 0;
+ int ret = 0;
+ int i;
+
+ /* discard always return a bbio */
+ ASSERT(bbio_ret);
+
+ em = get_chunk_map(fs_info, logical, length);
+ if (IS_ERR(em))
+ return PTR_ERR(em);
+
+ map = em->map_lookup;
+ /* we don't discard raid56 yet */
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ offset = logical - em->start;
+ length = min_t(u64, em->len - offset, length);
+
+ stripe_len = map->stripe_len;
+ /*
+ * stripe_nr counts the total number of stripes we have to stride
+ * to get to this block
+ */
+ stripe_nr = div64_u64(offset, stripe_len);
+
+ /* stripe_offset is the offset of this block in its stripe */
+ stripe_offset = offset - stripe_nr * stripe_len;
+
+ stripe_nr_end = round_up(offset + length, map->stripe_len);
+ stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len);
+ stripe_cnt = stripe_nr_end - stripe_nr;
+ stripe_end_offset = stripe_nr_end * map->stripe_len -
+ (offset + length);
+ /*
+ * after this, stripe_nr is the number of stripes on this
+ * device we have to walk to find the data, and stripe_index is
+ * the number of our device in the stripe array
+ */
+ num_stripes = 1;
+ stripe_index = 0;
+ if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID10)) {
+ if (map->type & BTRFS_BLOCK_GROUP_RAID0)
+ sub_stripes = 1;
+ else
+ sub_stripes = map->sub_stripes;
+
+ factor = map->num_stripes / sub_stripes;
+ num_stripes = min_t(u64, map->num_stripes,
+ sub_stripes * stripe_cnt);
+ stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
+ stripe_index *= sub_stripes;
+ stripes_per_dev = div_u64_rem(stripe_cnt, factor,
+ &remaining_stripes);
+ div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
+ last_stripe *= sub_stripes;
+ } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_DUP)) {
+ num_stripes = map->num_stripes;
+ } else {
+ stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
+ &stripe_index);
+ }
+
+ bbio = alloc_btrfs_bio(num_stripes, 0);
+ if (!bbio) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ for (i = 0; i < num_stripes; i++) {
+ bbio->stripes[i].physical =
+ map->stripes[stripe_index].physical +
+ stripe_offset + stripe_nr * map->stripe_len;
+ bbio->stripes[i].dev = map->stripes[stripe_index].dev;
+
+ if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID10)) {
+ bbio->stripes[i].length = stripes_per_dev *
+ map->stripe_len;
+
+ if (i / sub_stripes < remaining_stripes)
+ bbio->stripes[i].length +=
+ map->stripe_len;
+
+ /*
+ * Special for the first stripe and
+ * the last stripe:
+ *
+ * |-------|...|-------|
+ * |----------|
+ * off end_off
+ */
+ if (i < sub_stripes)
+ bbio->stripes[i].length -=
+ stripe_offset;
+
+ if (stripe_index >= last_stripe &&
+ stripe_index <= (last_stripe +
+ sub_stripes - 1))
+ bbio->stripes[i].length -=
+ stripe_end_offset;
+
+ if (i == sub_stripes - 1)
+ stripe_offset = 0;
+ } else {
+ bbio->stripes[i].length = length;
+ }
+
+ stripe_index++;
+ if (stripe_index == map->num_stripes) {
+ stripe_index = 0;
+ stripe_nr++;
+ }
+ }
+
+ *bbio_ret = bbio;
+ bbio->map_type = map->type;
+ bbio->num_stripes = num_stripes;
+out:
+ free_extent_map(em);
+ return ret;
+}
+
+/*
+ * In dev-replace case, for repair case (that's the only case where the mirror
+ * is selected explicitly when calling btrfs_map_block), blocks left of the
+ * left cursor can also be read from the target drive.
+ *
+ * For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the
+ * array of stripes.
+ * For READ, it also needs to be supported using the same mirror number.
+ *
+ * If the requested block is not left of the left cursor, EIO is returned. This
+ * can happen because btrfs_num_copies() returns one more in the dev-replace
+ * case.
+ */
+static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length,
+ u64 srcdev_devid, int *mirror_num,
+ u64 *physical)
+{
+ struct btrfs_bio *bbio = NULL;
+ int num_stripes;
+ int index_srcdev = 0;
+ int found = 0;
+ u64 physical_of_found = 0;
+ int i;
+ int ret = 0;
+
+ ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
+ logical, &length, &bbio, 0, 0);
+ if (ret) {
+ ASSERT(bbio == NULL);
+ return ret;
+ }
+
+ num_stripes = bbio->num_stripes;
+ if (*mirror_num > num_stripes) {
+ /*
+ * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror,
+ * that means that the requested area is not left of the left
+ * cursor
+ */
+ btrfs_put_bbio(bbio);
+ return -EIO;
+ }
+
+ /*
+ * process the rest of the function using the mirror_num of the source
+ * drive. Therefore look it up first. At the end, patch the device
+ * pointer to the one of the target drive.
+ */
+ for (i = 0; i < num_stripes; i++) {
+ if (bbio->stripes[i].dev->devid != srcdev_devid)
+ continue;
+
+ /*
+ * In case of DUP, in order to keep it simple, only add the
+ * mirror with the lowest physical address
+ */
+ if (found &&
+ physical_of_found <= bbio->stripes[i].physical)
+ continue;
+
+ index_srcdev = i;
+ found = 1;
+ physical_of_found = bbio->stripes[i].physical;
+ }
+
+ btrfs_put_bbio(bbio);
+
+ ASSERT(found);
+ if (!found)
+ return -EIO;
+
+ *mirror_num = index_srcdev + 1;
+ *physical = physical_of_found;
+ return ret;
+}
+
+static void handle_ops_on_dev_replace(enum btrfs_map_op op,
+ struct btrfs_bio **bbio_ret,
+ struct btrfs_dev_replace *dev_replace,
+ int *num_stripes_ret, int *max_errors_ret)
+{
+ struct btrfs_bio *bbio = *bbio_ret;
+ u64 srcdev_devid = dev_replace->srcdev->devid;
+ int tgtdev_indexes = 0;
+ int num_stripes = *num_stripes_ret;
+ int max_errors = *max_errors_ret;
+ int i;
+
+ if (op == BTRFS_MAP_WRITE) {
+ int index_where_to_add;
+
+ /*
+ * duplicate the write operations while the dev replace
+ * procedure is running. Since the copying of the old disk to
+ * the new disk takes place at run time while the filesystem is
+ * mounted writable, the regular write operations to the old
+ * disk have to be duplicated to go to the new disk as well.
+ *
+ * Note that device->missing is handled by the caller, and that
+ * the write to the old disk is already set up in the stripes
+ * array.
+ */
+ index_where_to_add = num_stripes;
+ for (i = 0; i < num_stripes; i++) {
+ if (bbio->stripes[i].dev->devid == srcdev_devid) {
+ /* write to new disk, too */
+ struct btrfs_bio_stripe *new =
+ bbio->stripes + index_where_to_add;
+ struct btrfs_bio_stripe *old =
+ bbio->stripes + i;
+
+ new->physical = old->physical;
+ new->length = old->length;
+ new->dev = dev_replace->tgtdev;
+ bbio->tgtdev_map[i] = index_where_to_add;
+ index_where_to_add++;
+ max_errors++;
+ tgtdev_indexes++;
+ }
+ }
+ num_stripes = index_where_to_add;
+ } else if (op == BTRFS_MAP_GET_READ_MIRRORS) {
+ int index_srcdev = 0;
+ int found = 0;
+ u64 physical_of_found = 0;
+
+ /*
+ * During the dev-replace procedure, the target drive can also
+ * be used to read data in case it is needed to repair a corrupt
+ * block elsewhere. This is possible if the requested area is
+ * left of the left cursor. In this area, the target drive is a
+ * full copy of the source drive.
+ */
+ for (i = 0; i < num_stripes; i++) {
+ if (bbio->stripes[i].dev->devid == srcdev_devid) {
+ /*
+ * In case of DUP, in order to keep it simple,
+ * only add the mirror with the lowest physical
+ * address
+ */
+ if (found &&
+ physical_of_found <=
+ bbio->stripes[i].physical)
+ continue;
+ index_srcdev = i;
+ found = 1;
+ physical_of_found = bbio->stripes[i].physical;
+ }
+ }
+ if (found) {
+ struct btrfs_bio_stripe *tgtdev_stripe =
+ bbio->stripes + num_stripes;
+
+ tgtdev_stripe->physical = physical_of_found;
+ tgtdev_stripe->length =
+ bbio->stripes[index_srcdev].length;
+ tgtdev_stripe->dev = dev_replace->tgtdev;
+ bbio->tgtdev_map[index_srcdev] = num_stripes;
+
+ tgtdev_indexes++;
+ num_stripes++;
+ }
+ }
+
+ *num_stripes_ret = num_stripes;
+ *max_errors_ret = max_errors;
+ bbio->num_tgtdevs = tgtdev_indexes;
+ *bbio_ret = bbio;
+}
+
+static bool need_full_stripe(enum btrfs_map_op op)
+{
+ return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS);
+}
+
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
enum btrfs_map_op op,
u64 logical, u64 *length,
@@ -5322,14 +5639,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
{
struct extent_map *em;
struct map_lookup *map;
- struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
- struct extent_map_tree *em_tree = &map_tree->map_tree;
u64 offset;
u64 stripe_offset;
- u64 stripe_end_offset;
u64 stripe_nr;
- u64 stripe_nr_orig;
- u64 stripe_nr_end;
u64 stripe_len;
u32 stripe_index;
int i;
@@ -5345,23 +5657,13 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
u64 physical_to_patch_in_first_stripe = 0;
u64 raid56_full_stripe_start = (u64)-1;
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, logical, *length);
- read_unlock(&em_tree->lock);
-
- if (!em) {
- btrfs_crit(fs_info, "unable to find logical %llu len %llu",
- logical, *length);
- return -EINVAL;
- }
+ if (op == BTRFS_MAP_DISCARD)
+ return __btrfs_map_block_for_discard(fs_info, logical,
+ *length, bbio_ret);
- if (em->start > logical || em->start + em->len < logical) {
- btrfs_crit(fs_info,
- "found a bad mapping, wanted %Lu, found %Lu-%Lu",
- logical, em->start, em->start + em->len);
- free_extent_map(em);
- return -EINVAL;
- }
+ em = get_chunk_map(fs_info, logical, *length);
+ if (IS_ERR(em))
+ return PTR_ERR(em);
map = em->map_lookup;
offset = logical - em->start;
@@ -5400,14 +5702,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
raid56_full_stripe_start *= full_stripe_len;
}
- if (op == BTRFS_MAP_DISCARD) {
- /* we don't discard raid56 yet */
- if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- ret = -EOPNOTSUPP;
- goto out;
- }
- *length = min_t(u64, em->len - offset, *length);
- } else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+ if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
u64 max_len;
/* For writes to RAID[56], allow a full stripeset across all disks.
For other RAID types and for RAID[56] reads, just allow a single
@@ -5438,105 +5733,28 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
btrfs_dev_replace_set_lock_blocking(dev_replace);
if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
- op != BTRFS_MAP_WRITE && op != BTRFS_MAP_DISCARD &&
- op != BTRFS_MAP_GET_READ_MIRRORS && dev_replace->tgtdev != NULL) {
- /*
- * in dev-replace case, for repair case (that's the only
- * case where the mirror is selected explicitly when
- * calling btrfs_map_block), blocks left of the left cursor
- * can also be read from the target drive.
- * For REQ_GET_READ_MIRRORS, the target drive is added as
- * the last one to the array of stripes. For READ, it also
- * needs to be supported using the same mirror number.
- * If the requested block is not left of the left cursor,
- * EIO is returned. This can happen because btrfs_num_copies()
- * returns one more in the dev-replace case.
- */
- u64 tmp_length = *length;
- struct btrfs_bio *tmp_bbio = NULL;
- int tmp_num_stripes;
- u64 srcdev_devid = dev_replace->srcdev->devid;
- int index_srcdev = 0;
- int found = 0;
- u64 physical_of_found = 0;
-
- ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
- logical, &tmp_length, &tmp_bbio, 0, 0);
- if (ret) {
- WARN_ON(tmp_bbio != NULL);
- goto out;
- }
-
- tmp_num_stripes = tmp_bbio->num_stripes;
- if (mirror_num > tmp_num_stripes) {
- /*
- * BTRFS_MAP_GET_READ_MIRRORS does not contain this
- * mirror, that means that the requested area
- * is not left of the left cursor
- */
- ret = -EIO;
- btrfs_put_bbio(tmp_bbio);
- goto out;
- }
-
- /*
- * process the rest of the function using the mirror_num
- * of the source drive. Therefore look it up first.
- * At the end, patch the device pointer to the one of the
- * target drive.
- */
- for (i = 0; i < tmp_num_stripes; i++) {
- if (tmp_bbio->stripes[i].dev->devid != srcdev_devid)
- continue;
-
- /*
- * In case of DUP, in order to keep it simple, only add
- * the mirror with the lowest physical address
- */
- if (found &&
- physical_of_found <= tmp_bbio->stripes[i].physical)
- continue;
-
- index_srcdev = i;
- found = 1;
- physical_of_found = tmp_bbio->stripes[i].physical;
- }
-
- btrfs_put_bbio(tmp_bbio);
-
- if (!found) {
- WARN_ON(1);
- ret = -EIO;
+ !need_full_stripe(op) && dev_replace->tgtdev != NULL) {
+ ret = get_extra_mirror_from_replace(fs_info, logical, *length,
+ dev_replace->srcdev->devid,
+ &mirror_num,
+ &physical_to_patch_in_first_stripe);
+ if (ret)
goto out;
- }
-
- mirror_num = index_srcdev + 1;
- patch_the_first_stripe_for_dev_replace = 1;
- physical_to_patch_in_first_stripe = physical_of_found;
+ else
+ patch_the_first_stripe_for_dev_replace = 1;
} else if (mirror_num > map->num_stripes) {
mirror_num = 0;
}
num_stripes = 1;
stripe_index = 0;
- stripe_nr_orig = stripe_nr;
- stripe_nr_end = ALIGN(offset + *length, map->stripe_len);
- stripe_nr_end = div_u64(stripe_nr_end, map->stripe_len);
- stripe_end_offset = stripe_nr_end * map->stripe_len -
- (offset + *length);
-
if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
- if (op == BTRFS_MAP_DISCARD)
- num_stripes = min_t(u64, map->num_stripes,
- stripe_nr_end - stripe_nr_orig);
stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
&stripe_index);
- if (op != BTRFS_MAP_WRITE && op != BTRFS_MAP_DISCARD &&
- op != BTRFS_MAP_GET_READ_MIRRORS)
+ if (op != BTRFS_MAP_WRITE && op != BTRFS_MAP_GET_READ_MIRRORS)
mirror_num = 1;
} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
- if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_DISCARD ||
- op == BTRFS_MAP_GET_READ_MIRRORS)
+ if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS)
num_stripes = map->num_stripes;
else if (mirror_num)
stripe_index = mirror_num - 1;
@@ -5549,8 +5767,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
}
} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
- if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_DISCARD ||
- op == BTRFS_MAP_GET_READ_MIRRORS) {
+ if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) {
num_stripes = map->num_stripes;
} else if (mirror_num) {
stripe_index = mirror_num - 1;
@@ -5566,10 +5783,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS)
num_stripes = map->sub_stripes;
- else if (op == BTRFS_MAP_DISCARD)
- num_stripes = min_t(u64, map->sub_stripes *
- (stripe_nr_end - stripe_nr_orig),
- map->num_stripes);
else if (mirror_num)
stripe_index += mirror_num - 1;
else {
@@ -5587,7 +5800,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
(op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS ||
mirror_num > 1)) {
/* push stripe_nr back to the start of the full stripe */
- stripe_nr = div_u64(raid56_full_stripe_start,
+ stripe_nr = div64_u64(raid56_full_stripe_start,
stripe_len * nr_data_stripes(map));
/* RAID[56] write or recovery. Return all stripes */
@@ -5612,8 +5825,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
/* We distribute the parity blocks across stripes */
div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
&stripe_index);
- if ((op != BTRFS_MAP_WRITE && op != BTRFS_MAP_DISCARD &&
- op != BTRFS_MAP_GET_READ_MIRRORS) && mirror_num <= 1)
+ if ((op != BTRFS_MAP_WRITE &&
+ op != BTRFS_MAP_GET_READ_MIRRORS) &&
+ mirror_num <= 1)
mirror_num = 1;
}
} else {
@@ -5635,8 +5849,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
}
num_alloc_stripes = num_stripes;
- if (dev_replace_is_ongoing) {
- if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_DISCARD)
+ if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) {
+ if (op == BTRFS_MAP_WRITE)
num_alloc_stripes <<= 1;
if (op == BTRFS_MAP_GET_READ_MIRRORS)
num_alloc_stripes++;
@@ -5648,14 +5862,12 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
ret = -ENOMEM;
goto out;
}
- if (dev_replace_is_ongoing)
+ if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes);
/* build raid_map */
- if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK &&
- need_raid_map &&
- ((op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) ||
- mirror_num > 1)) {
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
+ (need_full_stripe(op) || mirror_num > 1)) {
u64 tmp;
unsigned rot;
@@ -5679,173 +5891,27 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
RAID6_Q_STRIPE;
}
- if (op == BTRFS_MAP_DISCARD) {
- u32 factor = 0;
- u32 sub_stripes = 0;
- u64 stripes_per_dev = 0;
- u32 remaining_stripes = 0;
- u32 last_stripe = 0;
- if (map->type &
- (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
- if (map->type & BTRFS_BLOCK_GROUP_RAID0)
- sub_stripes = 1;
- else
- sub_stripes = map->sub_stripes;
-
- factor = map->num_stripes / sub_stripes;
- stripes_per_dev = div_u64_rem(stripe_nr_end -
- stripe_nr_orig,
- factor,
- &remaining_stripes);
- div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
- last_stripe *= sub_stripes;
- }
-
- for (i = 0; i < num_stripes; i++) {
- bbio->stripes[i].physical =
- map->stripes[stripe_index].physical +
- stripe_offset + stripe_nr * map->stripe_len;
- bbio->stripes[i].dev = map->stripes[stripe_index].dev;
-
- if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
- BTRFS_BLOCK_GROUP_RAID10)) {
- bbio->stripes[i].length = stripes_per_dev *
- map->stripe_len;
-
- if (i / sub_stripes < remaining_stripes)
- bbio->stripes[i].length +=
- map->stripe_len;
-
- /*
- * Special for the first stripe and
- * the last stripe:
- *
- * |-------|...|-------|
- * |----------|
- * off end_off
- */
- if (i < sub_stripes)
- bbio->stripes[i].length -=
- stripe_offset;
-
- if (stripe_index >= last_stripe &&
- stripe_index <= (last_stripe +
- sub_stripes - 1))
- bbio->stripes[i].length -=
- stripe_end_offset;
-
- if (i == sub_stripes - 1)
- stripe_offset = 0;
- } else
- bbio->stripes[i].length = *length;
-
- stripe_index++;
- if (stripe_index == map->num_stripes) {
- /* This could only happen for RAID0/10 */
- stripe_index = 0;
- stripe_nr++;
- }
- }
- } else {
- for (i = 0; i < num_stripes; i++) {
- bbio->stripes[i].physical =
- map->stripes[stripe_index].physical +
- stripe_offset +
- stripe_nr * map->stripe_len;
- bbio->stripes[i].dev =
- map->stripes[stripe_index].dev;
- stripe_index++;
- }
+ for (i = 0; i < num_stripes; i++) {
+ bbio->stripes[i].physical =
+ map->stripes[stripe_index].physical +
+ stripe_offset +
+ stripe_nr * map->stripe_len;
+ bbio->stripes[i].dev =
+ map->stripes[stripe_index].dev;
+ stripe_index++;
}
- if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS)
+ if (need_full_stripe(op))
max_errors = btrfs_chunk_max_errors(map);
if (bbio->raid_map)
sort_parity_stripes(bbio, num_stripes);
- tgtdev_indexes = 0;
- if (dev_replace_is_ongoing &&
- (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_DISCARD) &&
- dev_replace->tgtdev != NULL) {
- int index_where_to_add;
- u64 srcdev_devid = dev_replace->srcdev->devid;
-
- /*
- * duplicate the write operations while the dev replace
- * procedure is running. Since the copying of the old disk
- * to the new disk takes place at run time while the
- * filesystem is mounted writable, the regular write
- * operations to the old disk have to be duplicated to go
- * to the new disk as well.
- * Note that device->missing is handled by the caller, and
- * that the write to the old disk is already set up in the
- * stripes array.
- */
- index_where_to_add = num_stripes;
- for (i = 0; i < num_stripes; i++) {
- if (bbio->stripes[i].dev->devid == srcdev_devid) {
- /* write to new disk, too */
- struct btrfs_bio_stripe *new =
- bbio->stripes + index_where_to_add;
- struct btrfs_bio_stripe *old =
- bbio->stripes + i;
-
- new->physical = old->physical;
- new->length = old->length;
- new->dev = dev_replace->tgtdev;
- bbio->tgtdev_map[i] = index_where_to_add;
- index_where_to_add++;
- max_errors++;
- tgtdev_indexes++;
- }
- }
- num_stripes = index_where_to_add;
- } else if (dev_replace_is_ongoing &&
- op == BTRFS_MAP_GET_READ_MIRRORS &&
- dev_replace->tgtdev != NULL) {
- u64 srcdev_devid = dev_replace->srcdev->devid;
- int index_srcdev = 0;
- int found = 0;
- u64 physical_of_found = 0;
-
- /*
- * During the dev-replace procedure, the target drive can
- * also be used to read data in case it is needed to repair
- * a corrupt block elsewhere. This is possible if the
- * requested area is left of the left cursor. In this area,
- * the target drive is a full copy of the source drive.
- */
- for (i = 0; i < num_stripes; i++) {
- if (bbio->stripes[i].dev->devid == srcdev_devid) {
- /*
- * In case of DUP, in order to keep it
- * simple, only add the mirror with the
- * lowest physical address
- */
- if (found &&
- physical_of_found <=
- bbio->stripes[i].physical)
- continue;
- index_srcdev = i;
- found = 1;
- physical_of_found = bbio->stripes[i].physical;
- }
- }
- if (found) {
- struct btrfs_bio_stripe *tgtdev_stripe =
- bbio->stripes + num_stripes;
-
- tgtdev_stripe->physical = physical_of_found;
- tgtdev_stripe->length =
- bbio->stripes[index_srcdev].length;
- tgtdev_stripe->dev = dev_replace->tgtdev;
- bbio->tgtdev_map[index_srcdev] = num_stripes;
-
- tgtdev_indexes++;
- num_stripes++;
- }
+ if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
+ need_full_stripe(op)) {
+ handle_ops_on_dev_replace(op, &bbio, dev_replace, &num_stripes,
+ &max_errors);
}
*bbio_ret = bbio;
@@ -5853,7 +5919,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
bbio->num_stripes = num_stripes;
bbio->max_errors = max_errors;
bbio->mirror_num = mirror_num;
- bbio->num_tgtdevs = tgtdev_indexes;
/*
* this is the case that REQ_READ && dev_replace_is_ongoing &&
@@ -5886,19 +5951,15 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
/* For Scrub/replace */
int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
- struct btrfs_bio **bbio_ret, int mirror_num,
- int need_raid_map)
+ struct btrfs_bio **bbio_ret)
{
- return __btrfs_map_block(fs_info, op, logical, length, bbio_ret,
- mirror_num, need_raid_map);
+ return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1);
}
int btrfs_rmap_block(struct btrfs_fs_info *fs_info,
u64 chunk_start, u64 physical, u64 devid,
u64 **logical, int *naddrs, int *stripe_len)
{
- struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
- struct extent_map_tree *em_tree = &map_tree->map_tree;
struct extent_map *em;
struct map_lookup *map;
u64 *buf;
@@ -5908,24 +5969,11 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info,
u64 rmap_len;
int i, j, nr = 0;
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, chunk_start, 1);
- read_unlock(&em_tree->lock);
-
- if (!em) {
- btrfs_err(fs_info, "couldn't find em for chunk %Lu",
- chunk_start);
+ em = get_chunk_map(fs_info, chunk_start, 1);
+ if (IS_ERR(em))
return -EIO;
- }
- if (em->start != chunk_start) {
- btrfs_err(fs_info, "bad chunk start, em=%Lu, wanted=%Lu",
- em->start, chunk_start);
- free_extent_map(em);
- return -EIO;
- }
map = em->map_lookup;
-
length = em->len;
rmap_len = map->stripe_len;
@@ -5949,7 +5997,7 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info,
continue;
stripe_nr = physical - map->stripes[i].physical;
- stripe_nr = div_u64(stripe_nr, map->stripe_len);
+ stripe_nr = div64_u64(stripe_nr, map->stripe_len);
if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
stripe_nr = stripe_nr * map->num_stripes + i;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 59be81206dd7..c7d0fbc915ca 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -123,7 +123,6 @@ struct btrfs_device {
struct list_head resized_list;
/* for sending down flush barriers */
- int nobarriers;
struct bio *flush_bio;
struct completion flush_wait;
@@ -298,7 +297,7 @@ struct btrfs_bio;
typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err);
struct btrfs_bio {
- atomic_t refs;
+ refcount_t refs;
atomic_t stripes_pending;
struct btrfs_fs_info *fs_info;
u64 map_type; /* get from map_lookup->type */
@@ -400,8 +399,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
struct btrfs_bio **bbio_ret, int mirror_num);
int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
- struct btrfs_bio **bbio_ret, int mirror_num,
- int need_raid_map);
+ struct btrfs_bio **bbio_ret);
int btrfs_rmap_block(struct btrfs_fs_info *fs_info,
u64 chunk_start, u64 physical, u64 devid,
u64 **logical, int *naddrs, int *stripe_len);
@@ -475,7 +473,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
struct btrfs_device *tgtdev);
void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path);
-int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
+int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info,
u64 logical, u64 len, int mirror_num);
unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
struct btrfs_mapping_tree *map_tree,
diff --git a/fs/buffer.c b/fs/buffer.c
index 9196f2a270da..161be58c5cb0 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -49,7 +49,6 @@
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
- unsigned long bio_flags,
struct writeback_control *wbc);
#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
@@ -1830,7 +1829,7 @@ int __block_write_full_page(struct inode *inode, struct page *page,
do {
struct buffer_head *next = bh->b_this_page;
if (buffer_async_write(bh)) {
- submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, 0, wbc);
+ submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, wbc);
nr_underway++;
}
bh = next;
@@ -1884,7 +1883,7 @@ recover:
struct buffer_head *next = bh->b_this_page;
if (buffer_async_write(bh)) {
clear_buffer_dirty(bh);
- submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, 0, wbc);
+ submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, wbc);
nr_underway++;
}
bh = next;
@@ -2379,8 +2378,7 @@ int generic_cont_expand_simple(struct inode *inode, loff_t size)
goto out;
err = pagecache_write_begin(NULL, mapping, size, 0,
- AOP_FLAG_UNINTERRUPTIBLE|AOP_FLAG_CONT_EXPAND,
- &page, &fsdata);
+ AOP_FLAG_CONT_EXPAND, &page, &fsdata);
if (err)
goto out;
@@ -2415,9 +2413,8 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping,
}
len = PAGE_SIZE - zerofrom;
- err = pagecache_write_begin(file, mapping, curpos, len,
- AOP_FLAG_UNINTERRUPTIBLE,
- &page, &fsdata);
+ err = pagecache_write_begin(file, mapping, curpos, len, 0,
+ &page, &fsdata);
if (err)
goto out;
zero_user(page, zerofrom, len);
@@ -2449,9 +2446,8 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping,
}
len = offset - zerofrom;
- err = pagecache_write_begin(file, mapping, curpos, len,
- AOP_FLAG_UNINTERRUPTIBLE,
- &page, &fsdata);
+ err = pagecache_write_begin(file, mapping, curpos, len, 0,
+ &page, &fsdata);
if (err)
goto out;
zero_user(page, zerofrom, len);
@@ -3095,7 +3091,7 @@ void guard_bio_eod(int op, struct bio *bio)
}
static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
- unsigned long bio_flags, struct writeback_control *wbc)
+ struct writeback_control *wbc)
{
struct bio *bio;
@@ -3130,7 +3126,6 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
bio->bi_end_io = end_bio_bh_io_sync;
bio->bi_private = bh;
- bio->bi_flags |= bio_flags;
/* Take care of bh's that straddle the end of the device */
guard_bio_eod(op, bio);
@@ -3145,16 +3140,9 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
return 0;
}
-int _submit_bh(int op, int op_flags, struct buffer_head *bh,
- unsigned long bio_flags)
+int submit_bh(int op, int op_flags, struct buffer_head *bh)
{
- return submit_bh_wbc(op, op_flags, bh, bio_flags, NULL);
-}
-EXPORT_SYMBOL_GPL(_submit_bh);
-
-int submit_bh(int op, int op_flags, struct buffer_head *bh)
-{
- return submit_bh_wbc(op, op_flags, bh, 0, NULL);
+ return submit_bh_wbc(op, op_flags, bh, NULL);
}
EXPORT_SYMBOL(submit_bh);
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 1a3e1b40799a..1e71e6ca5ddf 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -578,7 +578,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
writeback_stat = atomic_long_inc_return(&fsc->writeback_count);
if (writeback_stat >
CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))
- set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC);
+ set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
set_page_writeback(page);
err = ceph_osdc_writepages(osdc, ceph_vino(inode),
@@ -670,8 +670,12 @@ static void writepages_finish(struct ceph_osd_request *req)
bool remove_page;
dout("writepages_finish %p rc %d\n", inode, rc);
- if (rc < 0)
+ if (rc < 0) {
mapping_set_error(mapping, rc);
+ ceph_set_error_write(ci);
+ } else {
+ ceph_clear_error_write(ci);
+ }
/*
* We lost the cache cap, need to truncate the page before
@@ -700,12 +704,9 @@ static void writepages_finish(struct ceph_osd_request *req)
if (atomic_long_dec_return(&fsc->writeback_count) <
CONGESTION_OFF_THRESH(
fsc->mount_options->congestion_kb))
- clear_bdi_congested(&fsc->backing_dev_info,
+ clear_bdi_congested(inode_to_bdi(inode),
BLK_RW_ASYNC);
- if (rc < 0)
- SetPageError(page);
-
ceph_put_snap_context(page_snap_context(page));
page->private = 0;
ClearPagePrivate(page);
@@ -979,7 +980,7 @@ get_more_pages:
if (atomic_long_inc_return(&fsc->writeback_count) >
CONGESTION_ON_THRESH(
fsc->mount_options->congestion_kb)) {
- set_bdi_congested(&fsc->backing_dev_info,
+ set_bdi_congested(inode_to_bdi(inode),
BLK_RW_ASYNC);
}
@@ -1892,6 +1893,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false);
wr_req->r_mtime = ci->vfs_inode.i_mtime;
+ wr_req->r_abort_on_full = true;
err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false);
if (!err)
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 68c78be19d5b..a3ebb632294e 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1015,6 +1015,7 @@ static int send_cap_msg(struct cap_msg_args *arg)
void *p;
size_t extra_len;
struct timespec zerotime = {0};
+ struct ceph_osd_client *osdc = &arg->session->s_mdsc->fsc->client->osdc;
dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s"
" seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu"
@@ -1076,8 +1077,12 @@ static int send_cap_msg(struct cap_msg_args *arg)
ceph_encode_64(&p, arg->inline_data ? 0 : CEPH_INLINE_NONE);
/* inline data size */
ceph_encode_32(&p, 0);
- /* osd_epoch_barrier (version 5) */
- ceph_encode_32(&p, 0);
+ /*
+ * osd_epoch_barrier (version 5)
+ * The epoch_barrier is protected osdc->lock, so READ_ONCE here in
+ * case it was recently changed
+ */
+ ceph_encode_32(&p, READ_ONCE(osdc->epoch_barrier));
/* oldest_flush_tid (version 6) */
ceph_encode_64(&p, arg->oldest_flush_tid);
@@ -1389,7 +1394,7 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci,
first_tid = cf->tid + 1;
capsnap = container_of(cf, struct ceph_cap_snap, cap_flush);
- atomic_inc(&capsnap->nref);
+ refcount_inc(&capsnap->nref);
spin_unlock(&ci->i_ceph_lock);
dout("__flush_snaps %p capsnap %p tid %llu %s\n",
@@ -2202,7 +2207,7 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
inode, capsnap, cf->tid,
ceph_cap_string(capsnap->dirty));
- atomic_inc(&capsnap->nref);
+ refcount_inc(&capsnap->nref);
spin_unlock(&ci->i_ceph_lock);
ret = __send_flush_snap(inode, session, capsnap, cap->mseq,
@@ -3633,13 +3638,19 @@ void ceph_handle_caps(struct ceph_mds_session *session,
p += inline_len;
}
+ if (le16_to_cpu(msg->hdr.version) >= 5) {
+ struct ceph_osd_client *osdc = &mdsc->fsc->client->osdc;
+ u32 epoch_barrier;
+
+ ceph_decode_32_safe(&p, end, epoch_barrier, bad);
+ ceph_osdc_update_epoch_barrier(osdc, epoch_barrier);
+ }
+
if (le16_to_cpu(msg->hdr.version) >= 8) {
u64 flush_tid;
u32 caller_uid, caller_gid;
- u32 osd_epoch_barrier;
u32 pool_ns_len;
- /* version >= 5 */
- ceph_decode_32_safe(&p, end, osd_epoch_barrier, bad);
+
/* version >= 6 */
ceph_decode_64_safe(&p, end, flush_tid, bad);
/* version >= 7 */
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index f2ae393e2c31..4e2d112c982f 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -22,20 +22,19 @@ static int mdsmap_show(struct seq_file *s, void *p)
{
int i;
struct ceph_fs_client *fsc = s->private;
+ struct ceph_mdsmap *mdsmap;
if (fsc->mdsc == NULL || fsc->mdsc->mdsmap == NULL)
return 0;
- seq_printf(s, "epoch %d\n", fsc->mdsc->mdsmap->m_epoch);
- seq_printf(s, "root %d\n", fsc->mdsc->mdsmap->m_root);
- seq_printf(s, "session_timeout %d\n",
- fsc->mdsc->mdsmap->m_session_timeout);
- seq_printf(s, "session_autoclose %d\n",
- fsc->mdsc->mdsmap->m_session_autoclose);
- for (i = 0; i < fsc->mdsc->mdsmap->m_max_mds; i++) {
- struct ceph_entity_addr *addr =
- &fsc->mdsc->mdsmap->m_info[i].addr;
- int state = fsc->mdsc->mdsmap->m_info[i].state;
-
+ mdsmap = fsc->mdsc->mdsmap;
+ seq_printf(s, "epoch %d\n", mdsmap->m_epoch);
+ seq_printf(s, "root %d\n", mdsmap->m_root);
+ seq_printf(s, "max_mds %d\n", mdsmap->m_max_mds);
+ seq_printf(s, "session_timeout %d\n", mdsmap->m_session_timeout);
+ seq_printf(s, "session_autoclose %d\n", mdsmap->m_session_autoclose);
+ for (i = 0; i < mdsmap->m_num_mds; i++) {
+ struct ceph_entity_addr *addr = &mdsmap->m_info[i].addr;
+ int state = mdsmap->m_info[i].state;
seq_printf(s, "\tmds%d\t%s\t(%s)\n", i,
ceph_pr_addr(&addr->in_addr),
ceph_mds_state_name(state));
@@ -251,7 +250,7 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
goto out;
snprintf(name, sizeof(name), "../../bdi/%s",
- dev_name(fsc->backing_dev_info.dev));
+ dev_name(fsc->sb->s_bdi->dev));
fsc->debugfs_bdi =
debugfs_create_symlink("bdi",
fsc->client->debugfs_dir,
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 3e9ad501addf..e071d23f6148 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -294,7 +294,7 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
struct ceph_mds_client *mdsc = fsc->mdsc;
int i;
int err;
- u32 ftype;
+ unsigned frag = -1;
struct ceph_mds_reply_info_parsed *rinfo;
dout("readdir %p file %p pos %llx\n", inode, file, ctx->pos);
@@ -341,7 +341,6 @@ more:
/* do we have the correct frag content buffered? */
if (need_send_readdir(fi, ctx->pos)) {
struct ceph_mds_request *req;
- unsigned frag;
int op = ceph_snap(inode) == CEPH_SNAPDIR ?
CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR;
@@ -352,8 +351,11 @@ more:
}
if (is_hash_order(ctx->pos)) {
- frag = ceph_choose_frag(ci, fpos_hash(ctx->pos),
- NULL, NULL);
+ /* fragtree isn't always accurate. choose frag
+ * based on previous reply when possible. */
+ if (frag == (unsigned)-1)
+ frag = ceph_choose_frag(ci, fpos_hash(ctx->pos),
+ NULL, NULL);
} else {
frag = fpos_frag(ctx->pos);
}
@@ -378,7 +380,11 @@ more:
ceph_mdsc_put_request(req);
return -ENOMEM;
}
+ } else if (is_hash_order(ctx->pos)) {
+ req->r_args.readdir.offset_hash =
+ cpu_to_le32(fpos_hash(ctx->pos));
}
+
req->r_dir_release_cnt = fi->dir_release_count;
req->r_dir_ordered_cnt = fi->dir_ordered_count;
req->r_readdir_cache_idx = fi->readdir_cache_idx;
@@ -476,6 +482,7 @@ more:
struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
struct ceph_vino vino;
ino_t ino;
+ u32 ftype;
BUG_ON(rde->offset < ctx->pos);
@@ -498,15 +505,17 @@ more:
ctx->pos++;
}
+ ceph_mdsc_put_request(fi->last_readdir);
+ fi->last_readdir = NULL;
+
if (fi->next_offset > 2) {
- ceph_mdsc_put_request(fi->last_readdir);
- fi->last_readdir = NULL;
+ frag = fi->frag;
goto more;
}
/* more frags? */
if (!ceph_frag_is_rightmost(fi->frag)) {
- unsigned frag = ceph_frag_next(fi->frag);
+ frag = ceph_frag_next(fi->frag);
if (is_hash_order(ctx->pos)) {
loff_t new_pos = ceph_make_fpos(ceph_frag_value(frag),
fi->next_offset, true);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 26cc95421cca..3fdde0b283c9 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -13,6 +13,38 @@
#include "mds_client.h"
#include "cache.h"
+static __le32 ceph_flags_sys2wire(u32 flags)
+{
+ u32 wire_flags = 0;
+
+ switch (flags & O_ACCMODE) {
+ case O_RDONLY:
+ wire_flags |= CEPH_O_RDONLY;
+ break;
+ case O_WRONLY:
+ wire_flags |= CEPH_O_WRONLY;
+ break;
+ case O_RDWR:
+ wire_flags |= CEPH_O_RDWR;
+ break;
+ }
+
+#define ceph_sys2wire(a) if (flags & a) { wire_flags |= CEPH_##a; flags &= ~a; }
+
+ ceph_sys2wire(O_CREAT);
+ ceph_sys2wire(O_EXCL);
+ ceph_sys2wire(O_TRUNC);
+ ceph_sys2wire(O_DIRECTORY);
+ ceph_sys2wire(O_NOFOLLOW);
+
+#undef ceph_sys2wire
+
+ if (flags)
+ dout("unused open flags: %x", flags);
+
+ return cpu_to_le32(wire_flags);
+}
+
/*
* Ceph file operations
*
@@ -74,12 +106,9 @@ dio_get_pages_alloc(const struct iov_iter *it, size_t nbytes,
align = (unsigned long)(it->iov->iov_base + it->iov_offset) &
(PAGE_SIZE - 1);
npages = calc_pages_for(align, nbytes);
- pages = kmalloc(sizeof(*pages) * npages, GFP_KERNEL);
- if (!pages) {
- pages = vmalloc(sizeof(*pages) * npages);
- if (!pages)
- return ERR_PTR(-ENOMEM);
- }
+ pages = kvmalloc(sizeof(*pages) * npages, GFP_KERNEL);
+ if (!pages)
+ return ERR_PTR(-ENOMEM);
for (idx = 0; idx < npages; ) {
size_t start;
@@ -123,7 +152,7 @@ prepare_open_request(struct super_block *sb, int flags, int create_mode)
if (IS_ERR(req))
goto out;
req->r_fmode = ceph_flags_to_mode(flags);
- req->r_args.open.flags = cpu_to_le32(flags);
+ req->r_args.open.flags = ceph_flags_sys2wire(flags);
req->r_args.open.mode = cpu_to_le32(create_mode);
out:
return req;
@@ -192,7 +221,7 @@ int ceph_renew_caps(struct inode *inode)
spin_lock(&ci->i_ceph_lock);
wanted = __ceph_caps_file_wanted(ci);
if (__ceph_is_any_real_caps(ci) &&
- (!(wanted & CEPH_CAP_ANY_WR) == 0 || ci->i_auth_cap)) {
+ (!(wanted & CEPH_CAP_ANY_WR) || ci->i_auth_cap)) {
int issued = __ceph_caps_issued(ci, NULL);
spin_unlock(&ci->i_ceph_lock);
dout("renew caps %p want %s issued %s updating mds_wanted\n",
@@ -781,6 +810,7 @@ static void ceph_aio_retry_work(struct work_struct *work)
req->r_callback = ceph_aio_complete_req;
req->r_inode = inode;
req->r_priv = aio_req;
+ req->r_abort_on_full = true;
ret = ceph_osdc_start_request(req->r_osdc, req, false);
out:
@@ -1088,19 +1118,22 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
out:
ceph_osdc_put_request(req);
- if (ret == 0) {
- pos += len;
- written += len;
-
- if (pos > i_size_read(inode)) {
- check_caps = ceph_inode_set_size(inode, pos);
- if (check_caps)
- ceph_check_caps(ceph_inode(inode),
- CHECK_CAPS_AUTHONLY,
- NULL);
- }
- } else
+ if (ret != 0) {
+ ceph_set_error_write(ci);
break;
+ }
+
+ ceph_clear_error_write(ci);
+ pos += len;
+ written += len;
+ if (pos > i_size_read(inode)) {
+ check_caps = ceph_inode_set_size(inode, pos);
+ if (check_caps)
+ ceph_check_caps(ceph_inode(inode),
+ CHECK_CAPS_AUTHONLY,
+ NULL);
+ }
+
}
if (ret != -EOLDSNAPC && written > 0) {
@@ -1306,6 +1339,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
}
retry_snap:
+ /* FIXME: not complete since it doesn't account for being at quota */
if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL)) {
err = -ENOSPC;
goto out;
@@ -1327,7 +1361,8 @@ retry_snap:
inode, ceph_vinop(inode), pos, count, ceph_cap_string(got));
if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 ||
- (iocb->ki_flags & IOCB_DIRECT) || (fi->flags & CEPH_F_SYNC)) {
+ (iocb->ki_flags & IOCB_DIRECT) || (fi->flags & CEPH_F_SYNC) ||
+ (ci->i_ceph_flags & CEPH_I_ERROR_WRITE)) {
struct ceph_snap_context *snapc;
struct iov_iter data;
inode_unlock(inode);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index d449e1c03cbd..dcce79b84406 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1482,10 +1482,17 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags))
return readdir_prepopulate_inodes_only(req, session);
- if (rinfo->hash_order && req->r_path2) {
- last_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash,
- req->r_path2, strlen(req->r_path2));
- last_hash = ceph_frag_value(last_hash);
+ if (rinfo->hash_order) {
+ if (req->r_path2) {
+ last_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash,
+ req->r_path2,
+ strlen(req->r_path2));
+ last_hash = ceph_frag_value(last_hash);
+ } else if (rinfo->offset_hash) {
+ /* mds understands offset_hash */
+ WARN_ON_ONCE(req->r_readdir_offset != 2);
+ last_hash = le32_to_cpu(rhead->args.readdir.offset_hash);
+ }
}
if (rinfo->dir_dir &&
@@ -1510,7 +1517,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
}
if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2 &&
- !(rinfo->hash_order && req->r_path2)) {
+ !(rinfo->hash_order && last_hash)) {
/* note dir version at start of readdir so we can tell
* if any dentries get dropped */
req->r_dir_release_cnt = atomic64_read(&ci->i_release_count);
@@ -2071,11 +2078,6 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
if (inode_dirty_flags)
__mark_inode_dirty(inode, inode_dirty_flags);
- if (ia_valid & ATTR_MODE) {
- err = posix_acl_chmod(inode, attr->ia_mode);
- if (err)
- goto out_put;
- }
if (mask) {
req->r_inode = inode;
@@ -2089,13 +2091,11 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
ceph_cap_string(dirtied), mask);
ceph_mdsc_put_request(req);
- if (mask & CEPH_SETATTR_SIZE)
- __ceph_do_pending_vmtruncate(inode);
- ceph_free_cap_flush(prealloc_cf);
- return err;
-out_put:
- ceph_mdsc_put_request(req);
ceph_free_cap_flush(prealloc_cf);
+
+ if (err >= 0 && (mask & CEPH_SETATTR_SIZE))
+ __ceph_do_pending_vmtruncate(inode);
+
return err;
}
@@ -2114,7 +2114,12 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
if (err != 0)
return err;
- return __ceph_setattr(inode, attr);
+ err = __ceph_setattr(inode, attr);
+
+ if (err >= 0 && (attr->ia_valid & ATTR_MODE))
+ err = posix_acl_chmod(inode, attr->ia_mode);
+
+ return err;
}
/*
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index c681762d76e6..f38e56fa9712 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -189,6 +189,7 @@ static int parse_reply_info_dir(void **p, void *end,
info->dir_end = !!(flags & CEPH_READDIR_FRAG_END);
info->dir_complete = !!(flags & CEPH_READDIR_FRAG_COMPLETE);
info->hash_order = !!(flags & CEPH_READDIR_HASH_ORDER);
+ info->offset_hash = !!(flags & CEPH_READDIR_OFFSET_HASH);
}
if (num == 0)
goto done;
@@ -378,9 +379,9 @@ const char *ceph_session_state_name(int s)
static struct ceph_mds_session *get_session(struct ceph_mds_session *s)
{
- if (atomic_inc_not_zero(&s->s_ref)) {
+ if (refcount_inc_not_zero(&s->s_ref)) {
dout("mdsc get_session %p %d -> %d\n", s,
- atomic_read(&s->s_ref)-1, atomic_read(&s->s_ref));
+ refcount_read(&s->s_ref)-1, refcount_read(&s->s_ref));
return s;
} else {
dout("mdsc get_session %p 0 -- FAIL", s);
@@ -391,8 +392,8 @@ static struct ceph_mds_session *get_session(struct ceph_mds_session *s)
void ceph_put_mds_session(struct ceph_mds_session *s)
{
dout("mdsc put_session %p %d -> %d\n", s,
- atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1);
- if (atomic_dec_and_test(&s->s_ref)) {
+ refcount_read(&s->s_ref), refcount_read(&s->s_ref)-1);
+ if (refcount_dec_and_test(&s->s_ref)) {
if (s->s_auth.authorizer)
ceph_auth_destroy_authorizer(s->s_auth.authorizer);
kfree(s);
@@ -411,7 +412,7 @@ struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc,
return NULL;
session = mdsc->sessions[mds];
dout("lookup_mds_session %p %d\n", session,
- atomic_read(&session->s_ref));
+ refcount_read(&session->s_ref));
get_session(session);
return session;
}
@@ -441,7 +442,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
{
struct ceph_mds_session *s;
- if (mds >= mdsc->mdsmap->m_max_mds)
+ if (mds >= mdsc->mdsmap->m_num_mds)
return ERR_PTR(-EINVAL);
s = kzalloc(sizeof(*s), GFP_NOFS);
@@ -466,7 +467,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
INIT_LIST_HEAD(&s->s_caps);
s->s_nr_caps = 0;
s->s_trim_caps = 0;
- atomic_set(&s->s_ref, 1);
+ refcount_set(&s->s_ref, 1);
INIT_LIST_HEAD(&s->s_waiting);
INIT_LIST_HEAD(&s->s_unsafe);
s->s_num_cap_releases = 0;
@@ -494,7 +495,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
}
mdsc->sessions[mds] = s;
atomic_inc(&mdsc->num_sessions);
- atomic_inc(&s->s_ref); /* one ref to sessions[], one to caller */
+ refcount_inc(&s->s_ref); /* one ref to sessions[], one to caller */
ceph_con_open(&s->s_con, CEPH_ENTITY_TYPE_MDS, mds,
ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
@@ -1004,7 +1005,7 @@ static void __open_export_target_sessions(struct ceph_mds_client *mdsc,
struct ceph_mds_session *ts;
int i, mds = session->s_mds;
- if (mds >= mdsc->mdsmap->m_max_mds)
+ if (mds >= mdsc->mdsmap->m_num_mds)
return;
mi = &mdsc->mdsmap->m_info[mds];
@@ -1551,9 +1552,15 @@ void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
struct ceph_msg *msg = NULL;
struct ceph_mds_cap_release *head;
struct ceph_mds_cap_item *item;
+ struct ceph_osd_client *osdc = &mdsc->fsc->client->osdc;
struct ceph_cap *cap;
LIST_HEAD(tmp_list);
int num_cap_releases;
+ __le32 barrier, *cap_barrier;
+
+ down_read(&osdc->lock);
+ barrier = cpu_to_le32(osdc->epoch_barrier);
+ up_read(&osdc->lock);
spin_lock(&session->s_cap_lock);
again:
@@ -1571,7 +1578,11 @@ again:
head = msg->front.iov_base;
head->num = cpu_to_le32(0);
msg->front.iov_len = sizeof(*head);
+
+ msg->hdr.version = cpu_to_le16(2);
+ msg->hdr.compat_version = cpu_to_le16(1);
}
+
cap = list_first_entry(&tmp_list, struct ceph_cap,
session_caps);
list_del(&cap->session_caps);
@@ -1589,6 +1600,11 @@ again:
ceph_put_cap(mdsc, cap);
if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
+ // Append cap_barrier field
+ cap_barrier = msg->front.iov_base + msg->front.iov_len;
+ *cap_barrier = barrier;
+ msg->front.iov_len += sizeof(*cap_barrier);
+
msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
ceph_con_send(&session->s_con, msg);
@@ -1604,6 +1620,11 @@ again:
spin_unlock(&session->s_cap_lock);
if (msg) {
+ // Append cap_barrier field
+ cap_barrier = msg->front.iov_base + msg->front.iov_len;
+ *cap_barrier = barrier;
+ msg->front.iov_len += sizeof(*cap_barrier);
+
msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
ceph_con_send(&session->s_con, msg);
@@ -1666,6 +1687,7 @@ struct ceph_mds_request *
ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
{
struct ceph_mds_request *req = kzalloc(sizeof(*req), GFP_NOFS);
+ struct timespec ts;
if (!req)
return ERR_PTR(-ENOMEM);
@@ -1684,7 +1706,8 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
init_completion(&req->r_safe_completion);
INIT_LIST_HEAD(&req->r_unsafe_item);
- req->r_stamp = current_fs_time(mdsc->fsc->sb);
+ ktime_get_real_ts(&ts);
+ req->r_stamp = timespec_trunc(ts, mdsc->fsc->sb->s_time_gran);
req->r_op = op;
req->r_direct_mode = mode;
@@ -1991,7 +2014,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
if (req->r_pagelist) {
struct ceph_pagelist *pagelist = req->r_pagelist;
- atomic_inc(&pagelist->refcnt);
+ refcount_inc(&pagelist->refcnt);
ceph_msg_data_add_pagelist(msg, pagelist);
msg->hdr.data_len = cpu_to_le32(pagelist->length);
} else {
@@ -2638,8 +2661,10 @@ static void handle_session(struct ceph_mds_session *session,
seq = le64_to_cpu(h->seq);
mutex_lock(&mdsc->mutex);
- if (op == CEPH_SESSION_CLOSE)
+ if (op == CEPH_SESSION_CLOSE) {
+ get_session(session);
__unregister_session(mdsc, session);
+ }
/* FIXME: this ttl calculation is generous */
session->s_ttl = jiffies + HZ*mdsc->mdsmap->m_session_autoclose;
mutex_unlock(&mdsc->mutex);
@@ -2728,6 +2753,8 @@ static void handle_session(struct ceph_mds_session *session,
kick_requests(mdsc, mds);
mutex_unlock(&mdsc->mutex);
}
+ if (op == CEPH_SESSION_CLOSE)
+ ceph_put_mds_session(session);
return;
bad:
@@ -3107,7 +3134,7 @@ static void check_new_map(struct ceph_mds_client *mdsc,
dout("check_new_map new %u old %u\n",
newmap->m_epoch, oldmap->m_epoch);
- for (i = 0; i < oldmap->m_max_mds && i < mdsc->max_sessions; i++) {
+ for (i = 0; i < oldmap->m_num_mds && i < mdsc->max_sessions; i++) {
if (mdsc->sessions[i] == NULL)
continue;
s = mdsc->sessions[i];
@@ -3121,15 +3148,33 @@ static void check_new_map(struct ceph_mds_client *mdsc,
ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "",
ceph_session_state_name(s->s_state));
- if (i >= newmap->m_max_mds ||
+ if (i >= newmap->m_num_mds ||
memcmp(ceph_mdsmap_get_addr(oldmap, i),
ceph_mdsmap_get_addr(newmap, i),
sizeof(struct ceph_entity_addr))) {
if (s->s_state == CEPH_MDS_SESSION_OPENING) {
/* the session never opened, just close it
* out now */
+ get_session(s);
+ __unregister_session(mdsc, s);
__wake_requests(mdsc, &s->s_waiting);
+ ceph_put_mds_session(s);
+ } else if (i >= newmap->m_num_mds) {
+ /* force close session for stopped mds */
+ get_session(s);
__unregister_session(mdsc, s);
+ __wake_requests(mdsc, &s->s_waiting);
+ kick_requests(mdsc, i);
+ mutex_unlock(&mdsc->mutex);
+
+ mutex_lock(&s->s_mutex);
+ cleanup_session_requests(mdsc, s);
+ remove_session_caps(s);
+ mutex_unlock(&s->s_mutex);
+
+ ceph_put_mds_session(s);
+
+ mutex_lock(&mdsc->mutex);
} else {
/* just close it */
mutex_unlock(&mdsc->mutex);
@@ -3167,7 +3212,7 @@ static void check_new_map(struct ceph_mds_client *mdsc,
}
}
- for (i = 0; i < newmap->m_max_mds && i < mdsc->max_sessions; i++) {
+ for (i = 0; i < newmap->m_num_mds && i < mdsc->max_sessions; i++) {
s = mdsc->sessions[i];
if (!s)
continue;
@@ -3881,7 +3926,7 @@ static struct ceph_connection *con_get(struct ceph_connection *con)
struct ceph_mds_session *s = con->private;
if (get_session(s)) {
- dout("mdsc con_get %p ok (%d)\n", s, atomic_read(&s->s_ref));
+ dout("mdsc con_get %p ok (%d)\n", s, refcount_read(&s->s_ref));
return con;
}
dout("mdsc con_get %p FAIL\n", s);
@@ -3892,7 +3937,7 @@ static void con_put(struct ceph_connection *con)
{
struct ceph_mds_session *s = con->private;
- dout("mdsc con_put %p (%d)\n", s, atomic_read(&s->s_ref) - 1);
+ dout("mdsc con_put %p (%d)\n", s, refcount_read(&s->s_ref) - 1);
ceph_put_mds_session(s);
}
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index ac0475a2daa7..db57ae98ed34 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -7,6 +7,7 @@
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/spinlock.h>
+#include <linux/refcount.h>
#include <linux/ceph/types.h>
#include <linux/ceph/messenger.h>
@@ -82,9 +83,10 @@ struct ceph_mds_reply_info_parsed {
struct ceph_mds_reply_dirfrag *dir_dir;
size_t dir_buf_size;
int dir_nr;
- bool dir_complete;
bool dir_end;
+ bool dir_complete;
bool hash_order;
+ bool offset_hash;
struct ceph_mds_reply_dir_entry *dir_entries;
};
@@ -104,10 +106,13 @@ struct ceph_mds_reply_info_parsed {
/*
* cap releases are batched and sent to the MDS en masse.
+ *
+ * Account for per-message overhead of mds_cap_release header
+ * and __le32 for osd epoch barrier trailing field.
*/
-#define CEPH_CAPS_PER_RELEASE ((PAGE_SIZE - \
+#define CEPH_CAPS_PER_RELEASE ((PAGE_SIZE - sizeof(u32) - \
sizeof(struct ceph_mds_cap_release)) / \
- sizeof(struct ceph_mds_cap_item))
+ sizeof(struct ceph_mds_cap_item))
/*
@@ -156,7 +161,7 @@ struct ceph_mds_session {
unsigned long s_renew_requested; /* last time we sent a renew req */
u64 s_renew_seq;
- atomic_t s_ref;
+ refcount_t s_ref;
struct list_head s_waiting; /* waiting requests */
struct list_head s_unsafe; /* unsafe requests */
};
@@ -373,7 +378,7 @@ __ceph_lookup_mds_session(struct ceph_mds_client *, int mds);
static inline struct ceph_mds_session *
ceph_get_mds_session(struct ceph_mds_session *s)
{
- atomic_inc(&s->s_ref);
+ refcount_inc(&s->s_ref);
return s;
}
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 5454e2327a5f..1a748cf88535 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -22,11 +22,11 @@ int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m)
int i;
/* special case for one mds */
- if (1 == m->m_max_mds && m->m_info[0].state > 0)
+ if (1 == m->m_num_mds && m->m_info[0].state > 0)
return 0;
/* count */
- for (i = 0; i < m->m_max_mds; i++)
+ for (i = 0; i < m->m_num_mds; i++)
if (m->m_info[i].state > 0)
n++;
if (n == 0)
@@ -135,8 +135,9 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
m->m_session_autoclose = ceph_decode_32(p);
m->m_max_file_size = ceph_decode_64(p);
m->m_max_mds = ceph_decode_32(p);
+ m->m_num_mds = m->m_max_mds;
- m->m_info = kcalloc(m->m_max_mds, sizeof(*m->m_info), GFP_NOFS);
+ m->m_info = kcalloc(m->m_num_mds, sizeof(*m->m_info), GFP_NOFS);
if (m->m_info == NULL)
goto nomem;
@@ -207,9 +208,20 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
ceph_pr_addr(&addr.in_addr),
ceph_mds_state_name(state));
- if (mds < 0 || mds >= m->m_max_mds || state <= 0)
+ if (mds < 0 || state <= 0)
continue;
+ if (mds >= m->m_num_mds) {
+ int new_num = max(mds + 1, m->m_num_mds * 2);
+ void *new_m_info = krealloc(m->m_info,
+ new_num * sizeof(*m->m_info),
+ GFP_NOFS | __GFP_ZERO);
+ if (!new_m_info)
+ goto nomem;
+ m->m_info = new_m_info;
+ m->m_num_mds = new_num;
+ }
+
info = &m->m_info[mds];
info->global_id = global_id;
info->state = state;
@@ -229,6 +241,14 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
info->export_targets = NULL;
}
}
+ if (m->m_num_mds > m->m_max_mds) {
+ /* find max up mds */
+ for (i = m->m_num_mds; i >= m->m_max_mds; i--) {
+ if (i == 0 || m->m_info[i-1].state > 0)
+ break;
+ }
+ m->m_num_mds = i;
+ }
/* pg_pools */
ceph_decode_32_safe(p, end, n, bad);
@@ -270,12 +290,22 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
for (i = 0; i < n; i++) {
s32 mds = ceph_decode_32(p);
- if (mds >= 0 && mds < m->m_max_mds) {
+ if (mds >= 0 && mds < m->m_num_mds) {
if (m->m_info[mds].laggy)
num_laggy++;
}
}
m->m_num_laggy = num_laggy;
+
+ if (n > m->m_num_mds) {
+ void *new_m_info = krealloc(m->m_info,
+ n * sizeof(*m->m_info),
+ GFP_NOFS | __GFP_ZERO);
+ if (!new_m_info)
+ goto nomem;
+ m->m_info = new_m_info;
+ }
+ m->m_num_mds = n;
}
/* inc */
@@ -341,7 +371,7 @@ void ceph_mdsmap_destroy(struct ceph_mdsmap *m)
{
int i;
- for (i = 0; i < m->m_max_mds; i++)
+ for (i = 0; i < m->m_num_mds; i++)
kfree(m->m_info[i].export_targets);
kfree(m->m_info);
kfree(m->m_data_pg_pools);
@@ -357,7 +387,7 @@ bool ceph_mdsmap_is_cluster_available(struct ceph_mdsmap *m)
return false;
if (m->m_num_laggy > 0)
return false;
- for (i = 0; i < m->m_max_mds; i++) {
+ for (i = 0; i < m->m_num_mds; i++) {
if (m->m_info[i].state == CEPH_MDS_STATE_ACTIVE)
nr_active++;
}
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 8f8b41c2ef0f..dab5d6732345 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -519,7 +519,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
capsnap->need_flush ? "" : "no_flush");
ihold(inode);
- atomic_set(&capsnap->nref, 1);
+ refcount_set(&capsnap->nref, 1);
INIT_LIST_HEAD(&capsnap->ci_item);
capsnap->follows = old_snapc->seq;
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 0ec8d0114e57..8d7918ce694a 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -544,10 +544,6 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
struct ceph_options *opt)
{
struct ceph_fs_client *fsc;
- const u64 supported_features =
- CEPH_FEATURE_FLOCK | CEPH_FEATURE_DIRLAYOUTHASH |
- CEPH_FEATURE_MDSENC | CEPH_FEATURE_MDS_INLINE_DATA;
- const u64 required_features = 0;
int page_count;
size_t size;
int err = -ENOMEM;
@@ -556,8 +552,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
if (!fsc)
return ERR_PTR(-ENOMEM);
- fsc->client = ceph_create_client(opt, fsc, supported_features,
- required_features);
+ fsc->client = ceph_create_client(opt, fsc);
if (IS_ERR(fsc->client)) {
err = PTR_ERR(fsc->client);
goto fail;
@@ -579,10 +574,6 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
atomic_long_set(&fsc->writeback_count, 0);
- err = bdi_init(&fsc->backing_dev_info);
- if (err < 0)
- goto fail_client;
-
err = -ENOMEM;
/*
* The number of concurrent works can be high but they don't need
@@ -590,7 +581,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
*/
fsc->wb_wq = alloc_workqueue("ceph-writeback", 0, 1);
if (fsc->wb_wq == NULL)
- goto fail_bdi;
+ goto fail_client;
fsc->pg_inv_wq = alloc_workqueue("ceph-pg-invalid", 0, 1);
if (fsc->pg_inv_wq == NULL)
goto fail_wb_wq;
@@ -624,8 +615,6 @@ fail_pg_inv_wq:
destroy_workqueue(fsc->pg_inv_wq);
fail_wb_wq:
destroy_workqueue(fsc->wb_wq);
-fail_bdi:
- bdi_destroy(&fsc->backing_dev_info);
fail_client:
ceph_destroy_client(fsc->client);
fail:
@@ -643,8 +632,6 @@ static void destroy_fs_client(struct ceph_fs_client *fsc)
destroy_workqueue(fsc->pg_inv_wq);
destroy_workqueue(fsc->trunc_wq);
- bdi_destroy(&fsc->backing_dev_info);
-
mempool_destroy(fsc->wb_pagevec_pool);
destroy_mount_options(fsc->mount_options);
@@ -937,33 +924,32 @@ static int ceph_compare_super(struct super_block *sb, void *data)
*/
static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
-static int ceph_register_bdi(struct super_block *sb,
- struct ceph_fs_client *fsc)
+static int ceph_setup_bdi(struct super_block *sb, struct ceph_fs_client *fsc)
{
int err;
+ err = super_setup_bdi_name(sb, "ceph-%ld",
+ atomic_long_inc_return(&bdi_seq));
+ if (err)
+ return err;
+
/* set ra_pages based on rasize mount option? */
if (fsc->mount_options->rasize >= PAGE_SIZE)
- fsc->backing_dev_info.ra_pages =
+ sb->s_bdi->ra_pages =
(fsc->mount_options->rasize + PAGE_SIZE - 1)
>> PAGE_SHIFT;
else
- fsc->backing_dev_info.ra_pages =
- VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
+ sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
if (fsc->mount_options->rsize > fsc->mount_options->rasize &&
fsc->mount_options->rsize >= PAGE_SIZE)
- fsc->backing_dev_info.io_pages =
+ sb->s_bdi->io_pages =
(fsc->mount_options->rsize + PAGE_SIZE - 1)
>> PAGE_SHIFT;
else if (fsc->mount_options->rsize == 0)
- fsc->backing_dev_info.io_pages = ULONG_MAX;
+ sb->s_bdi->io_pages = ULONG_MAX;
- err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%ld",
- atomic_long_inc_return(&bdi_seq));
- if (!err)
- sb->s_bdi = &fsc->backing_dev_info;
- return err;
+ return 0;
}
static struct dentry *ceph_mount(struct file_system_type *fs_type,
@@ -1018,7 +1004,7 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type,
dout("get_sb got existing client %p\n", fsc);
} else {
dout("get_sb using new client %p\n", fsc);
- err = ceph_register_bdi(sb, fsc);
+ err = ceph_setup_bdi(sb, fsc);
if (err < 0) {
res = ERR_PTR(err);
goto out_splat;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index fe6b9cfc4013..a973acd8beaf 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -14,6 +14,7 @@
#include <linux/writeback.h>
#include <linux/slab.h>
#include <linux/posix_acl.h>
+#include <linux/refcount.h>
#include <linux/ceph/libceph.h>
@@ -92,8 +93,6 @@ struct ceph_fs_client {
struct workqueue_struct *trunc_wq;
atomic_long_t writeback_count;
- struct backing_dev_info backing_dev_info;
-
#ifdef CONFIG_DEBUG_FS
struct dentry *debugfs_dentry_lru, *debugfs_caps;
struct dentry *debugfs_congestion_kb;
@@ -162,7 +161,7 @@ struct ceph_cap_flush {
* data before flushing the snapped state (tracked here) back to the MDS.
*/
struct ceph_cap_snap {
- atomic_t nref;
+ refcount_t nref;
struct list_head ci_item;
struct ceph_cap_flush cap_flush;
@@ -191,7 +190,7 @@ struct ceph_cap_snap {
static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
{
- if (atomic_dec_and_test(&capsnap->nref)) {
+ if (refcount_dec_and_test(&capsnap->nref)) {
if (capsnap->xattr_blob)
ceph_buffer_put(capsnap->xattr_blob);
kfree(capsnap);
@@ -473,6 +472,32 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
#define CEPH_I_CAP_DROPPED (1 << 8) /* caps were forcibly dropped */
#define CEPH_I_KICK_FLUSH (1 << 9) /* kick flushing caps */
#define CEPH_I_FLUSH_SNAPS (1 << 10) /* need flush snapss */
+#define CEPH_I_ERROR_WRITE (1 << 11) /* have seen write errors */
+
+/*
+ * We set the ERROR_WRITE bit when we start seeing write errors on an inode
+ * and then clear it when they start succeeding. Note that we do a lockless
+ * check first, and only take the lock if it looks like it needs to be changed.
+ * The write submission code just takes this as a hint, so we're not too
+ * worried if a few slip through in either direction.
+ */
+static inline void ceph_set_error_write(struct ceph_inode_info *ci)
+{
+ if (!(READ_ONCE(ci->i_ceph_flags) & CEPH_I_ERROR_WRITE)) {
+ spin_lock(&ci->i_ceph_lock);
+ ci->i_ceph_flags |= CEPH_I_ERROR_WRITE;
+ spin_unlock(&ci->i_ceph_lock);
+ }
+}
+
+static inline void ceph_clear_error_write(struct ceph_inode_info *ci)
+{
+ if (READ_ONCE(ci->i_ceph_flags) & CEPH_I_ERROR_WRITE) {
+ spin_lock(&ci->i_ceph_lock);
+ ci->i_ceph_flags &= ~CEPH_I_ERROR_WRITE;
+ spin_unlock(&ci->i_ceph_lock);
+ }
+}
static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci,
long long release_count,
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index febc28f9e2c2..75267cdd5dfd 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -392,6 +392,7 @@ static int __set_xattr(struct ceph_inode_info *ci,
if (update_xattr) {
int err = 0;
+
if (xattr && (flags & XATTR_CREATE))
err = -EEXIST;
else if (!xattr && (flags & XATTR_REPLACE))
@@ -399,12 +400,14 @@ static int __set_xattr(struct ceph_inode_info *ci,
if (err) {
kfree(name);
kfree(val);
+ kfree(*newxattr);
return err;
}
if (update_xattr < 0) {
if (xattr)
__remove_xattr(ci, xattr);
kfree(name);
+ kfree(*newxattr);
return 0;
}
}
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 44a240c4bb65..fb8507f521b2 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -471,6 +471,85 @@ int cdev_add(struct cdev *p, dev_t dev, unsigned count)
return 0;
}
+/**
+ * cdev_set_parent() - set the parent kobject for a char device
+ * @p: the cdev structure
+ * @kobj: the kobject to take a reference to
+ *
+ * cdev_set_parent() sets a parent kobject which will be referenced
+ * appropriately so the parent is not freed before the cdev. This
+ * should be called before cdev_add.
+ */
+void cdev_set_parent(struct cdev *p, struct kobject *kobj)
+{
+ WARN_ON(!kobj->state_initialized);
+ p->kobj.parent = kobj;
+}
+
+/**
+ * cdev_device_add() - add a char device and it's corresponding
+ * struct device, linkink
+ * @dev: the device structure
+ * @cdev: the cdev structure
+ *
+ * cdev_device_add() adds the char device represented by @cdev to the system,
+ * just as cdev_add does. It then adds @dev to the system using device_add
+ * The dev_t for the char device will be taken from the struct device which
+ * needs to be initialized first. This helper function correctly takes a
+ * reference to the parent device so the parent will not get released until
+ * all references to the cdev are released.
+ *
+ * This helper uses dev->devt for the device number. If it is not set
+ * it will not add the cdev and it will be equivalent to device_add.
+ *
+ * This function should be used whenever the struct cdev and the
+ * struct device are members of the same structure whose lifetime is
+ * managed by the struct device.
+ *
+ * NOTE: Callers must assume that userspace was able to open the cdev and
+ * can call cdev fops callbacks at any time, even if this function fails.
+ */
+int cdev_device_add(struct cdev *cdev, struct device *dev)
+{
+ int rc = 0;
+
+ if (dev->devt) {
+ cdev_set_parent(cdev, &dev->kobj);
+
+ rc = cdev_add(cdev, dev->devt, 1);
+ if (rc)
+ return rc;
+ }
+
+ rc = device_add(dev);
+ if (rc)
+ cdev_del(cdev);
+
+ return rc;
+}
+
+/**
+ * cdev_device_del() - inverse of cdev_device_add
+ * @dev: the device structure
+ * @cdev: the cdev structure
+ *
+ * cdev_device_del() is a helper function to call cdev_del and device_del.
+ * It should be used whenever cdev_device_add is used.
+ *
+ * If dev->devt is not set it will not remove the cdev and will be equivalent
+ * to device_del.
+ *
+ * NOTE: This guarantees that associated sysfs callbacks are not running
+ * or runnable, however any cdevs already open will remain and their fops
+ * will still be callable even after this function returns.
+ */
+void cdev_device_del(struct cdev *cdev, struct device *dev)
+{
+ device_del(dev);
+ if (dev->devt)
+ cdev_del(cdev);
+}
+
static void cdev_unmap(dev_t dev, unsigned count)
{
kobj_unmap(cdev_map, dev, count);
@@ -482,6 +561,10 @@ static void cdev_unmap(dev_t dev, unsigned count)
*
* cdev_del() removes @p from the system, possibly freeing the structure
* itself.
+ *
+ * NOTE: This guarantees that cdev device will no longer be able to be
+ * opened, however any cdevs already open will remain and their fops will
+ * still be callable even after cdev_del returns.
*/
void cdev_del(struct cdev *p)
{
@@ -570,5 +653,8 @@ EXPORT_SYMBOL(cdev_init);
EXPORT_SYMBOL(cdev_alloc);
EXPORT_SYMBOL(cdev_del);
EXPORT_SYMBOL(cdev_add);
+EXPORT_SYMBOL(cdev_set_parent);
+EXPORT_SYMBOL(cdev_device_add);
+EXPORT_SYMBOL(cdev_device_del);
EXPORT_SYMBOL(__register_chrdev);
EXPORT_SYMBOL(__unregister_chrdev);
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 07ed81cf1552..cbd216b57239 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -68,7 +68,6 @@ struct cifs_sb_info {
umode_t mnt_dir_mode;
unsigned int mnt_cifs_flags;
char *mountdata; /* options received at mount time or via DFS refs */
- struct backing_dev_info bdi;
struct delayed_work prune_tlinks;
struct rcu_head rcu;
char *prepath;
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index 02b071bf3732..a0b3e7d1be48 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -83,6 +83,9 @@ convert_sfm_char(const __u16 src_char, char *target)
case SFM_COLON:
*target = ':';
break;
+ case SFM_DOUBLEQUOTE:
+ *target = '"';
+ break;
case SFM_ASTERISK:
*target = '*';
break;
@@ -418,6 +421,9 @@ static __le16 convert_to_sfm_char(char src_char, bool end_of_string)
case ':':
dest_char = cpu_to_le16(SFM_COLON);
break;
+ case '"':
+ dest_char = cpu_to_le16(SFM_DOUBLEQUOTE);
+ break;
case '*':
dest_char = cpu_to_le16(SFM_ASTERISK);
break;
diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h
index 3d7298cc0aeb..8a79a34e66b8 100644
--- a/fs/cifs/cifs_unicode.h
+++ b/fs/cifs/cifs_unicode.h
@@ -57,6 +57,7 @@
* not conflict (although almost does) with the mapping above.
*/
+#define SFM_DOUBLEQUOTE ((__u16) 0xF020)
#define SFM_ASTERISK ((__u16) 0xF021)
#define SFM_QUESTION ((__u16) 0xF025)
#define SFM_COLON ((__u16) 0xF022)
@@ -64,8 +65,8 @@
#define SFM_LESSTHAN ((__u16) 0xF023)
#define SFM_PIPE ((__u16) 0xF027)
#define SFM_SLASH ((__u16) 0xF026)
-#define SFM_PERIOD ((__u16) 0xF028)
-#define SFM_SPACE ((__u16) 0xF029)
+#define SFM_SPACE ((__u16) 0xF028)
+#define SFM_PERIOD ((__u16) 0xF029)
/*
* Mapping mechanism to use when one of the seven reserved characters is
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 058ac9b36f04..68abbb0db608 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -478,6 +478,7 @@ find_timestamp(struct cifs_ses *ses)
unsigned char *blobptr;
unsigned char *blobend;
struct ntlmssp2_name *attrptr;
+ struct timespec ts;
if (!ses->auth_key.len || !ses->auth_key.response)
return 0;
@@ -502,7 +503,8 @@ find_timestamp(struct cifs_ses *ses)
blobptr += attrsize; /* advance attr value */
}
- return cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
+ ktime_get_real_ts(&ts);
+ return cpu_to_le64(cifs_UnixTimeToNT(ts));
}
static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index dd3f5fabfdf6..9a1667e0e8d6 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -37,6 +37,7 @@
#include <linux/freezer.h>
#include <linux/namei.h>
#include <linux/random.h>
+#include <linux/uuid.h>
#include <linux/xattr.h>
#include <net/ipv6.h>
#include "cifsfs.h"
@@ -87,6 +88,7 @@ extern mempool_t *cifs_req_poolp;
extern mempool_t *cifs_mid_poolp;
struct workqueue_struct *cifsiod_wq;
+struct workqueue_struct *cifsoplockd_wq;
__u32 cifs_lock_secret;
/*
@@ -138,7 +140,12 @@ cifs_read_super(struct super_block *sb)
sb->s_magic = CIFS_MAGIC_NUMBER;
sb->s_op = &cifs_super_ops;
sb->s_xattr = cifs_xattr_handlers;
- sb->s_bdi = &cifs_sb->bdi;
+ rc = super_setup_bdi(sb);
+ if (rc)
+ goto out_no_root;
+ /* tune readahead according to rsize */
+ sb->s_bdi->ra_pages = cifs_sb->rsize / PAGE_SIZE;
+
sb->s_blocksize = CIFS_MAX_MSGSIZE;
sb->s_blocksize_bits = 14; /* default 2**14 = CIFS_MAX_MSGSIZE */
inode = cifs_root_iget(sb);
@@ -1369,9 +1376,16 @@ init_cifs(void)
goto out_clean_proc;
}
+ cifsoplockd_wq = alloc_workqueue("cifsoplockd",
+ WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
+ if (!cifsoplockd_wq) {
+ rc = -ENOMEM;
+ goto out_destroy_cifsiod_wq;
+ }
+
rc = cifs_fscache_register();
if (rc)
- goto out_destroy_wq;
+ goto out_destroy_cifsoplockd_wq;
rc = cifs_init_inodecache();
if (rc)
@@ -1419,7 +1433,9 @@ out_destroy_inodecache:
cifs_destroy_inodecache();
out_unreg_fscache:
cifs_fscache_unregister();
-out_destroy_wq:
+out_destroy_cifsoplockd_wq:
+ destroy_workqueue(cifsoplockd_wq);
+out_destroy_cifsiod_wq:
destroy_workqueue(cifsiod_wq);
out_clean_proc:
cifs_proc_clean();
@@ -1442,6 +1458,7 @@ exit_cifs(void)
cifs_destroy_mids();
cifs_destroy_inodecache();
cifs_fscache_unregister();
+ destroy_workqueue(cifsoplockd_wq);
destroy_workqueue(cifsiod_wq);
cifs_proc_clean();
}
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 37f5a41cc50c..8be55be70faf 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -1115,6 +1115,23 @@ struct cifs_io_parms {
struct cifs_tcon *tcon;
};
+struct cifs_aio_ctx {
+ struct kref refcount;
+ struct list_head list;
+ struct mutex aio_mutex;
+ struct completion done;
+ struct iov_iter iter;
+ struct kiocb *iocb;
+ struct cifsFileInfo *cfile;
+ struct bio_vec *bv;
+ loff_t pos;
+ unsigned int npages;
+ ssize_t rc;
+ unsigned int len;
+ unsigned int total_len;
+ bool should_dirty;
+};
+
struct cifs_readdata;
/* asynchronous read support */
@@ -1124,6 +1141,7 @@ struct cifs_readdata {
struct completion done;
struct cifsFileInfo *cfile;
struct address_space *mapping;
+ struct cifs_aio_ctx *ctx;
__u64 offset;
unsigned int bytes;
unsigned int got_bytes;
@@ -1154,6 +1172,7 @@ struct cifs_writedata {
enum writeback_sync_modes sync_mode;
struct work_struct work;
struct cifsFileInfo *cfile;
+ struct cifs_aio_ctx *ctx;
__u64 offset;
pid_t pid;
unsigned int bytes;
@@ -1683,6 +1702,7 @@ void cifs_oplock_break(struct work_struct *work);
extern const struct slow_work_ops cifs_oplock_break_ops;
extern struct workqueue_struct *cifsiod_wq;
+extern struct workqueue_struct *cifsoplockd_wq;
extern __u32 cifs_lock_secret;
extern mempool_t *cifs_mid_poolp;
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 97e5d236d265..e49958c3f8bb 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -535,4 +535,7 @@ int __cifs_calc_signature(struct smb_rqst *rqst,
struct shash_desc *shash);
enum securityEnum cifs_select_sectype(struct TCP_Server_Info *,
enum securityEnum);
+struct cifs_aio_ctx *cifs_aio_ctx_alloc(void);
+void cifs_aio_ctx_release(struct kref *refcount);
+int setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw);
#endif /* _CIFSPROTO_H */
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 5d21f00ae341..4c01b3f9abf0 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -478,14 +478,14 @@ decode_lanman_negprot_rsp(struct TCP_Server_Info *server, NEGOTIATE_RSP *pSMBr)
* this requirement.
*/
int val, seconds, remain, result;
- struct timespec ts, utc;
- utc = CURRENT_TIME;
+ struct timespec ts;
+ unsigned long utc = ktime_get_real_seconds();
ts = cnvrtDosUnixTm(rsp->SrvTime.Date,
rsp->SrvTime.Time, 0);
cifs_dbg(FYI, "SrvTime %d sec since 1970 (utc: %d) diff: %d\n",
- (int)ts.tv_sec, (int)utc.tv_sec,
- (int)(utc.tv_sec - ts.tv_sec));
- val = (int)(utc.tv_sec - ts.tv_sec);
+ (int)ts.tv_sec, (int)utc,
+ (int)(utc - ts.tv_sec));
+ val = (int)(utc - ts.tv_sec);
seconds = abs(val);
result = (seconds / MIN_TZ_ADJ) * MIN_TZ_ADJ;
remain = seconds % MIN_TZ_ADJ;
@@ -718,6 +718,9 @@ CIFSSMBEcho(struct TCP_Server_Info *server)
if (rc)
return rc;
+ if (server->capabilities & CAP_UNICODE)
+ smb->hdr.Flags2 |= SMBFLG2_UNICODE;
+
/* set up echo request */
smb->hdr.Tid = 0xffff;
smb->hdr.WordCount = 1;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index d82467cfb0e2..9365c0cf77ad 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -35,6 +35,7 @@
#include <linux/pagevec.h>
#include <linux/freezer.h>
#include <linux/namei.h>
+#include <linux/uuid.h>
#include <linux/uaccess.h>
#include <asm/processor.h>
#include <linux/inet.h>
@@ -1945,9 +1946,14 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
}
if (!got_ip) {
+ int len;
+ const char *slash;
+
/* No ip= option specified? Try to get it from UNC */
- if (!cifs_convert_address(dstaddr, &vol->UNC[2],
- strlen(&vol->UNC[2]))) {
+ /* Use the address part of the UNC. */
+ slash = strchr(&vol->UNC[2], '\\');
+ len = slash - &vol->UNC[2];
+ if (!cifs_convert_address(dstaddr, &vol->UNC[2], len)) {
pr_err("Unable to determine destination address.\n");
goto cifs_parse_mount_err;
}
@@ -2912,16 +2918,14 @@ match_prepath(struct super_block *sb, struct cifs_mnt_data *mnt_data)
{
struct cifs_sb_info *old = CIFS_SB(sb);
struct cifs_sb_info *new = mnt_data->cifs_sb;
+ bool old_set = old->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH;
+ bool new_set = new->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH;
- if (old->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH) {
- if (!(new->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH))
- return 0;
- /* The prepath should be null terminated strings */
- if (strcmp(new->prepath, old->prepath))
- return 0;
-
+ if (old_set && new_set && !strcmp(new->prepath, old->prepath))
return 1;
- }
+ else if (!old_set && !new_set)
+ return 1;
+
return 0;
}
@@ -3692,10 +3696,6 @@ cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info)
int referral_walks_count = 0;
#endif
- rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs");
- if (rc)
- return rc;
-
#ifdef CONFIG_CIFS_DFS_UPCALL
try_mount_again:
/* cleanup activities if we're chasing a referral */
@@ -3723,7 +3723,6 @@ try_mount_again:
server = cifs_get_tcp_session(volume_info);
if (IS_ERR(server)) {
rc = PTR_ERR(server);
- bdi_destroy(&cifs_sb->bdi);
goto out;
}
if ((volume_info->max_credits < 20) ||
@@ -3780,9 +3779,6 @@ try_mount_again:
cifs_sb->wsize = server->ops->negotiate_wsize(tcon, volume_info);
cifs_sb->rsize = server->ops->negotiate_rsize(tcon, volume_info);
- /* tune readahead according to rsize */
- cifs_sb->bdi.ra_pages = cifs_sb->rsize / PAGE_SIZE;
-
remote_path_check:
#ifdef CONFIG_CIFS_DFS_UPCALL
/*
@@ -3899,7 +3895,6 @@ mount_fail_check:
cifs_put_smb_ses(ses);
else
cifs_put_tcp_session(server, 0);
- bdi_destroy(&cifs_sb->bdi);
}
out:
@@ -4102,7 +4097,6 @@ cifs_umount(struct cifs_sb_info *cifs_sb)
}
spin_unlock(&cifs_sb->tlink_tree_lock);
- bdi_destroy(&cifs_sb->bdi);
kfree(cifs_sb->mountdata);
kfree(cifs_sb->prepath);
call_rcu(&cifs_sb->rcu, delayed_free);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 21d404535739..6ef78ad838e6 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2458,11 +2458,14 @@ cifs_uncached_writedata_release(struct kref *refcount)
struct cifs_writedata *wdata = container_of(refcount,
struct cifs_writedata, refcount);
+ kref_put(&wdata->ctx->refcount, cifs_aio_ctx_release);
for (i = 0; i < wdata->nr_pages; i++)
put_page(wdata->pages[i]);
cifs_writedata_release(refcount);
}
+static void collect_uncached_write_data(struct cifs_aio_ctx *ctx);
+
static void
cifs_uncached_writev_complete(struct work_struct *work)
{
@@ -2478,7 +2481,8 @@ cifs_uncached_writev_complete(struct work_struct *work)
spin_unlock(&inode->i_lock);
complete(&wdata->done);
-
+ collect_uncached_write_data(wdata->ctx);
+ /* the below call can possibly free the last ref to aio ctx */
kref_put(&wdata->refcount, cifs_uncached_writedata_release);
}
@@ -2527,7 +2531,8 @@ wdata_fill_from_iovec(struct cifs_writedata *wdata, struct iov_iter *from,
static int
cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
struct cifsFileInfo *open_file,
- struct cifs_sb_info *cifs_sb, struct list_head *wdata_list)
+ struct cifs_sb_info *cifs_sb, struct list_head *wdata_list,
+ struct cifs_aio_ctx *ctx)
{
int rc = 0;
size_t cur_len;
@@ -2595,6 +2600,8 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
wdata->pagesz = PAGE_SIZE;
wdata->tailsz = cur_len - ((nr_pages - 1) * PAGE_SIZE);
wdata->credits = credits;
+ wdata->ctx = ctx;
+ kref_get(&ctx->refcount);
if (!wdata->cfile->invalidHandle ||
!(rc = cifs_reopen_file(wdata->cfile, false)))
@@ -2620,81 +2627,61 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
return rc;
}
-ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from)
+static void collect_uncached_write_data(struct cifs_aio_ctx *ctx)
{
- struct file *file = iocb->ki_filp;
- ssize_t total_written = 0;
- struct cifsFileInfo *open_file;
+ struct cifs_writedata *wdata, *tmp;
struct cifs_tcon *tcon;
struct cifs_sb_info *cifs_sb;
- struct cifs_writedata *wdata, *tmp;
- struct list_head wdata_list;
- struct iov_iter saved_from = *from;
+ struct dentry *dentry = ctx->cfile->dentry;
+ unsigned int i;
int rc;
- /*
- * BB - optimize the way when signing is disabled. We can drop this
- * extra memory-to-memory copying and use iovec buffers for constructing
- * write request.
- */
-
- rc = generic_write_checks(iocb, from);
- if (rc <= 0)
- return rc;
-
- INIT_LIST_HEAD(&wdata_list);
- cifs_sb = CIFS_FILE_SB(file);
- open_file = file->private_data;
- tcon = tlink_tcon(open_file->tlink);
-
- if (!tcon->ses->server->ops->async_writev)
- return -ENOSYS;
+ tcon = tlink_tcon(ctx->cfile->tlink);
+ cifs_sb = CIFS_SB(dentry->d_sb);
- rc = cifs_write_from_iter(iocb->ki_pos, iov_iter_count(from), from,
- open_file, cifs_sb, &wdata_list);
+ mutex_lock(&ctx->aio_mutex);
- /*
- * If at least one write was successfully sent, then discard any rc
- * value from the later writes. If the other write succeeds, then
- * we'll end up returning whatever was written. If it fails, then
- * we'll get a new rc value from that.
- */
- if (!list_empty(&wdata_list))
- rc = 0;
+ if (list_empty(&ctx->list)) {
+ mutex_unlock(&ctx->aio_mutex);
+ return;
+ }
+ rc = ctx->rc;
/*
* Wait for and collect replies for any successful sends in order of
- * increasing offset. Once an error is hit or we get a fatal signal
- * while waiting, then return without waiting for any more replies.
+ * increasing offset. Once an error is hit, then return without waiting
+ * for any more replies.
*/
restart_loop:
- list_for_each_entry_safe(wdata, tmp, &wdata_list, list) {
+ list_for_each_entry_safe(wdata, tmp, &ctx->list, list) {
if (!rc) {
- /* FIXME: freezable too? */
- rc = wait_for_completion_killable(&wdata->done);
- if (rc)
- rc = -EINTR;
- else if (wdata->result)
+ if (!try_wait_for_completion(&wdata->done)) {
+ mutex_unlock(&ctx->aio_mutex);
+ return;
+ }
+
+ if (wdata->result)
rc = wdata->result;
else
- total_written += wdata->bytes;
+ ctx->total_len += wdata->bytes;
/* resend call if it's a retryable error */
if (rc == -EAGAIN) {
struct list_head tmp_list;
- struct iov_iter tmp_from = saved_from;
+ struct iov_iter tmp_from = ctx->iter;
INIT_LIST_HEAD(&tmp_list);
list_del_init(&wdata->list);
iov_iter_advance(&tmp_from,
- wdata->offset - iocb->ki_pos);
+ wdata->offset - ctx->pos);
rc = cifs_write_from_iter(wdata->offset,
wdata->bytes, &tmp_from,
- open_file, cifs_sb, &tmp_list);
+ ctx->cfile, cifs_sb, &tmp_list,
+ ctx);
- list_splice(&tmp_list, &wdata_list);
+ list_splice(&tmp_list, &ctx->list);
kref_put(&wdata->refcount,
cifs_uncached_writedata_release);
@@ -2705,12 +2692,111 @@ restart_loop:
kref_put(&wdata->refcount, cifs_uncached_writedata_release);
}
+ for (i = 0; i < ctx->npages; i++)
+ put_page(ctx->bv[i].bv_page);
+
+ cifs_stats_bytes_written(tcon, ctx->total_len);
+ set_bit(CIFS_INO_INVALID_MAPPING, &CIFS_I(dentry->d_inode)->flags);
+
+ ctx->rc = (rc == 0) ? ctx->total_len : rc;
+
+ mutex_unlock(&ctx->aio_mutex);
+
+ if (ctx->iocb && ctx->iocb->ki_complete)
+ ctx->iocb->ki_complete(ctx->iocb, ctx->rc, 0);
+ else
+ complete(&ctx->done);
+}
+
+ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct file *file = iocb->ki_filp;
+ ssize_t total_written = 0;
+ struct cifsFileInfo *cfile;
+ struct cifs_tcon *tcon;
+ struct cifs_sb_info *cifs_sb;
+ struct cifs_aio_ctx *ctx;
+ struct iov_iter saved_from = *from;
+ int rc;
+
+ /*
+ * BB - optimize the way when signing is disabled. We can drop this
+ * extra memory-to-memory copying and use iovec buffers for constructing
+ * write request.
+ */
+
+ rc = generic_write_checks(iocb, from);
+ if (rc <= 0)
+ return rc;
+
+ cifs_sb = CIFS_FILE_SB(file);
+ cfile = file->private_data;
+ tcon = tlink_tcon(cfile->tlink);
+
+ if (!tcon->ses->server->ops->async_writev)
+ return -ENOSYS;
+
+ ctx = cifs_aio_ctx_alloc();
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->cfile = cifsFileInfo_get(cfile);
+
+ if (!is_sync_kiocb(iocb))
+ ctx->iocb = iocb;
+
+ ctx->pos = iocb->ki_pos;
+
+ rc = setup_aio_ctx_iter(ctx, from, WRITE);
+ if (rc) {
+ kref_put(&ctx->refcount, cifs_aio_ctx_release);
+ return rc;
+ }
+
+ /* grab a lock here due to read response handlers can access ctx */
+ mutex_lock(&ctx->aio_mutex);
+
+ rc = cifs_write_from_iter(iocb->ki_pos, ctx->len, &saved_from,
+ cfile, cifs_sb, &ctx->list, ctx);
+
+ /*
+ * If at least one write was successfully sent, then discard any rc
+ * value from the later writes. If the other write succeeds, then
+ * we'll end up returning whatever was written. If it fails, then
+ * we'll get a new rc value from that.
+ */
+ if (!list_empty(&ctx->list))
+ rc = 0;
+
+ mutex_unlock(&ctx->aio_mutex);
+
+ if (rc) {
+ kref_put(&ctx->refcount, cifs_aio_ctx_release);
+ return rc;
+ }
+
+ if (!is_sync_kiocb(iocb)) {
+ kref_put(&ctx->refcount, cifs_aio_ctx_release);
+ return -EIOCBQUEUED;
+ }
+
+ rc = wait_for_completion_killable(&ctx->done);
+ if (rc) {
+ mutex_lock(&ctx->aio_mutex);
+ ctx->rc = rc = -EINTR;
+ total_written = ctx->total_len;
+ mutex_unlock(&ctx->aio_mutex);
+ } else {
+ rc = ctx->rc;
+ total_written = ctx->total_len;
+ }
+
+ kref_put(&ctx->refcount, cifs_aio_ctx_release);
+
if (unlikely(!total_written))
return rc;
iocb->ki_pos += total_written;
- set_bit(CIFS_INO_INVALID_MAPPING, &CIFS_I(file_inode(file))->flags);
- cifs_stats_bytes_written(tcon, total_written);
return total_written;
}
@@ -2859,6 +2945,7 @@ cifs_uncached_readdata_release(struct kref *refcount)
struct cifs_readdata, refcount);
unsigned int i;
+ kref_put(&rdata->ctx->refcount, cifs_aio_ctx_release);
for (i = 0; i < rdata->nr_pages; i++) {
put_page(rdata->pages[i]);
rdata->pages[i] = NULL;
@@ -2900,6 +2987,8 @@ cifs_readdata_to_iov(struct cifs_readdata *rdata, struct iov_iter *iter)
return remaining ? -EFAULT : 0;
}
+static void collect_uncached_read_data(struct cifs_aio_ctx *ctx);
+
static void
cifs_uncached_readv_complete(struct work_struct *work)
{
@@ -2907,6 +2996,8 @@ cifs_uncached_readv_complete(struct work_struct *work)
struct cifs_readdata, work);
complete(&rdata->done);
+ collect_uncached_read_data(rdata->ctx);
+ /* the below call can possibly free the last ref to aio ctx */
kref_put(&rdata->refcount, cifs_uncached_readdata_release);
}
@@ -2973,7 +3064,8 @@ cifs_uncached_copy_into_pages(struct TCP_Server_Info *server,
static int
cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
- struct cifs_sb_info *cifs_sb, struct list_head *rdata_list)
+ struct cifs_sb_info *cifs_sb, struct list_head *rdata_list,
+ struct cifs_aio_ctx *ctx)
{
struct cifs_readdata *rdata;
unsigned int npages, rsize, credits;
@@ -3020,6 +3112,8 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
rdata->read_into_pages = cifs_uncached_read_into_pages;
rdata->copy_into_pages = cifs_uncached_copy_into_pages;
rdata->credits = credits;
+ rdata->ctx = ctx;
+ kref_get(&ctx->refcount);
if (!rdata->cfile->invalidHandle ||
!(rc = cifs_reopen_file(rdata->cfile, true)))
@@ -3042,50 +3136,37 @@ error:
return rc;
}
-ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to)
+static void
+collect_uncached_read_data(struct cifs_aio_ctx *ctx)
{
- struct file *file = iocb->ki_filp;
- ssize_t rc;
- size_t len;
- ssize_t total_read = 0;
- loff_t offset = iocb->ki_pos;
+ struct cifs_readdata *rdata, *tmp;
+ struct iov_iter *to = &ctx->iter;
struct cifs_sb_info *cifs_sb;
struct cifs_tcon *tcon;
- struct cifsFileInfo *open_file;
- struct cifs_readdata *rdata, *tmp;
- struct list_head rdata_list;
-
- len = iov_iter_count(to);
- if (!len)
- return 0;
-
- INIT_LIST_HEAD(&rdata_list);
- cifs_sb = CIFS_FILE_SB(file);
- open_file = file->private_data;
- tcon = tlink_tcon(open_file->tlink);
-
- if (!tcon->ses->server->ops->async_readv)
- return -ENOSYS;
+ unsigned int i;
+ int rc;
- if ((file->f_flags & O_ACCMODE) == O_WRONLY)
- cifs_dbg(FYI, "attempting read on write only file instance\n");
+ tcon = tlink_tcon(ctx->cfile->tlink);
+ cifs_sb = CIFS_SB(ctx->cfile->dentry->d_sb);
- rc = cifs_send_async_read(offset, len, open_file, cifs_sb, &rdata_list);
+ mutex_lock(&ctx->aio_mutex);
- /* if at least one read request send succeeded, then reset rc */
- if (!list_empty(&rdata_list))
- rc = 0;
+ if (list_empty(&ctx->list)) {
+ mutex_unlock(&ctx->aio_mutex);
+ return;
+ }
- len = iov_iter_count(to);
+ rc = ctx->rc;
/* the loop below should proceed in the order of increasing offsets */
again:
- list_for_each_entry_safe(rdata, tmp, &rdata_list, list) {
+ list_for_each_entry_safe(rdata, tmp, &ctx->list, list) {
if (!rc) {
- /* FIXME: freezable sleep too? */
- rc = wait_for_completion_killable(&rdata->done);
- if (rc)
- rc = -EINTR;
- else if (rdata->result == -EAGAIN) {
+ if (!try_wait_for_completion(&rdata->done)) {
+ mutex_unlock(&ctx->aio_mutex);
+ return;
+ }
+
+ if (rdata->result == -EAGAIN) {
/* resend call if it's a retryable error */
struct list_head tmp_list;
unsigned int got_bytes = rdata->got_bytes;
@@ -3111,9 +3192,9 @@ again:
rdata->offset + got_bytes,
rdata->bytes - got_bytes,
rdata->cfile, cifs_sb,
- &tmp_list);
+ &tmp_list, ctx);
- list_splice(&tmp_list, &rdata_list);
+ list_splice(&tmp_list, &ctx->list);
kref_put(&rdata->refcount,
cifs_uncached_readdata_release);
@@ -3131,14 +3212,110 @@ again:
kref_put(&rdata->refcount, cifs_uncached_readdata_release);
}
- total_read = len - iov_iter_count(to);
+ for (i = 0; i < ctx->npages; i++) {
+ if (ctx->should_dirty)
+ set_page_dirty(ctx->bv[i].bv_page);
+ put_page(ctx->bv[i].bv_page);
+ }
+
+ ctx->total_len = ctx->len - iov_iter_count(to);
- cifs_stats_bytes_read(tcon, total_read);
+ cifs_stats_bytes_read(tcon, ctx->total_len);
/* mask nodata case */
if (rc == -ENODATA)
rc = 0;
+ ctx->rc = (rc == 0) ? ctx->total_len : rc;
+
+ mutex_unlock(&ctx->aio_mutex);
+
+ if (ctx->iocb && ctx->iocb->ki_complete)
+ ctx->iocb->ki_complete(ctx->iocb, ctx->rc, 0);
+ else
+ complete(&ctx->done);
+}
+
+ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to)
+{
+ struct file *file = iocb->ki_filp;
+ ssize_t rc;
+ size_t len;
+ ssize_t total_read = 0;
+ loff_t offset = iocb->ki_pos;
+ struct cifs_sb_info *cifs_sb;
+ struct cifs_tcon *tcon;
+ struct cifsFileInfo *cfile;
+ struct cifs_aio_ctx *ctx;
+
+ len = iov_iter_count(to);
+ if (!len)
+ return 0;
+
+ cifs_sb = CIFS_FILE_SB(file);
+ cfile = file->private_data;
+ tcon = tlink_tcon(cfile->tlink);
+
+ if (!tcon->ses->server->ops->async_readv)
+ return -ENOSYS;
+
+ if ((file->f_flags & O_ACCMODE) == O_WRONLY)
+ cifs_dbg(FYI, "attempting read on write only file instance\n");
+
+ ctx = cifs_aio_ctx_alloc();
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->cfile = cifsFileInfo_get(cfile);
+
+ if (!is_sync_kiocb(iocb))
+ ctx->iocb = iocb;
+
+ if (to->type & ITER_IOVEC)
+ ctx->should_dirty = true;
+
+ rc = setup_aio_ctx_iter(ctx, to, READ);
+ if (rc) {
+ kref_put(&ctx->refcount, cifs_aio_ctx_release);
+ return rc;
+ }
+
+ len = ctx->len;
+
+ /* grab a lock here due to read response handlers can access ctx */
+ mutex_lock(&ctx->aio_mutex);
+
+ rc = cifs_send_async_read(offset, len, cfile, cifs_sb, &ctx->list, ctx);
+
+ /* if at least one read request send succeeded, then reset rc */
+ if (!list_empty(&ctx->list))
+ rc = 0;
+
+ mutex_unlock(&ctx->aio_mutex);
+
+ if (rc) {
+ kref_put(&ctx->refcount, cifs_aio_ctx_release);
+ return rc;
+ }
+
+ if (!is_sync_kiocb(iocb)) {
+ kref_put(&ctx->refcount, cifs_aio_ctx_release);
+ return -EIOCBQUEUED;
+ }
+
+ rc = wait_for_completion_killable(&ctx->done);
+ if (rc) {
+ mutex_lock(&ctx->aio_mutex);
+ ctx->rc = rc = -EINTR;
+ total_read = ctx->total_len;
+ mutex_unlock(&ctx->aio_mutex);
+ } else {
+ rc = ctx->rc;
+ total_read = ctx->total_len;
+ }
+
+ kref_put(&ctx->refcount, cifs_aio_ctx_release);
+
if (total_read) {
iocb->ki_pos += total_read;
return total_read;
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index b261db34103c..c3b2fa0b2ec8 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -322,9 +322,9 @@ cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb)
fattr->cf_mode = S_IFDIR | S_IXUGO | S_IRWXU;
fattr->cf_uid = cifs_sb->mnt_uid;
fattr->cf_gid = cifs_sb->mnt_gid;
- fattr->cf_atime = CURRENT_TIME;
- fattr->cf_ctime = CURRENT_TIME;
- fattr->cf_mtime = CURRENT_TIME;
+ ktime_get_real_ts(&fattr->cf_mtime);
+ fattr->cf_mtime = timespec_trunc(fattr->cf_mtime, sb->s_time_gran);
+ fattr->cf_atime = fattr->cf_ctime = fattr->cf_mtime;
fattr->cf_nlink = 2;
fattr->cf_flags |= CIFS_FATTR_DFS_REFERRAL;
}
@@ -586,9 +586,10 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path,
/* Fill a cifs_fattr struct with info from FILE_ALL_INFO */
static void
cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
- struct cifs_sb_info *cifs_sb, bool adjust_tz,
+ struct super_block *sb, bool adjust_tz,
bool symlink)
{
+ struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
memset(fattr, 0, sizeof(*fattr));
@@ -598,8 +599,10 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
if (info->LastAccessTime)
fattr->cf_atime = cifs_NTtimeToUnix(info->LastAccessTime);
- else
- fattr->cf_atime = CURRENT_TIME;
+ else {
+ ktime_get_real_ts(&fattr->cf_atime);
+ fattr->cf_atime = timespec_trunc(fattr->cf_atime, sb->s_time_gran);
+ }
fattr->cf_ctime = cifs_NTtimeToUnix(info->ChangeTime);
fattr->cf_mtime = cifs_NTtimeToUnix(info->LastWriteTime);
@@ -659,7 +662,6 @@ cifs_get_file_info(struct file *filp)
FILE_ALL_INFO find_data;
struct cifs_fattr fattr;
struct inode *inode = file_inode(filp);
- struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct cifsFileInfo *cfile = filp->private_data;
struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
struct TCP_Server_Info *server = tcon->ses->server;
@@ -671,7 +673,7 @@ cifs_get_file_info(struct file *filp)
rc = server->ops->query_file_info(xid, tcon, &cfile->fid, &find_data);
switch (rc) {
case 0:
- cifs_all_info_to_fattr(&fattr, &find_data, cifs_sb, false,
+ cifs_all_info_to_fattr(&fattr, &find_data, inode->i_sb, false,
false);
break;
case -EREMOTE:
@@ -753,7 +755,7 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
}
if (!rc) {
- cifs_all_info_to_fattr(&fattr, data, cifs_sb, adjust_tz,
+ cifs_all_info_to_fattr(&fattr, data, sb, adjust_tz,
symlink);
} else if (rc == -EREMOTE) {
cifs_create_dfs_fattr(&fattr, sb);
@@ -1363,9 +1365,9 @@ out_reval:
cifs_inode = CIFS_I(inode);
cifs_inode->time = 0; /* will force revalidate to get info
when needed */
- inode->i_ctime = current_fs_time(sb);
+ inode->i_ctime = current_time(inode);
}
- dir->i_ctime = dir->i_mtime = current_fs_time(sb);
+ dir->i_ctime = dir->i_mtime = current_time(dir);
cifs_inode = CIFS_I(dir);
CIFS_I(dir)->time = 0; /* force revalidate of dir as well */
unlink_out:
@@ -1633,7 +1635,7 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
cifsInode->time = 0;
d_inode(direntry)->i_ctime = inode->i_ctime = inode->i_mtime =
- current_fs_time(inode->i_sb);
+ current_time(inode);
rmdir_exit:
kfree(full_path);
@@ -1806,7 +1808,7 @@ unlink_target:
CIFS_I(source_dir)->time = CIFS_I(target_dir)->time = 0;
source_dir->i_ctime = source_dir->i_mtime = target_dir->i_ctime =
- target_dir->i_mtime = current_fs_time(source_dir->i_sb);
+ target_dir->i_mtime = current_time(source_dir);
cifs_rename_exit:
kfree(info_buf_source);
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 265c45fe4ea5..76fb0917dc8c 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -74,7 +74,8 @@ static long cifs_ioctl_copychunk(unsigned int xid, struct file *dst_file,
rc = cifs_file_copychunk_range(xid, src_file.file, 0, dst_file, 0,
src_inode->i_size, 0);
-
+ if (rc > 0)
+ rc = 0;
out_fput:
fdput(src_file);
out_drop_write:
@@ -208,10 +209,14 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
rc = -EOPNOTSUPP;
break;
case CIFS_IOC_GET_MNT_INFO:
+ if (pSMBFile == NULL)
+ break;
tcon = tlink_tcon(pSMBFile->tlink);
rc = smb_mnt_get_fsinfo(xid, tcon, (void __user *)arg);
break;
case CIFS_ENUMERATE_SNAPSHOTS:
+ if (pSMBFile == NULL)
+ break;
if (arg == 0) {
rc = -EINVAL;
goto cifs_ioc_exit;
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index d3fb11529ed9..b08531977daa 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -22,6 +22,7 @@
#include <linux/slab.h>
#include <linux/ctype.h>
#include <linux/mempool.h>
+#include <linux/vmalloc.h>
#include "cifspdu.h"
#include "cifsglob.h"
#include "cifsproto.h"
@@ -167,13 +168,11 @@ cifs_buf_get(void)
/* clear the first few header bytes */
/* for most paths, more is cleared in header_assemble */
- if (ret_buf) {
- memset(ret_buf, 0, buf_size + 3);
- atomic_inc(&bufAllocCount);
+ memset(ret_buf, 0, buf_size + 3);
+ atomic_inc(&bufAllocCount);
#ifdef CONFIG_CIFS_STATS2
- atomic_inc(&totBufAllocCount);
+ atomic_inc(&totBufAllocCount);
#endif /* CONFIG_CIFS_STATS2 */
- }
return ret_buf;
}
@@ -201,15 +200,13 @@ cifs_small_buf_get(void)
albeit slightly larger than necessary and maxbuffersize
defaults to this and can not be bigger */
ret_buf = mempool_alloc(cifs_sm_req_poolp, GFP_NOFS);
- if (ret_buf) {
/* No need to clear memory here, cleared in header assemble */
/* memset(ret_buf, 0, sizeof(struct smb_hdr) + 27);*/
- atomic_inc(&smBufAllocCount);
+ atomic_inc(&smBufAllocCount);
#ifdef CONFIG_CIFS_STATS2
- atomic_inc(&totSmBufAllocCount);
+ atomic_inc(&totSmBufAllocCount);
#endif /* CONFIG_CIFS_STATS2 */
- }
return ret_buf;
}
@@ -492,7 +489,7 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv)
CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
&pCifsInode->flags);
- queue_work(cifsiod_wq,
+ queue_work(cifsoplockd_wq,
&netfile->oplock_break);
netfile->oplock_break_cancelled = false;
@@ -745,3 +742,122 @@ parse_DFS_referrals_exit:
}
return rc;
}
+
+struct cifs_aio_ctx *
+cifs_aio_ctx_alloc(void)
+{
+ struct cifs_aio_ctx *ctx;
+
+ ctx = kzalloc(sizeof(struct cifs_aio_ctx), GFP_KERNEL);
+ if (!ctx)
+ return NULL;
+
+ INIT_LIST_HEAD(&ctx->list);
+ mutex_init(&ctx->aio_mutex);
+ init_completion(&ctx->done);
+ kref_init(&ctx->refcount);
+ return ctx;
+}
+
+void
+cifs_aio_ctx_release(struct kref *refcount)
+{
+ struct cifs_aio_ctx *ctx = container_of(refcount,
+ struct cifs_aio_ctx, refcount);
+
+ cifsFileInfo_put(ctx->cfile);
+ kvfree(ctx->bv);
+ kfree(ctx);
+}
+
+#define CIFS_AIO_KMALLOC_LIMIT (1024 * 1024)
+
+int
+setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw)
+{
+ ssize_t rc;
+ unsigned int cur_npages;
+ unsigned int npages = 0;
+ unsigned int i;
+ size_t len;
+ size_t count = iov_iter_count(iter);
+ unsigned int saved_len;
+ size_t start;
+ unsigned int max_pages = iov_iter_npages(iter, INT_MAX);
+ struct page **pages = NULL;
+ struct bio_vec *bv = NULL;
+
+ if (iter->type & ITER_KVEC) {
+ memcpy(&ctx->iter, iter, sizeof(struct iov_iter));
+ ctx->len = count;
+ iov_iter_advance(iter, count);
+ return 0;
+ }
+
+ if (max_pages * sizeof(struct bio_vec) <= CIFS_AIO_KMALLOC_LIMIT)
+ bv = kmalloc_array(max_pages, sizeof(struct bio_vec),
+ GFP_KERNEL);
+
+ if (!bv) {
+ bv = vmalloc(max_pages * sizeof(struct bio_vec));
+ if (!bv)
+ return -ENOMEM;
+ }
+
+ if (max_pages * sizeof(struct page *) <= CIFS_AIO_KMALLOC_LIMIT)
+ pages = kmalloc_array(max_pages, sizeof(struct page *),
+ GFP_KERNEL);
+
+ if (!pages) {
+ pages = vmalloc(max_pages * sizeof(struct page *));
+ if (!bv) {
+ kvfree(bv);
+ return -ENOMEM;
+ }
+ }
+
+ saved_len = count;
+
+ while (count && npages < max_pages) {
+ rc = iov_iter_get_pages(iter, pages, count, max_pages, &start);
+ if (rc < 0) {
+ cifs_dbg(VFS, "couldn't get user pages (rc=%zd)\n", rc);
+ break;
+ }
+
+ if (rc > count) {
+ cifs_dbg(VFS, "get pages rc=%zd more than %zu\n", rc,
+ count);
+ break;
+ }
+
+ iov_iter_advance(iter, rc);
+ count -= rc;
+ rc += start;
+ cur_npages = DIV_ROUND_UP(rc, PAGE_SIZE);
+
+ if (npages + cur_npages > max_pages) {
+ cifs_dbg(VFS, "out of vec array capacity (%u vs %u)\n",
+ npages + cur_npages, max_pages);
+ break;
+ }
+
+ for (i = 0; i < cur_npages; i++) {
+ len = rc > PAGE_SIZE ? PAGE_SIZE : rc;
+ bv[npages + i].bv_page = pages[i];
+ bv[npages + i].bv_offset = start;
+ bv[npages + i].bv_len = len - start;
+ rc -= len;
+ start = 0;
+ }
+
+ npages += cur_npages;
+ }
+
+ kvfree(pages);
+ ctx->bv = bv;
+ ctx->len = saved_len - count;
+ ctx->npages = npages;
+ iov_iter_bvec(&ctx->iter, ITER_BVEC | rw, ctx->bv, npages, ctx->len);
+ return 0;
+}
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index abae6dd2c6b9..cc88f4f0325e 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -980,10 +980,10 @@ struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset)
cifs_dbg(VFS, "illegal hours %d\n", st->Hours);
days = sd->Day;
month = sd->Month;
- if ((days > 31) || (month > 12)) {
+ if (days < 1 || days > 31 || month < 1 || month > 12) {
cifs_dbg(VFS, "illegal date, month %d day: %d\n", month, days);
- if (month > 12)
- month = 12;
+ days = clamp(days, 1, 31);
+ month = clamp(month, 1, 12);
}
month -= 1;
days += total_days_of_prev_months[month];
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index cc93ba4da9b5..27bc360c7ffd 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -1015,6 +1015,15 @@ cifs_dir_needs_close(struct cifsFileInfo *cfile)
return !cfile->srch_inf.endOfSearch && !cfile->invalidHandle;
}
+static bool
+cifs_can_echo(struct TCP_Server_Info *server)
+{
+ if (server->tcpStatus == CifsGood)
+ return true;
+
+ return false;
+}
+
struct smb_version_operations smb1_operations = {
.send_cancel = send_nt_cancel,
.compare_fids = cifs_compare_fids,
@@ -1049,6 +1058,7 @@ struct smb_version_operations smb1_operations = {
.get_dfs_refer = CIFSGetDFSRefer,
.qfs_tcon = cifs_qfs_tcon,
.is_path_accessible = cifs_is_path_accessible,
+ .can_echo = cifs_can_echo,
.query_path_info = cifs_query_path_info,
.query_file_info = cifs_query_file_info,
.get_srv_inum = cifs_get_srv_inum,
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index 1a04b3a5beb1..7b08a1446a7f 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -499,7 +499,7 @@ smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp,
else
cfile->oplock_break_cancelled = true;
- queue_work(cifsiod_wq, &cfile->oplock_break);
+ queue_work(cifsoplockd_wq, &cfile->oplock_break);
kfree(lw);
return true;
}
@@ -643,7 +643,8 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
&cinode->flags);
spin_unlock(&cfile->file_info_lock);
- queue_work(cifsiod_wq, &cfile->oplock_break);
+ queue_work(cifsoplockd_wq,
+ &cfile->oplock_break);
spin_unlock(&tcon->open_file_lock);
spin_unlock(&cifs_tcp_ses_lock);
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 152e37f2ad92..c58691834eb2 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -942,6 +942,7 @@ smb3_enum_snapshots(const unsigned int xid, struct cifs_tcon *tcon,
}
if (snapshot_in.snapshot_array_size < sizeof(struct smb_snapshot_array)) {
rc = -ERANGE;
+ kfree(retbuf);
return rc;
}
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 02da648041fc..48ff7703b919 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -33,6 +33,7 @@
#include <linux/vfs.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/uaccess.h>
+#include <linux/uuid.h>
#include <linux/pagemap.h>
#include <linux/xattr.h>
#include "smb2pdu.h"
@@ -632,8 +633,12 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon)
}
if (rsplen != sizeof(struct validate_negotiate_info_rsp)) {
- cifs_dbg(VFS, "invalid size of protocol negotiate response\n");
- return -EIO;
+ cifs_dbg(VFS, "invalid protocol negotiate response size: %d\n",
+ rsplen);
+
+ /* relax check since Mac returns max bufsize allowed on ioctl */
+ if (rsplen > CIFSMaxBufSize)
+ return -EIO;
}
/* check validate negotiate info response matches what we got earlier */
@@ -1853,8 +1858,12 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
* than one credit. Windows typically sets this smaller, but for some
* ioctls it may be useful to allow server to send more. No point
* limiting what the server can send as long as fits in one credit
+ * Unfortunately - we can not handle more than CIFS_MAX_MSG_SIZE
+ * (by default, note that it can be overridden to make max larger)
+ * in responses (except for read responses which can be bigger.
+ * We may want to bump this limit up
*/
- req->MaxOutputResponse = cpu_to_le32(0xFF00); /* < 64K uses 1 credit */
+ req->MaxOutputResponse = cpu_to_le32(CIFSMaxBufSize);
if (is_fsctl)
req->Flags = cpu_to_le32(SMB2_0_IOCTL_IS_FSCTL);
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index 506b67fc93d9..c69ec96e92ac 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -538,23 +538,19 @@ smb2_mid_entry_alloc(const struct smb2_sync_hdr *shdr,
}
temp = mempool_alloc(cifs_mid_poolp, GFP_NOFS);
- if (temp == NULL)
- return temp;
- else {
- memset(temp, 0, sizeof(struct mid_q_entry));
- temp->mid = le64_to_cpu(shdr->MessageId);
- temp->pid = current->pid;
- temp->command = shdr->Command; /* Always LE */
- temp->when_alloc = jiffies;
- temp->server = server;
-
- /*
- * The default is for the mid to be synchronous, so the
- * default callback just wakes up the current task.
- */
- temp->callback = cifs_wake_up_task;
- temp->callback_data = current;
- }
+ memset(temp, 0, sizeof(struct mid_q_entry));
+ temp->mid = le64_to_cpu(shdr->MessageId);
+ temp->pid = current->pid;
+ temp->command = shdr->Command; /* Always LE */
+ temp->when_alloc = jiffies;
+ temp->server = server;
+
+ /*
+ * The default is for the mid to be synchronous, so the
+ * default callback just wakes up the current task.
+ */
+ temp->callback = cifs_wake_up_task;
+ temp->callback_data = current;
atomic_inc(&midCount);
temp->mid_state = MID_REQUEST_ALLOCATED;
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index f6e13a977fc8..4d64b5b8fc9c 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -55,26 +55,22 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
}
temp = mempool_alloc(cifs_mid_poolp, GFP_NOFS);
- if (temp == NULL)
- return temp;
- else {
- memset(temp, 0, sizeof(struct mid_q_entry));
- temp->mid = get_mid(smb_buffer);
- temp->pid = current->pid;
- temp->command = cpu_to_le16(smb_buffer->Command);
- cifs_dbg(FYI, "For smb_command %d\n", smb_buffer->Command);
+ memset(temp, 0, sizeof(struct mid_q_entry));
+ temp->mid = get_mid(smb_buffer);
+ temp->pid = current->pid;
+ temp->command = cpu_to_le16(smb_buffer->Command);
+ cifs_dbg(FYI, "For smb_command %d\n", smb_buffer->Command);
/* do_gettimeofday(&temp->when_sent);*/ /* easier to use jiffies */
- /* when mid allocated can be before when sent */
- temp->when_alloc = jiffies;
- temp->server = server;
+ /* when mid allocated can be before when sent */
+ temp->when_alloc = jiffies;
+ temp->server = server;
- /*
- * The default is for the mid to be synchronous, so the
- * default callback just wakes up the current task.
- */
- temp->callback = cifs_wake_up_task;
- temp->callback_data = current;
- }
+ /*
+ * The default is for the mid to be synchronous, so the
+ * default callback just wakes up the current task.
+ */
+ temp->callback = cifs_wake_up_task;
+ temp->callback_data = current;
atomic_inc(&midCount);
temp->mid_state = MID_REQUEST_ALLOCATED;
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 2dea594da199..6058df380cc0 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -183,10 +183,6 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
goto unlock_out;
}
- error = bdi_setup_and_register(&vc->bdi, "coda");
- if (error)
- goto unlock_out;
-
vc->vc_sb = sb;
mutex_unlock(&vc->vc_mutex);
@@ -197,7 +193,10 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
sb->s_magic = CODA_SUPER_MAGIC;
sb->s_op = &coda_super_operations;
sb->s_d_op = &coda_dentry_operations;
- sb->s_bdi = &vc->bdi;
+
+ error = super_setup_bdi(sb);
+ if (error)
+ goto error;
/* get root fid from Venus: this needs the root inode */
error = venus_rootfid(sb, &fid);
@@ -228,7 +227,6 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
error:
mutex_lock(&vc->vc_mutex);
- bdi_destroy(&vc->bdi);
vc->vc_sb = NULL;
sb->s_fs_info = NULL;
unlock_out:
@@ -240,7 +238,6 @@ static void coda_put_super(struct super_block *sb)
{
struct venus_comm *vcp = coda_vcp(sb);
mutex_lock(&vcp->vc_mutex);
- bdi_destroy(&vcp->bdi);
vcp->vc_sb = NULL;
sb->s_fs_info = NULL;
mutex_unlock(&vcp->vc_mutex);
diff --git a/fs/compat.c b/fs/compat.c
index c61b506f5bc9..190b38b39d9e 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -15,555 +15,14 @@
* published by the Free Software Foundation.
*/
-#include <linux/stddef.h>
-#include <linux/kernel.h>
-#include <linux/linkage.h>
#include <linux/compat.h>
-#include <linux/errno.h>
-#include <linux/time.h>
-#include <linux/cred.h>
-#include <linux/fs.h>
-#include <linux/fcntl.h>
-#include <linux/namei.h>
-#include <linux/file.h>
-#include <linux/fdtable.h>
-#include <linux/vfs.h>
-#include <linux/ioctl.h>
-#include <linux/init.h>
#include <linux/ncp_mount.h>
#include <linux/nfs4_mount.h>
#include <linux/syscalls.h>
-#include <linux/ctype.h>
-#include <linux/dirent.h>
-#include <linux/fsnotify.h>
-#include <linux/highuid.h>
-#include <linux/personality.h>
-#include <linux/rwsem.h>
-#include <linux/tsacct_kern.h>
-#include <linux/security.h>
-#include <linux/highmem.h>
-#include <linux/signal.h>
-#include <linux/poll.h>
-#include <linux/mm.h>
-#include <linux/fs_struct.h>
#include <linux/slab.h>
-#include <linux/pagemap.h>
-#include <linux/aio.h>
-
#include <linux/uaccess.h>
-#include <asm/mmu_context.h>
-#include <asm/ioctls.h>
#include "internal.h"
-/*
- * Not all architectures have sys_utime, so implement this in terms
- * of sys_utimes.
- */
-COMPAT_SYSCALL_DEFINE2(utime, const char __user *, filename,
- struct compat_utimbuf __user *, t)
-{
- struct timespec tv[2];
-
- if (t) {
- if (get_user(tv[0].tv_sec, &t->actime) ||
- get_user(tv[1].tv_sec, &t->modtime))
- return -EFAULT;
- tv[0].tv_nsec = 0;
- tv[1].tv_nsec = 0;
- }
- return do_utimes(AT_FDCWD, filename, t ? tv : NULL, 0);
-}
-
-COMPAT_SYSCALL_DEFINE4(utimensat, unsigned int, dfd, const char __user *, filename, struct compat_timespec __user *, t, int, flags)
-{
- struct timespec tv[2];
-
- if (t) {
- if (compat_get_timespec(&tv[0], &t[0]) ||
- compat_get_timespec(&tv[1], &t[1]))
- return -EFAULT;
-
- if (tv[0].tv_nsec == UTIME_OMIT && tv[1].tv_nsec == UTIME_OMIT)
- return 0;
- }
- return do_utimes(dfd, filename, t ? tv : NULL, flags);
-}
-
-COMPAT_SYSCALL_DEFINE3(futimesat, unsigned int, dfd, const char __user *, filename, struct compat_timeval __user *, t)
-{
- struct timespec tv[2];
-
- if (t) {
- if (get_user(tv[0].tv_sec, &t[0].tv_sec) ||
- get_user(tv[0].tv_nsec, &t[0].tv_usec) ||
- get_user(tv[1].tv_sec, &t[1].tv_sec) ||
- get_user(tv[1].tv_nsec, &t[1].tv_usec))
- return -EFAULT;
- if (tv[0].tv_nsec >= 1000000 || tv[0].tv_nsec < 0 ||
- tv[1].tv_nsec >= 1000000 || tv[1].tv_nsec < 0)
- return -EINVAL;
- tv[0].tv_nsec *= 1000;
- tv[1].tv_nsec *= 1000;
- }
- return do_utimes(dfd, filename, t ? tv : NULL, 0);
-}
-
-COMPAT_SYSCALL_DEFINE2(utimes, const char __user *, filename, struct compat_timeval __user *, t)
-{
- return compat_sys_futimesat(AT_FDCWD, filename, t);
-}
-
-static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf)
-{
- struct compat_stat tmp;
-
- if (!old_valid_dev(stat->dev) || !old_valid_dev(stat->rdev))
- return -EOVERFLOW;
-
- memset(&tmp, 0, sizeof(tmp));
- tmp.st_dev = old_encode_dev(stat->dev);
- tmp.st_ino = stat->ino;
- if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino)
- return -EOVERFLOW;
- tmp.st_mode = stat->mode;
- tmp.st_nlink = stat->nlink;
- if (tmp.st_nlink != stat->nlink)
- return -EOVERFLOW;
- SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid));
- SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid));
- tmp.st_rdev = old_encode_dev(stat->rdev);
- if ((u64) stat->size > MAX_NON_LFS)
- return -EOVERFLOW;
- tmp.st_size = stat->size;
- tmp.st_atime = stat->atime.tv_sec;
- tmp.st_atime_nsec = stat->atime.tv_nsec;
- tmp.st_mtime = stat->mtime.tv_sec;
- tmp.st_mtime_nsec = stat->mtime.tv_nsec;
- tmp.st_ctime = stat->ctime.tv_sec;
- tmp.st_ctime_nsec = stat->ctime.tv_nsec;
- tmp.st_blocks = stat->blocks;
- tmp.st_blksize = stat->blksize;
- return copy_to_user(ubuf, &tmp, sizeof(tmp)) ? -EFAULT : 0;
-}
-
-COMPAT_SYSCALL_DEFINE2(newstat, const char __user *, filename,
- struct compat_stat __user *, statbuf)
-{
- struct kstat stat;
- int error;
-
- error = vfs_stat(filename, &stat);
- if (error)
- return error;
- return cp_compat_stat(&stat, statbuf);
-}
-
-COMPAT_SYSCALL_DEFINE2(newlstat, const char __user *, filename,
- struct compat_stat __user *, statbuf)
-{
- struct kstat stat;
- int error;
-
- error = vfs_lstat(filename, &stat);
- if (error)
- return error;
- return cp_compat_stat(&stat, statbuf);
-}
-
-#ifndef __ARCH_WANT_STAT64
-COMPAT_SYSCALL_DEFINE4(newfstatat, unsigned int, dfd,
- const char __user *, filename,
- struct compat_stat __user *, statbuf, int, flag)
-{
- struct kstat stat;
- int error;
-
- error = vfs_fstatat(dfd, filename, &stat, flag);
- if (error)
- return error;
- return cp_compat_stat(&stat, statbuf);
-}
-#endif
-
-COMPAT_SYSCALL_DEFINE2(newfstat, unsigned int, fd,
- struct compat_stat __user *, statbuf)
-{
- struct kstat stat;
- int error = vfs_fstat(fd, &stat);
-
- if (!error)
- error = cp_compat_stat(&stat, statbuf);
- return error;
-}
-
-static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs *kbuf)
-{
-
- if (sizeof ubuf->f_blocks == 4) {
- if ((kbuf->f_blocks | kbuf->f_bfree | kbuf->f_bavail |
- kbuf->f_bsize | kbuf->f_frsize) & 0xffffffff00000000ULL)
- return -EOVERFLOW;
- /* f_files and f_ffree may be -1; it's okay
- * to stuff that into 32 bits */
- if (kbuf->f_files != 0xffffffffffffffffULL
- && (kbuf->f_files & 0xffffffff00000000ULL))
- return -EOVERFLOW;
- if (kbuf->f_ffree != 0xffffffffffffffffULL
- && (kbuf->f_ffree & 0xffffffff00000000ULL))
- return -EOVERFLOW;
- }
- if (!access_ok(VERIFY_WRITE, ubuf, sizeof(*ubuf)) ||
- __put_user(kbuf->f_type, &ubuf->f_type) ||
- __put_user(kbuf->f_bsize, &ubuf->f_bsize) ||
- __put_user(kbuf->f_blocks, &ubuf->f_blocks) ||
- __put_user(kbuf->f_bfree, &ubuf->f_bfree) ||
- __put_user(kbuf->f_bavail, &ubuf->f_bavail) ||
- __put_user(kbuf->f_files, &ubuf->f_files) ||
- __put_user(kbuf->f_ffree, &ubuf->f_ffree) ||
- __put_user(kbuf->f_namelen, &ubuf->f_namelen) ||
- __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) ||
- __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) ||
- __put_user(kbuf->f_frsize, &ubuf->f_frsize) ||
- __put_user(kbuf->f_flags, &ubuf->f_flags) ||
- __clear_user(ubuf->f_spare, sizeof(ubuf->f_spare)))
- return -EFAULT;
- return 0;
-}
-
-/*
- * The following statfs calls are copies of code from fs/statfs.c and
- * should be checked against those from time to time
- */
-COMPAT_SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct compat_statfs __user *, buf)
-{
- struct kstatfs tmp;
- int error = user_statfs(pathname, &tmp);
- if (!error)
- error = put_compat_statfs(buf, &tmp);
- return error;
-}
-
-COMPAT_SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct compat_statfs __user *, buf)
-{
- struct kstatfs tmp;
- int error = fd_statfs(fd, &tmp);
- if (!error)
- error = put_compat_statfs(buf, &tmp);
- return error;
-}
-
-static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstatfs *kbuf)
-{
- if (sizeof(ubuf->f_bsize) == 4) {
- if ((kbuf->f_type | kbuf->f_bsize | kbuf->f_namelen |
- kbuf->f_frsize | kbuf->f_flags) & 0xffffffff00000000ULL)
- return -EOVERFLOW;
- /* f_files and f_ffree may be -1; it's okay
- * to stuff that into 32 bits */
- if (kbuf->f_files != 0xffffffffffffffffULL
- && (kbuf->f_files & 0xffffffff00000000ULL))
- return -EOVERFLOW;
- if (kbuf->f_ffree != 0xffffffffffffffffULL
- && (kbuf->f_ffree & 0xffffffff00000000ULL))
- return -EOVERFLOW;
- }
- if (!access_ok(VERIFY_WRITE, ubuf, sizeof(*ubuf)) ||
- __put_user(kbuf->f_type, &ubuf->f_type) ||
- __put_user(kbuf->f_bsize, &ubuf->f_bsize) ||
- __put_user(kbuf->f_blocks, &ubuf->f_blocks) ||
- __put_user(kbuf->f_bfree, &ubuf->f_bfree) ||
- __put_user(kbuf->f_bavail, &ubuf->f_bavail) ||
- __put_user(kbuf->f_files, &ubuf->f_files) ||
- __put_user(kbuf->f_ffree, &ubuf->f_ffree) ||
- __put_user(kbuf->f_namelen, &ubuf->f_namelen) ||
- __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) ||
- __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) ||
- __put_user(kbuf->f_frsize, &ubuf->f_frsize) ||
- __put_user(kbuf->f_flags, &ubuf->f_flags) ||
- __clear_user(ubuf->f_spare, sizeof(ubuf->f_spare)))
- return -EFAULT;
- return 0;
-}
-
-COMPAT_SYSCALL_DEFINE3(statfs64, const char __user *, pathname, compat_size_t, sz, struct compat_statfs64 __user *, buf)
-{
- struct kstatfs tmp;
- int error;
-
- if (sz != sizeof(*buf))
- return -EINVAL;
-
- error = user_statfs(pathname, &tmp);
- if (!error)
- error = put_compat_statfs64(buf, &tmp);
- return error;
-}
-
-COMPAT_SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, compat_size_t, sz, struct compat_statfs64 __user *, buf)
-{
- struct kstatfs tmp;
- int error;
-
- if (sz != sizeof(*buf))
- return -EINVAL;
-
- error = fd_statfs(fd, &tmp);
- if (!error)
- error = put_compat_statfs64(buf, &tmp);
- return error;
-}
-
-/*
- * This is a copy of sys_ustat, just dealing with a structure layout.
- * Given how simple this syscall is that apporach is more maintainable
- * than the various conversion hacks.
- */
-COMPAT_SYSCALL_DEFINE2(ustat, unsigned, dev, struct compat_ustat __user *, u)
-{
- struct compat_ustat tmp;
- struct kstatfs sbuf;
- int err = vfs_ustat(new_decode_dev(dev), &sbuf);
- if (err)
- return err;
-
- memset(&tmp, 0, sizeof(struct compat_ustat));
- tmp.f_tfree = sbuf.f_bfree;
- tmp.f_tinode = sbuf.f_ffree;
- if (copy_to_user(u, &tmp, sizeof(struct compat_ustat)))
- return -EFAULT;
- return 0;
-}
-
-static int get_compat_flock(struct flock *kfl, struct compat_flock __user *ufl)
-{
- if (!access_ok(VERIFY_READ, ufl, sizeof(*ufl)) ||
- __get_user(kfl->l_type, &ufl->l_type) ||
- __get_user(kfl->l_whence, &ufl->l_whence) ||
- __get_user(kfl->l_start, &ufl->l_start) ||
- __get_user(kfl->l_len, &ufl->l_len) ||
- __get_user(kfl->l_pid, &ufl->l_pid))
- return -EFAULT;
- return 0;
-}
-
-static int put_compat_flock(struct flock *kfl, struct compat_flock __user *ufl)
-{
- if (!access_ok(VERIFY_WRITE, ufl, sizeof(*ufl)) ||
- __put_user(kfl->l_type, &ufl->l_type) ||
- __put_user(kfl->l_whence, &ufl->l_whence) ||
- __put_user(kfl->l_start, &ufl->l_start) ||
- __put_user(kfl->l_len, &ufl->l_len) ||
- __put_user(kfl->l_pid, &ufl->l_pid))
- return -EFAULT;
- return 0;
-}
-
-#ifndef HAVE_ARCH_GET_COMPAT_FLOCK64
-static int get_compat_flock64(struct flock *kfl, struct compat_flock64 __user *ufl)
-{
- if (!access_ok(VERIFY_READ, ufl, sizeof(*ufl)) ||
- __get_user(kfl->l_type, &ufl->l_type) ||
- __get_user(kfl->l_whence, &ufl->l_whence) ||
- __get_user(kfl->l_start, &ufl->l_start) ||
- __get_user(kfl->l_len, &ufl->l_len) ||
- __get_user(kfl->l_pid, &ufl->l_pid))
- return -EFAULT;
- return 0;
-}
-#endif
-
-#ifndef HAVE_ARCH_PUT_COMPAT_FLOCK64
-static int put_compat_flock64(struct flock *kfl, struct compat_flock64 __user *ufl)
-{
- if (!access_ok(VERIFY_WRITE, ufl, sizeof(*ufl)) ||
- __put_user(kfl->l_type, &ufl->l_type) ||
- __put_user(kfl->l_whence, &ufl->l_whence) ||
- __put_user(kfl->l_start, &ufl->l_start) ||
- __put_user(kfl->l_len, &ufl->l_len) ||
- __put_user(kfl->l_pid, &ufl->l_pid))
- return -EFAULT;
- return 0;
-}
-#endif
-
-static unsigned int
-convert_fcntl_cmd(unsigned int cmd)
-{
- switch (cmd) {
- case F_GETLK64:
- return F_GETLK;
- case F_SETLK64:
- return F_SETLK;
- case F_SETLKW64:
- return F_SETLKW;
- }
-
- return cmd;
-}
-
-COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
- compat_ulong_t, arg)
-{
- mm_segment_t old_fs;
- struct flock f;
- long ret;
- unsigned int conv_cmd;
-
- switch (cmd) {
- case F_GETLK:
- case F_SETLK:
- case F_SETLKW:
- ret = get_compat_flock(&f, compat_ptr(arg));
- if (ret != 0)
- break;
- old_fs = get_fs();
- set_fs(KERNEL_DS);
- ret = sys_fcntl(fd, cmd, (unsigned long)&f);
- set_fs(old_fs);
- if (cmd == F_GETLK && ret == 0) {
- /* GETLK was successful and we need to return the data...
- * but it needs to fit in the compat structure.
- * l_start shouldn't be too big, unless the original
- * start + end is greater than COMPAT_OFF_T_MAX, in which
- * case the app was asking for trouble, so we return
- * -EOVERFLOW in that case.
- * l_len could be too big, in which case we just truncate it,
- * and only allow the app to see that part of the conflicting
- * lock that might make sense to it anyway
- */
-
- if (f.l_start > COMPAT_OFF_T_MAX)
- ret = -EOVERFLOW;
- if (f.l_len > COMPAT_OFF_T_MAX)
- f.l_len = COMPAT_OFF_T_MAX;
- if (ret == 0)
- ret = put_compat_flock(&f, compat_ptr(arg));
- }
- break;
-
- case F_GETLK64:
- case F_SETLK64:
- case F_SETLKW64:
- case F_OFD_GETLK:
- case F_OFD_SETLK:
- case F_OFD_SETLKW:
- ret = get_compat_flock64(&f, compat_ptr(arg));
- if (ret != 0)
- break;
- old_fs = get_fs();
- set_fs(KERNEL_DS);
- conv_cmd = convert_fcntl_cmd(cmd);
- ret = sys_fcntl(fd, conv_cmd, (unsigned long)&f);
- set_fs(old_fs);
- if ((conv_cmd == F_GETLK || conv_cmd == F_OFD_GETLK) && ret == 0) {
- /* need to return lock information - see above for commentary */
- if (f.l_start > COMPAT_LOFF_T_MAX)
- ret = -EOVERFLOW;
- if (f.l_len > COMPAT_LOFF_T_MAX)
- f.l_len = COMPAT_LOFF_T_MAX;
- if (ret == 0)
- ret = put_compat_flock64(&f, compat_ptr(arg));
- }
- break;
-
- default:
- ret = sys_fcntl(fd, cmd, arg);
- break;
- }
- return ret;
-}
-
-COMPAT_SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd,
- compat_ulong_t, arg)
-{
- switch (cmd) {
- case F_GETLK64:
- case F_SETLK64:
- case F_SETLKW64:
- case F_OFD_GETLK:
- case F_OFD_SETLK:
- case F_OFD_SETLKW:
- return -EINVAL;
- }
- return compat_sys_fcntl64(fd, cmd, arg);
-}
-
-/* A write operation does a read from user space and vice versa */
-#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)
-
-ssize_t compat_rw_copy_check_uvector(int type,
- const struct compat_iovec __user *uvector, unsigned long nr_segs,
- unsigned long fast_segs, struct iovec *fast_pointer,
- struct iovec **ret_pointer)
-{
- compat_ssize_t tot_len;
- struct iovec *iov = *ret_pointer = fast_pointer;
- ssize_t ret = 0;
- int seg;
-
- /*
- * SuS says "The readv() function *may* fail if the iovcnt argument
- * was less than or equal to 0, or greater than {IOV_MAX}. Linux has
- * traditionally returned zero for zero segments, so...
- */
- if (nr_segs == 0)
- goto out;
-
- ret = -EINVAL;
- if (nr_segs > UIO_MAXIOV)
- goto out;
- if (nr_segs > fast_segs) {
- ret = -ENOMEM;
- iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
- if (iov == NULL)
- goto out;
- }
- *ret_pointer = iov;
-
- ret = -EFAULT;
- if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector)))
- goto out;
-
- /*
- * Single unix specification:
- * We should -EINVAL if an element length is not >= 0 and fitting an
- * ssize_t.
- *
- * In Linux, the total length is limited to MAX_RW_COUNT, there is
- * no overflow possibility.
- */
- tot_len = 0;
- ret = -EINVAL;
- for (seg = 0; seg < nr_segs; seg++) {
- compat_uptr_t buf;
- compat_ssize_t len;
-
- if (__get_user(len, &uvector->iov_len) ||
- __get_user(buf, &uvector->iov_base)) {
- ret = -EFAULT;
- goto out;
- }
- if (len < 0) /* size_t not fitting in compat_ssize_t .. */
- goto out;
- if (type >= 0 &&
- !access_ok(vrfy_dir(type), compat_ptr(buf), len)) {
- ret = -EFAULT;
- goto out;
- }
- if (len > MAX_RW_COUNT - tot_len)
- len = MAX_RW_COUNT - tot_len;
- tot_len += len;
- iov->iov_base = compat_ptr(buf);
- iov->iov_len = (compat_size_t) len;
- uvector++;
- iov++;
- }
- ret = tot_len;
-
-out:
- return ret;
-}
-
struct compat_ncp_mount_data {
compat_int_t version;
compat_uint_t ncp_fd;
@@ -744,653 +203,3 @@ COMPAT_SYSCALL_DEFINE5(mount, const char __user *, dev_name,
out:
return retval;
}
-
-struct compat_old_linux_dirent {
- compat_ulong_t d_ino;
- compat_ulong_t d_offset;
- unsigned short d_namlen;
- char d_name[1];
-};
-
-struct compat_readdir_callback {
- struct dir_context ctx;
- struct compat_old_linux_dirent __user *dirent;
- int result;
-};
-
-static int compat_fillonedir(struct dir_context *ctx, const char *name,
- int namlen, loff_t offset, u64 ino,
- unsigned int d_type)
-{
- struct compat_readdir_callback *buf =
- container_of(ctx, struct compat_readdir_callback, ctx);
- struct compat_old_linux_dirent __user *dirent;
- compat_ulong_t d_ino;
-
- if (buf->result)
- return -EINVAL;
- d_ino = ino;
- if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
- buf->result = -EOVERFLOW;
- return -EOVERFLOW;
- }
- buf->result++;
- dirent = buf->dirent;
- if (!access_ok(VERIFY_WRITE, dirent,
- (unsigned long)(dirent->d_name + namlen + 1) -
- (unsigned long)dirent))
- goto efault;
- if ( __put_user(d_ino, &dirent->d_ino) ||
- __put_user(offset, &dirent->d_offset) ||
- __put_user(namlen, &dirent->d_namlen) ||
- __copy_to_user(dirent->d_name, name, namlen) ||
- __put_user(0, dirent->d_name + namlen))
- goto efault;
- return 0;
-efault:
- buf->result = -EFAULT;
- return -EFAULT;
-}
-
-COMPAT_SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
- struct compat_old_linux_dirent __user *, dirent, unsigned int, count)
-{
- int error;
- struct fd f = fdget_pos(fd);
- struct compat_readdir_callback buf = {
- .ctx.actor = compat_fillonedir,
- .dirent = dirent
- };
-
- if (!f.file)
- return -EBADF;
-
- error = iterate_dir(f.file, &buf.ctx);
- if (buf.result)
- error = buf.result;
-
- fdput_pos(f);
- return error;
-}
-
-struct compat_linux_dirent {
- compat_ulong_t d_ino;
- compat_ulong_t d_off;
- unsigned short d_reclen;
- char d_name[1];
-};
-
-struct compat_getdents_callback {
- struct dir_context ctx;
- struct compat_linux_dirent __user *current_dir;
- struct compat_linux_dirent __user *previous;
- int count;
- int error;
-};
-
-static int compat_filldir(struct dir_context *ctx, const char *name, int namlen,
- loff_t offset, u64 ino, unsigned int d_type)
-{
- struct compat_linux_dirent __user * dirent;
- struct compat_getdents_callback *buf =
- container_of(ctx, struct compat_getdents_callback, ctx);
- compat_ulong_t d_ino;
- int reclen = ALIGN(offsetof(struct compat_linux_dirent, d_name) +
- namlen + 2, sizeof(compat_long_t));
-
- buf->error = -EINVAL; /* only used if we fail.. */
- if (reclen > buf->count)
- return -EINVAL;
- d_ino = ino;
- if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
- buf->error = -EOVERFLOW;
- return -EOVERFLOW;
- }
- dirent = buf->previous;
- if (dirent) {
- if (signal_pending(current))
- return -EINTR;
- if (__put_user(offset, &dirent->d_off))
- goto efault;
- }
- dirent = buf->current_dir;
- if (__put_user(d_ino, &dirent->d_ino))
- goto efault;
- if (__put_user(reclen, &dirent->d_reclen))
- goto efault;
- if (copy_to_user(dirent->d_name, name, namlen))
- goto efault;
- if (__put_user(0, dirent->d_name + namlen))
- goto efault;
- if (__put_user(d_type, (char __user *) dirent + reclen - 1))
- goto efault;
- buf->previous = dirent;
- dirent = (void __user *)dirent + reclen;
- buf->current_dir = dirent;
- buf->count -= reclen;
- return 0;
-efault:
- buf->error = -EFAULT;
- return -EFAULT;
-}
-
-COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd,
- struct compat_linux_dirent __user *, dirent, unsigned int, count)
-{
- struct fd f;
- struct compat_linux_dirent __user * lastdirent;
- struct compat_getdents_callback buf = {
- .ctx.actor = compat_filldir,
- .current_dir = dirent,
- .count = count
- };
- int error;
-
- if (!access_ok(VERIFY_WRITE, dirent, count))
- return -EFAULT;
-
- f = fdget_pos(fd);
- if (!f.file)
- return -EBADF;
-
- error = iterate_dir(f.file, &buf.ctx);
- if (error >= 0)
- error = buf.error;
- lastdirent = buf.previous;
- if (lastdirent) {
- if (put_user(buf.ctx.pos, &lastdirent->d_off))
- error = -EFAULT;
- else
- error = count - buf.count;
- }
- fdput_pos(f);
- return error;
-}
-
-#ifdef __ARCH_WANT_COMPAT_SYS_GETDENTS64
-
-struct compat_getdents_callback64 {
- struct dir_context ctx;
- struct linux_dirent64 __user *current_dir;
- struct linux_dirent64 __user *previous;
- int count;
- int error;
-};
-
-static int compat_filldir64(struct dir_context *ctx, const char *name,
- int namlen, loff_t offset, u64 ino,
- unsigned int d_type)
-{
- struct linux_dirent64 __user *dirent;
- struct compat_getdents_callback64 *buf =
- container_of(ctx, struct compat_getdents_callback64, ctx);
- int reclen = ALIGN(offsetof(struct linux_dirent64, d_name) + namlen + 1,
- sizeof(u64));
- u64 off;
-
- buf->error = -EINVAL; /* only used if we fail.. */
- if (reclen > buf->count)
- return -EINVAL;
- dirent = buf->previous;
-
- if (dirent) {
- if (signal_pending(current))
- return -EINTR;
- if (__put_user_unaligned(offset, &dirent->d_off))
- goto efault;
- }
- dirent = buf->current_dir;
- if (__put_user_unaligned(ino, &dirent->d_ino))
- goto efault;
- off = 0;
- if (__put_user_unaligned(off, &dirent->d_off))
- goto efault;
- if (__put_user(reclen, &dirent->d_reclen))
- goto efault;
- if (__put_user(d_type, &dirent->d_type))
- goto efault;
- if (copy_to_user(dirent->d_name, name, namlen))
- goto efault;
- if (__put_user(0, dirent->d_name + namlen))
- goto efault;
- buf->previous = dirent;
- dirent = (void __user *)dirent + reclen;
- buf->current_dir = dirent;
- buf->count -= reclen;
- return 0;
-efault:
- buf->error = -EFAULT;
- return -EFAULT;
-}
-
-COMPAT_SYSCALL_DEFINE3(getdents64, unsigned int, fd,
- struct linux_dirent64 __user *, dirent, unsigned int, count)
-{
- struct fd f;
- struct linux_dirent64 __user * lastdirent;
- struct compat_getdents_callback64 buf = {
- .ctx.actor = compat_filldir64,
- .current_dir = dirent,
- .count = count
- };
- int error;
-
- if (!access_ok(VERIFY_WRITE, dirent, count))
- return -EFAULT;
-
- f = fdget_pos(fd);
- if (!f.file)
- return -EBADF;
-
- error = iterate_dir(f.file, &buf.ctx);
- if (error >= 0)
- error = buf.error;
- lastdirent = buf.previous;
- if (lastdirent) {
- typeof(lastdirent->d_off) d_off = buf.ctx.pos;
- if (__put_user_unaligned(d_off, &lastdirent->d_off))
- error = -EFAULT;
- else
- error = count - buf.count;
- }
- fdput_pos(f);
- return error;
-}
-#endif /* __ARCH_WANT_COMPAT_SYS_GETDENTS64 */
-
-/*
- * Exactly like fs/open.c:sys_open(), except that it doesn't set the
- * O_LARGEFILE flag.
- */
-COMPAT_SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
-{
- return do_sys_open(AT_FDCWD, filename, flags, mode);
-}
-
-/*
- * Exactly like fs/open.c:sys_openat(), except that it doesn't set the
- * O_LARGEFILE flag.
- */
-COMPAT_SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, umode_t, mode)
-{
- return do_sys_open(dfd, filename, flags, mode);
-}
-
-#define __COMPAT_NFDBITS (8 * sizeof(compat_ulong_t))
-
-static int poll_select_copy_remaining(struct timespec *end_time, void __user *p,
- int timeval, int ret)
-{
- struct timespec ts;
-
- if (!p)
- return ret;
-
- if (current->personality & STICKY_TIMEOUTS)
- goto sticky;
-
- /* No update for zero timeout */
- if (!end_time->tv_sec && !end_time->tv_nsec)
- return ret;
-
- ktime_get_ts(&ts);
- ts = timespec_sub(*end_time, ts);
- if (ts.tv_sec < 0)
- ts.tv_sec = ts.tv_nsec = 0;
-
- if (timeval) {
- struct compat_timeval rtv;
-
- rtv.tv_sec = ts.tv_sec;
- rtv.tv_usec = ts.tv_nsec / NSEC_PER_USEC;
-
- if (!copy_to_user(p, &rtv, sizeof(rtv)))
- return ret;
- } else {
- struct compat_timespec rts;
-
- rts.tv_sec = ts.tv_sec;
- rts.tv_nsec = ts.tv_nsec;
-
- if (!copy_to_user(p, &rts, sizeof(rts)))
- return ret;
- }
- /*
- * If an application puts its timeval in read-only memory, we
- * don't want the Linux-specific update to the timeval to
- * cause a fault after the select has completed
- * successfully. However, because we're not updating the
- * timeval, we can't restart the system call.
- */
-
-sticky:
- if (ret == -ERESTARTNOHAND)
- ret = -EINTR;
- return ret;
-}
-
-/*
- * Ooo, nasty. We need here to frob 32-bit unsigned longs to
- * 64-bit unsigned longs.
- */
-static
-int compat_get_fd_set(unsigned long nr, compat_ulong_t __user *ufdset,
- unsigned long *fdset)
-{
- nr = DIV_ROUND_UP(nr, __COMPAT_NFDBITS);
- if (ufdset) {
- unsigned long odd;
-
- if (!access_ok(VERIFY_WRITE, ufdset, nr*sizeof(compat_ulong_t)))
- return -EFAULT;
-
- odd = nr & 1UL;
- nr &= ~1UL;
- while (nr) {
- unsigned long h, l;
- if (__get_user(l, ufdset) || __get_user(h, ufdset+1))
- return -EFAULT;
- ufdset += 2;
- *fdset++ = h << 32 | l;
- nr -= 2;
- }
- if (odd && __get_user(*fdset, ufdset))
- return -EFAULT;
- } else {
- /* Tricky, must clear full unsigned long in the
- * kernel fdset at the end, this makes sure that
- * actually happens.
- */
- memset(fdset, 0, ((nr + 1) & ~1)*sizeof(compat_ulong_t));
- }
- return 0;
-}
-
-static
-int compat_set_fd_set(unsigned long nr, compat_ulong_t __user *ufdset,
- unsigned long *fdset)
-{
- unsigned long odd;
- nr = DIV_ROUND_UP(nr, __COMPAT_NFDBITS);
-
- if (!ufdset)
- return 0;
-
- odd = nr & 1UL;
- nr &= ~1UL;
- while (nr) {
- unsigned long h, l;
- l = *fdset++;
- h = l >> 32;
- if (__put_user(l, ufdset) || __put_user(h, ufdset+1))
- return -EFAULT;
- ufdset += 2;
- nr -= 2;
- }
- if (odd && __put_user(*fdset, ufdset))
- return -EFAULT;
- return 0;
-}
-
-
-/*
- * This is a virtual copy of sys_select from fs/select.c and probably
- * should be compared to it from time to time
- */
-
-/*
- * We can actually return ERESTARTSYS instead of EINTR, but I'd
- * like to be certain this leads to no problems. So I return
- * EINTR just for safety.
- *
- * Update: ERESTARTSYS breaks at least the xview clock binary, so
- * I'm trying ERESTARTNOHAND which restart only when you want to.
- */
-int compat_core_sys_select(int n, compat_ulong_t __user *inp,
- compat_ulong_t __user *outp, compat_ulong_t __user *exp,
- struct timespec *end_time)
-{
- fd_set_bits fds;
- void *bits;
- int size, max_fds, ret = -EINVAL;
- struct fdtable *fdt;
- long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];
-
- if (n < 0)
- goto out_nofds;
-
- /* max_fds can increase, so grab it once to avoid race */
- rcu_read_lock();
- fdt = files_fdtable(current->files);
- max_fds = fdt->max_fds;
- rcu_read_unlock();
- if (n > max_fds)
- n = max_fds;
-
- /*
- * We need 6 bitmaps (in/out/ex for both incoming and outgoing),
- * since we used fdset we need to allocate memory in units of
- * long-words.
- */
- size = FDS_BYTES(n);
- bits = stack_fds;
- if (size > sizeof(stack_fds) / 6) {
- bits = kmalloc(6 * size, GFP_KERNEL);
- ret = -ENOMEM;
- if (!bits)
- goto out_nofds;
- }
- fds.in = (unsigned long *) bits;
- fds.out = (unsigned long *) (bits + size);
- fds.ex = (unsigned long *) (bits + 2*size);
- fds.res_in = (unsigned long *) (bits + 3*size);
- fds.res_out = (unsigned long *) (bits + 4*size);
- fds.res_ex = (unsigned long *) (bits + 5*size);
-
- if ((ret = compat_get_fd_set(n, inp, fds.in)) ||
- (ret = compat_get_fd_set(n, outp, fds.out)) ||
- (ret = compat_get_fd_set(n, exp, fds.ex)))
- goto out;
- zero_fd_set(n, fds.res_in);
- zero_fd_set(n, fds.res_out);
- zero_fd_set(n, fds.res_ex);
-
- ret = do_select(n, &fds, end_time);
-
- if (ret < 0)
- goto out;
- if (!ret) {
- ret = -ERESTARTNOHAND;
- if (signal_pending(current))
- goto out;
- ret = 0;
- }
-
- if (compat_set_fd_set(n, inp, fds.res_in) ||
- compat_set_fd_set(n, outp, fds.res_out) ||
- compat_set_fd_set(n, exp, fds.res_ex))
- ret = -EFAULT;
-out:
- if (bits != stack_fds)
- kfree(bits);
-out_nofds:
- return ret;
-}
-
-COMPAT_SYSCALL_DEFINE5(select, int, n, compat_ulong_t __user *, inp,
- compat_ulong_t __user *, outp, compat_ulong_t __user *, exp,
- struct compat_timeval __user *, tvp)
-{
- struct timespec end_time, *to = NULL;
- struct compat_timeval tv;
- int ret;
-
- if (tvp) {
- if (copy_from_user(&tv, tvp, sizeof(tv)))
- return -EFAULT;
-
- to = &end_time;
- if (poll_select_set_timeout(to,
- tv.tv_sec + (tv.tv_usec / USEC_PER_SEC),
- (tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC))
- return -EINVAL;
- }
-
- ret = compat_core_sys_select(n, inp, outp, exp, to);
- ret = poll_select_copy_remaining(&end_time, tvp, 1, ret);
-
- return ret;
-}
-
-struct compat_sel_arg_struct {
- compat_ulong_t n;
- compat_uptr_t inp;
- compat_uptr_t outp;
- compat_uptr_t exp;
- compat_uptr_t tvp;
-};
-
-COMPAT_SYSCALL_DEFINE1(old_select, struct compat_sel_arg_struct __user *, arg)
-{
- struct compat_sel_arg_struct a;
-
- if (copy_from_user(&a, arg, sizeof(a)))
- return -EFAULT;
- return compat_sys_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp),
- compat_ptr(a.exp), compat_ptr(a.tvp));
-}
-
-static long do_compat_pselect(int n, compat_ulong_t __user *inp,
- compat_ulong_t __user *outp, compat_ulong_t __user *exp,
- struct compat_timespec __user *tsp, compat_sigset_t __user *sigmask,
- compat_size_t sigsetsize)
-{
- compat_sigset_t ss32;
- sigset_t ksigmask, sigsaved;
- struct compat_timespec ts;
- struct timespec end_time, *to = NULL;
- int ret;
-
- if (tsp) {
- if (copy_from_user(&ts, tsp, sizeof(ts)))
- return -EFAULT;
-
- to = &end_time;
- if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
- return -EINVAL;
- }
-
- if (sigmask) {
- if (sigsetsize != sizeof(compat_sigset_t))
- return -EINVAL;
- if (copy_from_user(&ss32, sigmask, sizeof(ss32)))
- return -EFAULT;
- sigset_from_compat(&ksigmask, &ss32);
-
- sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
- sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
- }
-
- ret = compat_core_sys_select(n, inp, outp, exp, to);
- ret = poll_select_copy_remaining(&end_time, tsp, 0, ret);
-
- if (ret == -ERESTARTNOHAND) {
- /*
- * Don't restore the signal mask yet. Let do_signal() deliver
- * the signal on the way back to userspace, before the signal
- * mask is restored.
- */
- if (sigmask) {
- memcpy(&current->saved_sigmask, &sigsaved,
- sizeof(sigsaved));
- set_restore_sigmask();
- }
- } else if (sigmask)
- sigprocmask(SIG_SETMASK, &sigsaved, NULL);
-
- return ret;
-}
-
-COMPAT_SYSCALL_DEFINE6(pselect6, int, n, compat_ulong_t __user *, inp,
- compat_ulong_t __user *, outp, compat_ulong_t __user *, exp,
- struct compat_timespec __user *, tsp, void __user *, sig)
-{
- compat_size_t sigsetsize = 0;
- compat_uptr_t up = 0;
-
- if (sig) {
- if (!access_ok(VERIFY_READ, sig,
- sizeof(compat_uptr_t)+sizeof(compat_size_t)) ||
- __get_user(up, (compat_uptr_t __user *)sig) ||
- __get_user(sigsetsize,
- (compat_size_t __user *)(sig+sizeof(up))))
- return -EFAULT;
- }
- return do_compat_pselect(n, inp, outp, exp, tsp, compat_ptr(up),
- sigsetsize);
-}
-
-COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds,
- unsigned int, nfds, struct compat_timespec __user *, tsp,
- const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize)
-{
- compat_sigset_t ss32;
- sigset_t ksigmask, sigsaved;
- struct compat_timespec ts;
- struct timespec end_time, *to = NULL;
- int ret;
-
- if (tsp) {
- if (copy_from_user(&ts, tsp, sizeof(ts)))
- return -EFAULT;
-
- to = &end_time;
- if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
- return -EINVAL;
- }
-
- if (sigmask) {
- if (sigsetsize != sizeof(compat_sigset_t))
- return -EINVAL;
- if (copy_from_user(&ss32, sigmask, sizeof(ss32)))
- return -EFAULT;
- sigset_from_compat(&ksigmask, &ss32);
-
- sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
- sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
- }
-
- ret = do_sys_poll(ufds, nfds, to);
-
- /* We can restart this syscall, usually */
- if (ret == -EINTR) {
- /*
- * Don't restore the signal mask yet. Let do_signal() deliver
- * the signal on the way back to userspace, before the signal
- * mask is restored.
- */
- if (sigmask) {
- memcpy(&current->saved_sigmask, &sigsaved,
- sizeof(sigsaved));
- set_restore_sigmask();
- }
- ret = -ERESTARTNOHAND;
- } else if (sigmask)
- sigprocmask(SIG_SETMASK, &sigsaved, NULL);
-
- ret = poll_select_copy_remaining(&end_time, tsp, 0, ret);
-
- return ret;
-}
-
-#ifdef CONFIG_FHANDLE
-/*
- * Exactly like fs/open.c:sys_open_by_handle_at(), except that it
- * doesn't set the O_LARGEFILE flag.
- */
-COMPAT_SYSCALL_DEFINE3(open_by_handle_at, int, mountdirfd,
- struct file_handle __user *, handle, int, flags)
-{
- return do_handle_open(mountdirfd, handle, flags);
-}
-#endif
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 11d087b2b28e..6116d5275a3e 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -833,7 +833,7 @@ static int compat_ioctl_preallocate(struct file *file,
*/
#define XFORM(i) (((i) ^ ((i) << 27) ^ ((i) << 17)) & 0xffffffff)
-#define COMPATIBLE_IOCTL(cmd) XFORM(cmd),
+#define COMPATIBLE_IOCTL(cmd) XFORM((u32)cmd),
/* ioctl should not be warned about even if it's not implemented.
Valid reasons to use this:
- It is implemented with ->compat_ioctl on some device, but programs
diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c
index 37b49894c762..d1bb02b1ee58 100644
--- a/fs/crypto/fname.c
+++ b/fs/crypto/fname.c
@@ -159,6 +159,8 @@ static int fname_decrypt(struct inode *inode,
static const char *lookup_table =
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,";
+#define BASE64_CHARS(nbytes) DIV_ROUND_UP((nbytes) * 4, 3)
+
/**
* digest_encode() -
*
@@ -230,11 +232,14 @@ EXPORT_SYMBOL(fscrypt_fname_encrypted_size);
int fscrypt_fname_alloc_buffer(const struct inode *inode,
u32 ilen, struct fscrypt_str *crypto_str)
{
- unsigned int olen = fscrypt_fname_encrypted_size(inode, ilen);
+ u32 olen = fscrypt_fname_encrypted_size(inode, ilen);
+ const u32 max_encoded_len =
+ max_t(u32, BASE64_CHARS(FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE),
+ 1 + BASE64_CHARS(sizeof(struct fscrypt_digested_name)));
crypto_str->len = olen;
- if (olen < FS_FNAME_CRYPTO_DIGEST_SIZE * 2)
- olen = FS_FNAME_CRYPTO_DIGEST_SIZE * 2;
+ olen = max(olen, max_encoded_len);
+
/*
* Allocated buffer can hold one more character to null-terminate the
* string
@@ -266,6 +271,10 @@ EXPORT_SYMBOL(fscrypt_fname_free_buffer);
*
* The caller must have allocated sufficient memory for the @oname string.
*
+ * If the key is available, we'll decrypt the disk name; otherwise, we'll encode
+ * it for presentation. Short names are directly base64-encoded, while long
+ * names are encoded in fscrypt_digested_name format.
+ *
* Return: 0 on success, -errno on failure
*/
int fscrypt_fname_disk_to_usr(struct inode *inode,
@@ -274,7 +283,7 @@ int fscrypt_fname_disk_to_usr(struct inode *inode,
struct fscrypt_str *oname)
{
const struct qstr qname = FSTR_TO_QSTR(iname);
- char buf[24];
+ struct fscrypt_digested_name digested_name;
if (fscrypt_is_dot_dotdot(&qname)) {
oname->name[0] = '.';
@@ -289,20 +298,24 @@ int fscrypt_fname_disk_to_usr(struct inode *inode,
if (inode->i_crypt_info)
return fname_decrypt(inode, iname, oname);
- if (iname->len <= FS_FNAME_CRYPTO_DIGEST_SIZE) {
+ if (iname->len <= FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE) {
oname->len = digest_encode(iname->name, iname->len,
oname->name);
return 0;
}
if (hash) {
- memcpy(buf, &hash, 4);
- memcpy(buf + 4, &minor_hash, 4);
+ digested_name.hash = hash;
+ digested_name.minor_hash = minor_hash;
} else {
- memset(buf, 0, 8);
+ digested_name.hash = 0;
+ digested_name.minor_hash = 0;
}
- memcpy(buf + 8, iname->name + iname->len - 16, 16);
+ memcpy(digested_name.digest,
+ FSCRYPT_FNAME_DIGEST(iname->name, iname->len),
+ FSCRYPT_FNAME_DIGEST_SIZE);
oname->name[0] = '_';
- oname->len = 1 + digest_encode(buf, 24, oname->name + 1);
+ oname->len = 1 + digest_encode((const char *)&digested_name,
+ sizeof(digested_name), oname->name + 1);
return 0;
}
EXPORT_SYMBOL(fscrypt_fname_disk_to_usr);
@@ -336,10 +349,35 @@ int fscrypt_fname_usr_to_disk(struct inode *inode,
}
EXPORT_SYMBOL(fscrypt_fname_usr_to_disk);
+/**
+ * fscrypt_setup_filename() - prepare to search a possibly encrypted directory
+ * @dir: the directory that will be searched
+ * @iname: the user-provided filename being searched for
+ * @lookup: 1 if we're allowed to proceed without the key because it's
+ * ->lookup() or we're finding the dir_entry for deletion; 0 if we cannot
+ * proceed without the key because we're going to create the dir_entry.
+ * @fname: the filename information to be filled in
+ *
+ * Given a user-provided filename @iname, this function sets @fname->disk_name
+ * to the name that would be stored in the on-disk directory entry, if possible.
+ * If the directory is unencrypted this is simply @iname. Else, if we have the
+ * directory's encryption key, then @iname is the plaintext, so we encrypt it to
+ * get the disk_name.
+ *
+ * Else, for keyless @lookup operations, @iname is the presented ciphertext, so
+ * we decode it to get either the ciphertext disk_name (for short names) or the
+ * fscrypt_digested_name (for long names). Non-@lookup operations will be
+ * impossible in this case, so we fail them with ENOKEY.
+ *
+ * If successful, fscrypt_free_filename() must be called later to clean up.
+ *
+ * Return: 0 on success, -errno on failure
+ */
int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname,
int lookup, struct fscrypt_name *fname)
{
- int ret = 0, bigname = 0;
+ int ret;
+ int digested;
memset(fname, 0, sizeof(struct fscrypt_name));
fname->usr_fname = iname;
@@ -373,25 +411,37 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname,
* We don't have the key and we are doing a lookup; decode the
* user-supplied name
*/
- if (iname->name[0] == '_')
- bigname = 1;
- if ((bigname && (iname->len != 33)) || (!bigname && (iname->len > 43)))
- return -ENOENT;
+ if (iname->name[0] == '_') {
+ if (iname->len !=
+ 1 + BASE64_CHARS(sizeof(struct fscrypt_digested_name)))
+ return -ENOENT;
+ digested = 1;
+ } else {
+ if (iname->len >
+ BASE64_CHARS(FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE))
+ return -ENOENT;
+ digested = 0;
+ }
- fname->crypto_buf.name = kmalloc(32, GFP_KERNEL);
+ fname->crypto_buf.name =
+ kmalloc(max_t(size_t, FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE,
+ sizeof(struct fscrypt_digested_name)),
+ GFP_KERNEL);
if (fname->crypto_buf.name == NULL)
return -ENOMEM;
- ret = digest_decode(iname->name + bigname, iname->len - bigname,
+ ret = digest_decode(iname->name + digested, iname->len - digested,
fname->crypto_buf.name);
if (ret < 0) {
ret = -ENOENT;
goto errout;
}
fname->crypto_buf.len = ret;
- if (bigname) {
- memcpy(&fname->hash, fname->crypto_buf.name, 4);
- memcpy(&fname->minor_hash, fname->crypto_buf.name + 4, 4);
+ if (digested) {
+ const struct fscrypt_digested_name *n =
+ (const void *)fname->crypto_buf.name;
+ fname->hash = n->hash;
+ fname->minor_hash = n->minor_hash;
} else {
fname->disk_name.name = fname->crypto_buf.name;
fname->disk_name.len = fname->crypto_buf.len;
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index e39696e64494..1e1f8a361b75 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -13,8 +13,6 @@
#include <linux/fscrypt_supp.h>
-#define FS_FNAME_CRYPTO_DIGEST_SIZE 32
-
/* Encryption parameters */
#define FS_XTS_TWEAK_SIZE 16
#define FS_AES_128_ECB_KEY_SIZE 16
@@ -22,10 +20,6 @@
#define FS_AES_256_CBC_KEY_SIZE 32
#define FS_AES_256_CTS_KEY_SIZE 32
#define FS_AES_256_XTS_KEY_SIZE 64
-#define FS_MAX_KEY_SIZE 64
-
-#define FS_KEY_DESC_PREFIX "fscrypt:"
-#define FS_KEY_DESC_PREFIX_SIZE 8
#define FS_KEY_DERIVATION_NONCE_SIZE 16
@@ -51,13 +45,6 @@ struct fscrypt_context {
#define FS_ENCRYPTION_CONTEXT_FORMAT_V1 1
-/* This is passed in from userspace into the kernel keyring */
-struct fscrypt_key {
- u32 mode;
- u8 raw[FS_MAX_KEY_SIZE];
- u32 size;
-} __packed;
-
/*
* A pointer to this structure is stored in the file system's in-core
* representation of an inode.
diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c
index 8cdfddce2b34..179e578b875b 100644
--- a/fs/crypto/keyinfo.c
+++ b/fs/crypto/keyinfo.c
@@ -183,9 +183,6 @@ int fscrypt_get_encryption_info(struct inode *inode)
if (res)
return res;
- if (!inode->i_sb->s_cop->get_context)
- return -EOPNOTSUPP;
-
res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx));
if (res < 0) {
if (!fscrypt_dummy_context_enabled(inode) ||
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index 4908906d54d5..210976e7a269 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -34,9 +34,6 @@ static int create_encryption_context_from_policy(struct inode *inode,
{
struct fscrypt_context ctx;
- if (!inode->i_sb->s_cop->set_context)
- return -EOPNOTSUPP;
-
ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1;
memcpy(ctx.master_key_descriptor, policy->master_key_descriptor,
FS_KEY_DESCRIPTOR_SIZE);
@@ -87,8 +84,6 @@ int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg)
if (ret == -ENODATA) {
if (!S_ISDIR(inode->i_mode))
ret = -ENOTDIR;
- else if (!inode->i_sb->s_cop->empty_dir)
- ret = -EOPNOTSUPP;
else if (!inode->i_sb->s_cop->empty_dir(inode))
ret = -ENOTEMPTY;
else
@@ -118,8 +113,7 @@ int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg)
struct fscrypt_policy policy;
int res;
- if (!inode->i_sb->s_cop->get_context ||
- !inode->i_sb->s_cop->is_encrypted(inode))
+ if (!inode->i_sb->s_cop->is_encrypted(inode))
return -ENODATA;
res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx));
@@ -143,27 +137,61 @@ int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg)
}
EXPORT_SYMBOL(fscrypt_ioctl_get_policy);
+/**
+ * fscrypt_has_permitted_context() - is a file's encryption policy permitted
+ * within its directory?
+ *
+ * @parent: inode for parent directory
+ * @child: inode for file being looked up, opened, or linked into @parent
+ *
+ * Filesystems must call this before permitting access to an inode in a
+ * situation where the parent directory is encrypted (either before allowing
+ * ->lookup() to succeed, or for a regular file before allowing it to be opened)
+ * and before any operation that involves linking an inode into an encrypted
+ * directory, including link, rename, and cross rename. It enforces the
+ * constraint that within a given encrypted directory tree, all files use the
+ * same encryption policy. The pre-access check is needed to detect potentially
+ * malicious offline violations of this constraint, while the link and rename
+ * checks are needed to prevent online violations of this constraint.
+ *
+ * Return: 1 if permitted, 0 if forbidden. If forbidden, the caller must fail
+ * the filesystem operation with EPERM.
+ */
int fscrypt_has_permitted_context(struct inode *parent, struct inode *child)
{
- struct fscrypt_info *parent_ci, *child_ci;
+ const struct fscrypt_operations *cops = parent->i_sb->s_cop;
+ const struct fscrypt_info *parent_ci, *child_ci;
+ struct fscrypt_context parent_ctx, child_ctx;
int res;
- if ((parent == NULL) || (child == NULL)) {
- printk(KERN_ERR "parent %p child %p\n", parent, child);
- BUG_ON(1);
- }
-
/* No restrictions on file types which are never encrypted */
if (!S_ISREG(child->i_mode) && !S_ISDIR(child->i_mode) &&
!S_ISLNK(child->i_mode))
return 1;
- /* no restrictions if the parent directory is not encrypted */
- if (!parent->i_sb->s_cop->is_encrypted(parent))
+ /* No restrictions if the parent directory is unencrypted */
+ if (!cops->is_encrypted(parent))
return 1;
- /* if the child directory is not encrypted, this is always a problem */
- if (!parent->i_sb->s_cop->is_encrypted(child))
+
+ /* Encrypted directories must not contain unencrypted files */
+ if (!cops->is_encrypted(child))
return 0;
+
+ /*
+ * Both parent and child are encrypted, so verify they use the same
+ * encryption policy. Compare the fscrypt_info structs if the keys are
+ * available, otherwise retrieve and compare the fscrypt_contexts.
+ *
+ * Note that the fscrypt_context retrieval will be required frequently
+ * when accessing an encrypted directory tree without the key.
+ * Performance-wise this is not a big deal because we already don't
+ * really optimize for file access without the key (to the extent that
+ * such access is even possible), given that any attempted access
+ * already causes a fscrypt_context retrieval and keyring search.
+ *
+ * In any case, if an unexpected error occurs, fall back to "forbidden".
+ */
+
res = fscrypt_get_encryption_info(parent);
if (res)
return 0;
@@ -172,17 +200,32 @@ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child)
return 0;
parent_ci = parent->i_crypt_info;
child_ci = child->i_crypt_info;
- if (!parent_ci && !child_ci)
- return 1;
- if (!parent_ci || !child_ci)
+
+ if (parent_ci && child_ci) {
+ return memcmp(parent_ci->ci_master_key, child_ci->ci_master_key,
+ FS_KEY_DESCRIPTOR_SIZE) == 0 &&
+ (parent_ci->ci_data_mode == child_ci->ci_data_mode) &&
+ (parent_ci->ci_filename_mode ==
+ child_ci->ci_filename_mode) &&
+ (parent_ci->ci_flags == child_ci->ci_flags);
+ }
+
+ res = cops->get_context(parent, &parent_ctx, sizeof(parent_ctx));
+ if (res != sizeof(parent_ctx))
return 0;
- return (memcmp(parent_ci->ci_master_key,
- child_ci->ci_master_key,
- FS_KEY_DESCRIPTOR_SIZE) == 0 &&
- (parent_ci->ci_data_mode == child_ci->ci_data_mode) &&
- (parent_ci->ci_filename_mode == child_ci->ci_filename_mode) &&
- (parent_ci->ci_flags == child_ci->ci_flags));
+ res = cops->get_context(child, &child_ctx, sizeof(child_ctx));
+ if (res != sizeof(child_ctx))
+ return 0;
+
+ return memcmp(parent_ctx.master_key_descriptor,
+ child_ctx.master_key_descriptor,
+ FS_KEY_DESCRIPTOR_SIZE) == 0 &&
+ (parent_ctx.contents_encryption_mode ==
+ child_ctx.contents_encryption_mode) &&
+ (parent_ctx.filenames_encryption_mode ==
+ child_ctx.filenames_encryption_mode) &&
+ (parent_ctx.flags == child_ctx.flags);
}
EXPORT_SYMBOL(fscrypt_has_permitted_context);
@@ -202,9 +245,6 @@ int fscrypt_inherit_context(struct inode *parent, struct inode *child,
struct fscrypt_info *ci;
int res;
- if (!parent->i_sb->s_cop->set_context)
- return -EOPNOTSUPP;
-
res = fscrypt_get_encryption_info(parent);
if (res < 0)
return res;
diff --git a/fs/dax.c b/fs/dax.c
index 85abd741253d..66d79067eedf 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -55,32 +55,6 @@ static int __init init_dax_wait_table(void)
}
fs_initcall(init_dax_wait_table);
-static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
-{
- struct request_queue *q = bdev->bd_queue;
- long rc = -EIO;
-
- dax->addr = ERR_PTR(-EIO);
- if (blk_queue_enter(q, true) != 0)
- return rc;
-
- rc = bdev_direct_access(bdev, dax);
- if (rc < 0) {
- dax->addr = ERR_PTR(rc);
- blk_queue_exit(q);
- return rc;
- }
- return rc;
-}
-
-static void dax_unmap_atomic(struct block_device *bdev,
- const struct blk_dax_ctl *dax)
-{
- if (IS_ERR(dax->addr))
- return;
- blk_queue_exit(bdev->bd_queue);
-}
-
static int dax_is_pmd_entry(void *entry)
{
return (unsigned long)entry & RADIX_DAX_PMD;
@@ -101,26 +75,6 @@ static int dax_is_empty_entry(void *entry)
return (unsigned long)entry & RADIX_DAX_EMPTY;
}
-struct page *read_dax_sector(struct block_device *bdev, sector_t n)
-{
- struct page *page = alloc_pages(GFP_KERNEL, 0);
- struct blk_dax_ctl dax = {
- .size = PAGE_SIZE,
- .sector = n & ~((((int) PAGE_SIZE) / 512) - 1),
- };
- long rc;
-
- if (!page)
- return ERR_PTR(-ENOMEM);
-
- rc = dax_map_atomic(bdev, &dax);
- if (rc < 0)
- return ERR_PTR(rc);
- memcpy_from_pmem(page_address(page), dax.addr, PAGE_SIZE);
- dax_unmap_atomic(bdev, &dax);
- return page;
-}
-
/*
* DAX radix tree locking
*/
@@ -555,21 +509,25 @@ int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
static int dax_load_hole(struct address_space *mapping, void **entry,
struct vm_fault *vmf)
{
+ struct inode *inode = mapping->host;
struct page *page;
int ret;
/* Hole page already exists? Return it... */
if (!radix_tree_exceptional_entry(*entry)) {
page = *entry;
- goto out;
+ goto finish_fault;
}
/* This will replace locked radix tree entry with a hole page */
page = find_or_create_page(mapping, vmf->pgoff,
vmf->gfp_mask | __GFP_ZERO);
- if (!page)
- return VM_FAULT_OOM;
- out:
+ if (!page) {
+ ret = VM_FAULT_OOM;
+ goto out;
+ }
+
+finish_fault:
vmf->page = page;
ret = finish_fault(vmf);
vmf->page = NULL;
@@ -577,26 +535,37 @@ static int dax_load_hole(struct address_space *mapping, void **entry,
if (!ret) {
/* Grab reference for PTE that is now referencing the page */
get_page(page);
- return VM_FAULT_NOPAGE;
+ ret = VM_FAULT_NOPAGE;
}
+out:
+ trace_dax_load_hole(inode, vmf, ret);
return ret;
}
-static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size,
- struct page *to, unsigned long vaddr)
+static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,
+ sector_t sector, size_t size, struct page *to,
+ unsigned long vaddr)
{
- struct blk_dax_ctl dax = {
- .sector = sector,
- .size = size,
- };
- void *vto;
-
- if (dax_map_atomic(bdev, &dax) < 0)
- return PTR_ERR(dax.addr);
+ void *vto, *kaddr;
+ pgoff_t pgoff;
+ pfn_t pfn;
+ long rc;
+ int id;
+
+ rc = bdev_dax_pgoff(bdev, sector, size, &pgoff);
+ if (rc)
+ return rc;
+
+ id = dax_read_lock();
+ rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn);
+ if (rc < 0) {
+ dax_read_unlock(id);
+ return rc;
+ }
vto = kmap_atomic(to);
- copy_user_page(vto, (void __force *)dax.addr, vaddr, to);
+ copy_user_page(vto, (void __force *)kaddr, vaddr, to);
kunmap_atomic(vto);
- dax_unmap_atomic(bdev, &dax);
+ dax_read_unlock(id);
return 0;
}
@@ -764,12 +733,16 @@ unlock_pte:
}
static int dax_writeback_one(struct block_device *bdev,
- struct address_space *mapping, pgoff_t index, void *entry)
+ struct dax_device *dax_dev, struct address_space *mapping,
+ pgoff_t index, void *entry)
{
struct radix_tree_root *page_tree = &mapping->page_tree;
- struct blk_dax_ctl dax;
- void *entry2, **slot;
- int ret = 0;
+ void *entry2, **slot, *kaddr;
+ long ret = 0, id;
+ sector_t sector;
+ pgoff_t pgoff;
+ size_t size;
+ pfn_t pfn;
/*
* A page got tagged dirty in DAX mapping? Something is seriously
@@ -818,26 +791,29 @@ static int dax_writeback_one(struct block_device *bdev,
* 'entry'. This allows us to flush for PMD_SIZE and not have to
* worry about partial PMD writebacks.
*/
- dax.sector = dax_radix_sector(entry);
- dax.size = PAGE_SIZE << dax_radix_order(entry);
+ sector = dax_radix_sector(entry);
+ size = PAGE_SIZE << dax_radix_order(entry);
+
+ id = dax_read_lock();
+ ret = bdev_dax_pgoff(bdev, sector, size, &pgoff);
+ if (ret)
+ goto dax_unlock;
/*
- * We cannot hold tree_lock while calling dax_map_atomic() because it
- * eventually calls cond_resched().
+ * dax_direct_access() may sleep, so cannot hold tree_lock over
+ * its invocation.
*/
- ret = dax_map_atomic(bdev, &dax);
- if (ret < 0) {
- put_locked_mapping_entry(mapping, index, entry);
- return ret;
- }
+ ret = dax_direct_access(dax_dev, pgoff, size / PAGE_SIZE, &kaddr, &pfn);
+ if (ret < 0)
+ goto dax_unlock;
- if (WARN_ON_ONCE(ret < dax.size)) {
+ if (WARN_ON_ONCE(ret < size / PAGE_SIZE)) {
ret = -EIO;
- goto unmap;
+ goto dax_unlock;
}
- dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(dax.pfn));
- wb_cache_pmem(dax.addr, dax.size);
+ dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(pfn));
+ wb_cache_pmem(kaddr, size);
/*
* After we have flushed the cache, we can clear the dirty tag. There
* cannot be new dirty data in the pfn after the flush has completed as
@@ -847,8 +823,9 @@ static int dax_writeback_one(struct block_device *bdev,
spin_lock_irq(&mapping->tree_lock);
radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY);
spin_unlock_irq(&mapping->tree_lock);
- unmap:
- dax_unmap_atomic(bdev, &dax);
+ trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT);
+ dax_unlock:
+ dax_read_unlock(id);
put_locked_mapping_entry(mapping, index, entry);
return ret;
@@ -869,6 +846,7 @@ int dax_writeback_mapping_range(struct address_space *mapping,
struct inode *inode = mapping->host;
pgoff_t start_index, end_index;
pgoff_t indices[PAGEVEC_SIZE];
+ struct dax_device *dax_dev;
struct pagevec pvec;
bool done = false;
int i, ret = 0;
@@ -879,9 +857,15 @@ int dax_writeback_mapping_range(struct address_space *mapping,
if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL)
return 0;
+ dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
+ if (!dax_dev)
+ return -EIO;
+
start_index = wbc->range_start >> PAGE_SHIFT;
end_index = wbc->range_end >> PAGE_SHIFT;
+ trace_dax_writeback_range(inode, start_index, end_index);
+
tag_pages_for_writeback(mapping, start_index, end_index);
pagevec_init(&pvec, 0);
@@ -899,38 +883,50 @@ int dax_writeback_mapping_range(struct address_space *mapping,
break;
}
- ret = dax_writeback_one(bdev, mapping, indices[i],
- pvec.pages[i]);
+ ret = dax_writeback_one(bdev, dax_dev, mapping,
+ indices[i], pvec.pages[i]);
if (ret < 0)
- return ret;
+ goto out;
}
}
- return 0;
+out:
+ put_dax(dax_dev);
+ trace_dax_writeback_range_done(inode, start_index, end_index);
+ return (ret < 0 ? ret : 0);
}
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
static int dax_insert_mapping(struct address_space *mapping,
- struct block_device *bdev, sector_t sector, size_t size,
- void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf)
+ struct block_device *bdev, struct dax_device *dax_dev,
+ sector_t sector, size_t size, void **entryp,
+ struct vm_area_struct *vma, struct vm_fault *vmf)
{
unsigned long vaddr = vmf->address;
- struct blk_dax_ctl dax = {
- .sector = sector,
- .size = size,
- };
- void *ret;
void *entry = *entryp;
+ void *ret, *kaddr;
+ pgoff_t pgoff;
+ int id, rc;
+ pfn_t pfn;
- if (dax_map_atomic(bdev, &dax) < 0)
- return PTR_ERR(dax.addr);
- dax_unmap_atomic(bdev, &dax);
+ rc = bdev_dax_pgoff(bdev, sector, size, &pgoff);
+ if (rc)
+ return rc;
+
+ id = dax_read_lock();
+ rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn);
+ if (rc < 0) {
+ dax_read_unlock(id);
+ return rc;
+ }
+ dax_read_unlock(id);
- ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector, 0);
+ ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, 0);
if (IS_ERR(ret))
return PTR_ERR(ret);
*entryp = ret;
- return vm_insert_mixed(vma, vaddr, dax.pfn);
+ trace_dax_insert_mapping(mapping->host, vmf, ret);
+ return vm_insert_mixed(vma, vaddr, pfn);
}
/**
@@ -941,6 +937,7 @@ int dax_pfn_mkwrite(struct vm_fault *vmf)
{
struct file *file = vmf->vma->vm_file;
struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
void *entry, **slot;
pgoff_t index = vmf->pgoff;
@@ -950,6 +947,7 @@ int dax_pfn_mkwrite(struct vm_fault *vmf)
if (entry)
put_unlocked_mapping_entry(mapping, index, entry);
spin_unlock_irq(&mapping->tree_lock);
+ trace_dax_pfn_mkwrite_no_entry(inode, vmf, VM_FAULT_NOPAGE);
return VM_FAULT_NOPAGE;
}
radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
@@ -962,6 +960,7 @@ int dax_pfn_mkwrite(struct vm_fault *vmf)
*/
finish_mkwrite_fault(vmf);
put_locked_mapping_entry(mapping, index, entry);
+ trace_dax_pfn_mkwrite(inode, vmf, VM_FAULT_NOPAGE);
return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
@@ -979,24 +978,34 @@ static bool dax_range_is_aligned(struct block_device *bdev,
return true;
}
-int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
- unsigned int offset, unsigned int length)
+int __dax_zero_page_range(struct block_device *bdev,
+ struct dax_device *dax_dev, sector_t sector,
+ unsigned int offset, unsigned int size)
{
- struct blk_dax_ctl dax = {
- .sector = sector,
- .size = PAGE_SIZE,
- };
-
- if (dax_range_is_aligned(bdev, offset, length)) {
- sector_t start_sector = dax.sector + (offset >> 9);
+ if (dax_range_is_aligned(bdev, offset, size)) {
+ sector_t start_sector = sector + (offset >> 9);
return blkdev_issue_zeroout(bdev, start_sector,
- length >> 9, GFP_NOFS, true);
+ size >> 9, GFP_NOFS, 0);
} else {
- if (dax_map_atomic(bdev, &dax) < 0)
- return PTR_ERR(dax.addr);
- clear_pmem(dax.addr + offset, length);
- dax_unmap_atomic(bdev, &dax);
+ pgoff_t pgoff;
+ long rc, id;
+ void *kaddr;
+ pfn_t pfn;
+
+ rc = bdev_dax_pgoff(bdev, sector, size, &pgoff);
+ if (rc)
+ return rc;
+
+ id = dax_read_lock();
+ rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr,
+ &pfn);
+ if (rc < 0) {
+ dax_read_unlock(id);
+ return rc;
+ }
+ clear_pmem(kaddr + offset, size);
+ dax_read_unlock(id);
}
return 0;
}
@@ -1011,9 +1020,12 @@ static loff_t
dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
struct iomap *iomap)
{
+ struct block_device *bdev = iomap->bdev;
+ struct dax_device *dax_dev = iomap->dax_dev;
struct iov_iter *iter = data;
loff_t end = pos + length, done = 0;
ssize_t ret = 0;
+ int id;
if (iov_iter_rw(iter) == READ) {
end = min(end, i_size_read(inode));
@@ -1038,34 +1050,42 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
(end - 1) >> PAGE_SHIFT);
}
+ id = dax_read_lock();
while (pos < end) {
unsigned offset = pos & (PAGE_SIZE - 1);
- struct blk_dax_ctl dax = { 0 };
+ const size_t size = ALIGN(length + offset, PAGE_SIZE);
+ const sector_t sector = dax_iomap_sector(iomap, pos);
ssize_t map_len;
+ pgoff_t pgoff;
+ void *kaddr;
+ pfn_t pfn;
if (fatal_signal_pending(current)) {
ret = -EINTR;
break;
}
- dax.sector = dax_iomap_sector(iomap, pos);
- dax.size = (length + offset + PAGE_SIZE - 1) & PAGE_MASK;
- map_len = dax_map_atomic(iomap->bdev, &dax);
+ ret = bdev_dax_pgoff(bdev, sector, size, &pgoff);
+ if (ret)
+ break;
+
+ map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
+ &kaddr, &pfn);
if (map_len < 0) {
ret = map_len;
break;
}
- dax.addr += offset;
+ map_len = PFN_PHYS(map_len);
+ kaddr += offset;
map_len -= offset;
if (map_len > end - pos)
map_len = end - pos;
if (iov_iter_rw(iter) == WRITE)
- map_len = copy_from_iter_pmem(dax.addr, map_len, iter);
+ map_len = copy_from_iter_pmem(kaddr, map_len, iter);
else
- map_len = copy_to_iter(dax.addr, map_len, iter);
- dax_unmap_atomic(iomap->bdev, &dax);
+ map_len = copy_to_iter(kaddr, map_len, iter);
if (map_len <= 0) {
ret = map_len ? map_len : -EFAULT;
break;
@@ -1075,6 +1095,7 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
length -= map_len;
done += map_len;
}
+ dax_read_unlock(id);
return done ? done : ret;
}
@@ -1142,13 +1163,16 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
int vmf_ret = 0;
void *entry;
+ trace_dax_pte_fault(inode, vmf, vmf_ret);
/*
* Check whether offset isn't beyond end of file now. Caller is supposed
* to hold locks serializing us with truncate / punch hole so this is
* a reliable test.
*/
- if (pos >= i_size_read(inode))
- return VM_FAULT_SIGBUS;
+ if (pos >= i_size_read(inode)) {
+ vmf_ret = VM_FAULT_SIGBUS;
+ goto out;
+ }
if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
flags |= IOMAP_WRITE;
@@ -1159,8 +1183,10 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
* that we never have to deal with more than a single extent here.
*/
error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
- if (error)
- return dax_fault_return(error);
+ if (error) {
+ vmf_ret = dax_fault_return(error);
+ goto out;
+ }
if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
vmf_ret = dax_fault_return(-EIO); /* fs corruption? */
goto finish_iomap;
@@ -1181,8 +1207,8 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
clear_user_highpage(vmf->cow_page, vaddr);
break;
case IOMAP_MAPPED:
- error = copy_user_dax(iomap.bdev, sector, PAGE_SIZE,
- vmf->cow_page, vaddr);
+ error = copy_user_dax(iomap.bdev, iomap.dax_dev,
+ sector, PAGE_SIZE, vmf->cow_page, vaddr);
break;
default:
WARN_ON_ONCE(1);
@@ -1207,8 +1233,8 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
mem_cgroup_count_vm_event(vmf->vma->vm_mm, PGMAJFAULT);
major = VM_FAULT_MAJOR;
}
- error = dax_insert_mapping(mapping, iomap.bdev, sector,
- PAGE_SIZE, &entry, vmf->vma, vmf);
+ error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev,
+ sector, PAGE_SIZE, &entry, vmf->vma, vmf);
/* -EBUSY is fine, somebody else faulted on the same PTE */
if (error == -EBUSY)
error = 0;
@@ -1244,6 +1270,8 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
*/
ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
}
+out:
+ trace_dax_pte_fault_done(inode, vmf, vmf_ret);
return vmf_ret;
}
@@ -1258,41 +1286,48 @@ static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
loff_t pos, void **entryp)
{
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
+ const sector_t sector = dax_iomap_sector(iomap, pos);
+ struct dax_device *dax_dev = iomap->dax_dev;
struct block_device *bdev = iomap->bdev;
struct inode *inode = mapping->host;
- struct blk_dax_ctl dax = {
- .sector = dax_iomap_sector(iomap, pos),
- .size = PMD_SIZE,
- };
- long length = dax_map_atomic(bdev, &dax);
- void *ret = NULL;
-
- if (length < 0) /* dax_map_atomic() failed */
+ const size_t size = PMD_SIZE;
+ void *ret = NULL, *kaddr;
+ long length = 0;
+ pgoff_t pgoff;
+ pfn_t pfn;
+ int id;
+
+ if (bdev_dax_pgoff(bdev, sector, size, &pgoff) != 0)
goto fallback;
- if (length < PMD_SIZE)
- goto unmap_fallback;
- if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR)
- goto unmap_fallback;
- if (!pfn_t_devmap(dax.pfn))
- goto unmap_fallback;
-
- dax_unmap_atomic(bdev, &dax);
- ret = dax_insert_mapping_entry(mapping, vmf, *entryp, dax.sector,
+ id = dax_read_lock();
+ length = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn);
+ if (length < 0)
+ goto unlock_fallback;
+ length = PFN_PHYS(length);
+
+ if (length < size)
+ goto unlock_fallback;
+ if (pfn_t_to_pfn(pfn) & PG_PMD_COLOUR)
+ goto unlock_fallback;
+ if (!pfn_t_devmap(pfn))
+ goto unlock_fallback;
+ dax_read_unlock(id);
+
+ ret = dax_insert_mapping_entry(mapping, vmf, *entryp, sector,
RADIX_DAX_PMD);
if (IS_ERR(ret))
goto fallback;
*entryp = ret;
- trace_dax_pmd_insert_mapping(inode, vmf, length, dax.pfn, ret);
+ trace_dax_pmd_insert_mapping(inode, vmf, length, pfn, ret);
return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
- dax.pfn, vmf->flags & FAULT_FLAG_WRITE);
+ pfn, vmf->flags & FAULT_FLAG_WRITE);
- unmap_fallback:
- dax_unmap_atomic(bdev, &dax);
+unlock_fallback:
+ dax_read_unlock(id);
fallback:
- trace_dax_pmd_insert_mapping_fallback(inode, vmf, length,
- dax.pfn, ret);
+ trace_dax_pmd_insert_mapping_fallback(inode, vmf, length, pfn, ret);
return VM_FAULT_FALLBACK;
}
diff --git a/fs/dcache.c b/fs/dcache.c
index 95d71eda8142..cddf39777835 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -419,6 +419,8 @@ static void dentry_lru_add(struct dentry *dentry)
{
if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST)))
d_lru_add(dentry);
+ else if (unlikely(!(dentry->d_flags & DCACHE_REFERENCED)))
+ dentry->d_flags |= DCACHE_REFERENCED;
}
/**
@@ -779,8 +781,6 @@ repeat:
goto kill_it;
}
- if (!(dentry->d_flags & DCACHE_REFERENCED))
- dentry->d_flags |= DCACHE_REFERENCED;
dentry_lru_add(dentry);
dentry->d_lockref.count--;
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 7fd4ec4bb214..e892ae7d89f8 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -199,7 +199,7 @@ static const struct dentry_operations debugfs_dops = {
static int debug_fill_super(struct super_block *sb, void *data, int silent)
{
- static struct tree_descr debug_files[] = {{""}};
+ static const struct tree_descr debug_files[] = {{""}};
struct debugfs_fs_info *fsi;
int err;
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 95c1c8d34539..9c351bf757b2 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -349,7 +349,6 @@ struct ecryptfs_mount_crypt_stat {
struct ecryptfs_sb_info {
struct super_block *wsi_sb;
struct ecryptfs_mount_crypt_stat mount_crypt_stat;
- struct backing_dev_info bdi;
};
/* file private data. */
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 151872dcc1f4..9014479d0160 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -519,12 +519,11 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
goto out;
}
- rc = bdi_setup_and_register(&sbi->bdi, "ecryptfs");
+ rc = super_setup_bdi(s);
if (rc)
goto out1;
ecryptfs_set_superblock_private(s, sbi);
- s->s_bdi = &sbi->bdi;
/* ->kill_sb() will take care of sbi after that point */
sbi = NULL;
@@ -633,7 +632,6 @@ static void ecryptfs_kill_block_super(struct super_block *sb)
if (!sb_info)
return;
ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat);
- bdi_destroy(&sb_info->bdi);
kmem_cache_free(ecryptfs_sb_info_cache, sb_info);
}
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 341251421ced..5420767c9b68 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -42,6 +42,7 @@
#include <linux/seq_file.h>
#include <linux/compat.h>
#include <linux/rculist.h>
+#include <net/busy_poll.h>
/*
* LOCKING:
@@ -224,6 +225,11 @@ struct eventpoll {
/* used to optimize loop detection check */
int visited;
struct list_head visited_list_link;
+
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ /* used to track busy poll napi_id */
+ unsigned int napi_id;
+#endif
};
/* Wait structure used by the poll hooks */
@@ -384,6 +390,77 @@ static inline int ep_events_available(struct eventpoll *ep)
return !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR;
}
+#ifdef CONFIG_NET_RX_BUSY_POLL
+static bool ep_busy_loop_end(void *p, unsigned long start_time)
+{
+ struct eventpoll *ep = p;
+
+ return ep_events_available(ep) || busy_loop_timeout(start_time);
+}
+#endif /* CONFIG_NET_RX_BUSY_POLL */
+
+/*
+ * Busy poll if globally on and supporting sockets found && no events,
+ * busy loop will return if need_resched or ep_events_available.
+ *
+ * we must do our busy polling with irqs enabled
+ */
+static void ep_busy_loop(struct eventpoll *ep, int nonblock)
+{
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ unsigned int napi_id = READ_ONCE(ep->napi_id);
+
+ if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on())
+ napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep);
+#endif
+}
+
+static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep)
+{
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ if (ep->napi_id)
+ ep->napi_id = 0;
+#endif
+}
+
+/*
+ * Set epoll busy poll NAPI ID from sk.
+ */
+static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
+{
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ struct eventpoll *ep;
+ unsigned int napi_id;
+ struct socket *sock;
+ struct sock *sk;
+ int err;
+
+ if (!net_busy_loop_on())
+ return;
+
+ sock = sock_from_file(epi->ffd.file, &err);
+ if (!sock)
+ return;
+
+ sk = sock->sk;
+ if (!sk)
+ return;
+
+ napi_id = READ_ONCE(sk->sk_napi_id);
+ ep = epi->ep;
+
+ /* Non-NAPI IDs can be rejected
+ * or
+ * Nothing to do if we already have this ID
+ */
+ if (napi_id < MIN_NAPI_ID || napi_id == ep->napi_id)
+ return;
+
+ /* record NAPI ID for use in next busy poll */
+ ep->napi_id = napi_id;
+#endif
+}
+
/**
* ep_call_nested - Perform a bound (possibly) nested call, by checking
* that the recursion limit is not exceeded, and that
@@ -1022,6 +1099,8 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
spin_lock_irqsave(&ep->lock, flags);
+ ep_set_busy_poll_napi_id(epi);
+
/*
* If the event mask does not contain any poll(2) event, we consider the
* descriptor to be disabled. This condition is likely the effect of the
@@ -1363,6 +1442,9 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
/* We have to drop the new item inside our item list to keep track of it */
spin_lock_irqsave(&ep->lock, flags);
+ /* record NAPI ID of new item if present */
+ ep_set_busy_poll_napi_id(epi);
+
/* If the file is already "ready" we drop it inside the ready list */
if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
@@ -1637,10 +1719,21 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
}
fetch_events:
+
+ if (!ep_events_available(ep))
+ ep_busy_loop(ep, timed_out);
+
spin_lock_irqsave(&ep->lock, flags);
if (!ep_events_available(ep)) {
/*
+ * Busy poll timed out. Drop NAPI ID for now, we can add
+ * it back in when we have moved a socket with a valid NAPI
+ * ID onto the ready list.
+ */
+ ep_reset_busy_poll_napi_id(ep);
+
+ /*
* We don't have any available event to return to the caller.
* We need to sleep here, and we will be wake up by
* ep_poll_callback() when events will become available.
diff --git a/fs/exec.c b/fs/exec.c
index 65145a3df065..72934df68471 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1320,6 +1320,7 @@ void setup_new_exec(struct linux_binprm * bprm)
else
set_dumpable(current->mm, suid_dumpable);
+ arch_setup_new_exec();
perf_event_exec();
__set_task_comm(current, kbasename(bprm->filename), true);
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
index 42f9a0a0c4ca..8eeb694332fe 100644
--- a/fs/exofs/dir.c
+++ b/fs/exofs/dir.c
@@ -405,8 +405,7 @@ int exofs_set_link(struct inode *dir, struct exofs_dir_entry *de,
int err;
lock_page(page);
- err = exofs_write_begin(NULL, page->mapping, pos, len,
- AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
+ err = exofs_write_begin(NULL, page->mapping, pos, len, 0, &page, NULL);
if (err)
EXOFS_ERR("exofs_set_link: exofs_write_begin FAILED => %d\n",
err);
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index 2e86086bc940..5dc392404559 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -64,7 +64,6 @@ struct exofs_dev {
* our extension to the in-memory superblock
*/
struct exofs_sb_info {
- struct backing_dev_info bdi; /* register our bdi with VFS */
struct exofs_sb_stats s_ess; /* Written often, pre-allocate*/
int s_timeout; /* timeout for OSD operations */
uint64_t s_nextid; /* highest object ID used */
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 1076a4233b39..819624cfc8da 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -464,7 +464,6 @@ static void exofs_put_super(struct super_block *sb)
sbi->one_comp.obj.partition);
exofs_sysfs_sb_del(sbi);
- bdi_destroy(&sbi->bdi);
exofs_free_sbi(sbi);
sb->s_fs_info = NULL;
}
@@ -809,8 +808,12 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
__sbi_read_stats(sbi);
/* set up operation vectors */
- sbi->bdi.ra_pages = __ra_pages(&sbi->layout);
- sb->s_bdi = &sbi->bdi;
+ ret = super_setup_bdi(sb);
+ if (ret) {
+ EXOFS_DBGMSG("Failed to super_setup_bdi\n");
+ goto free_sbi;
+ }
+ sb->s_bdi->ra_pages = __ra_pages(&sbi->layout);
sb->s_fs_info = sbi;
sb->s_op = &exofs_sops;
sb->s_export_op = &exofs_export_ops;
@@ -836,14 +839,6 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
goto free_sbi;
}
- ret = bdi_setup_and_register(&sbi->bdi, "exofs");
- if (ret) {
- EXOFS_DBGMSG("Failed to bdi_setup_and_register\n");
- dput(sb->s_root);
- sb->s_root = NULL;
- goto free_sbi;
- }
-
exofs_sysfs_dbg_print();
_exofs_print_device("Mounting", opts->dev_name,
ore_comp_dev(&sbi->oc, 0),
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 5e64de9c5093..03f5ce1d3dbe 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -779,7 +779,6 @@ extern void ext2_evict_inode(struct inode *);
extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int);
extern int ext2_setattr (struct dentry *, struct iattr *);
extern void ext2_set_inode_flags(struct inode *inode);
-extern void ext2_get_inode_flags(struct ext2_inode_info *);
extern int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len);
@@ -796,7 +795,8 @@ void ext2_error(struct super_block *, const char *, const char *, ...);
extern __printf(3, 4)
void ext2_msg(struct super_block *, const char *, const char *, ...);
extern void ext2_update_dynamic_rev (struct super_block *sb);
-extern void ext2_write_super (struct super_block *);
+extern void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es,
+ int wait);
/*
* Inodes and files operations
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 128cce540645..26d77f9f8c12 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -799,6 +799,7 @@ int ext2_get_block(struct inode *inode, sector_t iblock,
static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
unsigned flags, struct iomap *iomap)
{
+ struct block_device *bdev;
unsigned int blkbits = inode->i_blkbits;
unsigned long first_block = offset >> blkbits;
unsigned long max_blocks = (length + (1 << blkbits) - 1) >> blkbits;
@@ -812,8 +813,13 @@ static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
return ret;
iomap->flags = 0;
- iomap->bdev = inode->i_sb->s_bdev;
+ bdev = inode->i_sb->s_bdev;
+ iomap->bdev = bdev;
iomap->offset = (u64)first_block << blkbits;
+ if (blk_queue_dax(bdev->bd_queue))
+ iomap->dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
+ else
+ iomap->dax_dev = NULL;
if (ret == 0) {
iomap->type = IOMAP_HOLE;
@@ -835,6 +841,7 @@ static int
ext2_iomap_end(struct inode *inode, loff_t offset, loff_t length,
ssize_t written, unsigned flags, struct iomap *iomap)
{
+ put_dax(iomap->dax_dev);
if (iomap->type == IOMAP_MAPPED &&
written < length &&
(flags & IOMAP_WRITE))
@@ -1384,25 +1391,6 @@ void ext2_set_inode_flags(struct inode *inode)
inode->i_flags |= S_DAX;
}
-/* Propagate flags from i_flags to EXT2_I(inode)->i_flags */
-void ext2_get_inode_flags(struct ext2_inode_info *ei)
-{
- unsigned int flags = ei->vfs_inode.i_flags;
-
- ei->i_flags &= ~(EXT2_SYNC_FL|EXT2_APPEND_FL|
- EXT2_IMMUTABLE_FL|EXT2_NOATIME_FL|EXT2_DIRSYNC_FL);
- if (flags & S_SYNC)
- ei->i_flags |= EXT2_SYNC_FL;
- if (flags & S_APPEND)
- ei->i_flags |= EXT2_APPEND_FL;
- if (flags & S_IMMUTABLE)
- ei->i_flags |= EXT2_IMMUTABLE_FL;
- if (flags & S_NOATIME)
- ei->i_flags |= EXT2_NOATIME_FL;
- if (flags & S_DIRSYNC)
- ei->i_flags |= EXT2_DIRSYNC_FL;
-}
-
struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
{
struct ext2_inode_info *ei;
@@ -1563,7 +1551,6 @@ static int __ext2_write_inode(struct inode *inode, int do_sync)
if (ei->i_state & EXT2_STATE_NEW)
memset(raw_inode, 0, EXT2_SB(sb)->s_inode_size);
- ext2_get_inode_flags(ei);
raw_inode->i_mode = cpu_to_le16(inode->i_mode);
if (!(test_opt(sb, NO_UID32))) {
raw_inode->i_uid_low = cpu_to_le16(low_16_bits(uid));
@@ -1615,7 +1602,7 @@ static int __ext2_write_inode(struct inode *inode, int do_sync)
EXT2_SET_RO_COMPAT_FEATURE(sb,
EXT2_FEATURE_RO_COMPAT_LARGE_FILE);
spin_unlock(&EXT2_SB(sb)->s_lock);
- ext2_write_super(sb);
+ ext2_sync_super(sb, EXT2_SB(sb)->s_es, 1);
}
}
}
diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c
index 191e02b28ce8..087f122cca42 100644
--- a/fs/ext2/ioctl.c
+++ b/fs/ext2/ioctl.c
@@ -29,7 +29,6 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
switch (cmd) {
case EXT2_IOC_GETFLAGS:
- ext2_get_inode_flags(ei);
flags = ei->i_flags & EXT2_FL_USER_VISIBLE;
return put_user(flags, (int __user *) arg);
case EXT2_IOC_SETFLAGS: {
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 9e25a71fe1a2..8ac673c71a36 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -36,8 +36,7 @@
#include "xattr.h"
#include "acl.h"
-static void ext2_sync_super(struct super_block *sb,
- struct ext2_super_block *es, int wait);
+static void ext2_write_super(struct super_block *sb);
static int ext2_remount (struct super_block * sb, int * flags, char * data);
static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf);
static int ext2_sync_fs(struct super_block *sb, int wait);
@@ -123,13 +122,29 @@ void ext2_update_dynamic_rev(struct super_block *sb)
*/
}
+#ifdef CONFIG_QUOTA
+static int ext2_quota_off(struct super_block *sb, int type);
+
+static void ext2_quota_off_umount(struct super_block *sb)
+{
+ int type;
+
+ for (type = 0; type < MAXQUOTAS; type++)
+ ext2_quota_off(sb, type);
+}
+#else
+static inline void ext2_quota_off_umount(struct super_block *sb)
+{
+}
+#endif
+
static void ext2_put_super (struct super_block * sb)
{
int db_count;
int i;
struct ext2_sb_info *sbi = EXT2_SB(sb);
- dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
+ ext2_quota_off_umount(sb);
if (sbi->s_mb_cache) {
ext2_xattr_destroy_cache(sbi->s_mb_cache);
@@ -314,10 +329,23 @@ static int ext2_show_options(struct seq_file *seq, struct dentry *root)
#ifdef CONFIG_QUOTA
static ssize_t ext2_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off);
static ssize_t ext2_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off);
+static int ext2_quota_on(struct super_block *sb, int type, int format_id,
+ const struct path *path);
static struct dquot **ext2_get_dquots(struct inode *inode)
{
return EXT2_I(inode)->i_dquot;
}
+
+static const struct quotactl_ops ext2_quotactl_ops = {
+ .quota_on = ext2_quota_on,
+ .quota_off = ext2_quota_off,
+ .quota_sync = dquot_quota_sync,
+ .get_state = dquot_get_state,
+ .set_info = dquot_set_dqinfo,
+ .get_dqblk = dquot_get_dqblk,
+ .set_dqblk = dquot_set_dqblk,
+ .get_nextdqblk = dquot_get_next_dqblk,
+};
#endif
static const struct super_operations ext2_sops = {
@@ -1117,7 +1145,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
#ifdef CONFIG_QUOTA
sb->dq_op = &dquot_operations;
- sb->s_qcop = &dquot_quotactl_ops;
+ sb->s_qcop = &ext2_quotactl_ops;
sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
#endif
@@ -1194,8 +1222,8 @@ static void ext2_clear_super_error(struct super_block *sb)
}
}
-static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es,
- int wait)
+void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es,
+ int wait)
{
ext2_clear_super_error(sb);
spin_lock(&EXT2_SB(sb)->s_lock);
@@ -1270,7 +1298,7 @@ static int ext2_unfreeze(struct super_block *sb)
return 0;
}
-void ext2_write_super(struct super_block *sb)
+static void ext2_write_super(struct super_block *sb)
{
if (!(sb->s_flags & MS_RDONLY))
ext2_sync_fs(sb, 1);
@@ -1548,6 +1576,51 @@ out:
return len - towrite;
}
+static int ext2_quota_on(struct super_block *sb, int type, int format_id,
+ const struct path *path)
+{
+ int err;
+ struct inode *inode;
+
+ err = dquot_quota_on(sb, type, format_id, path);
+ if (err)
+ return err;
+
+ inode = d_inode(path->dentry);
+ inode_lock(inode);
+ EXT2_I(inode)->i_flags |= EXT2_NOATIME_FL | EXT2_IMMUTABLE_FL;
+ inode_set_flags(inode, S_NOATIME | S_IMMUTABLE,
+ S_NOATIME | S_IMMUTABLE);
+ inode_unlock(inode);
+ mark_inode_dirty(inode);
+
+ return 0;
+}
+
+static int ext2_quota_off(struct super_block *sb, int type)
+{
+ struct inode *inode = sb_dqopt(sb)->files[type];
+ int err;
+
+ if (!inode || !igrab(inode))
+ goto out;
+
+ err = dquot_quota_off(sb, type);
+ if (err)
+ goto out_put;
+
+ inode_lock(inode);
+ EXT2_I(inode)->i_flags &= ~(EXT2_NOATIME_FL | EXT2_IMMUTABLE_FL);
+ inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE);
+ inode_unlock(inode);
+ mark_inode_dirty(inode);
+out_put:
+ iput(inode);
+ return err;
+out:
+ return dquot_quota_off(sb, type);
+}
+
#endif
static struct file_system_type ext2_fs_type = {
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index 354103f3490c..d9beca1653c5 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -4,11 +4,11 @@
obj-$(CONFIG_EXT4_FS) += ext4.o
-ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
- ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
- ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
- mmp.o indirect.o extents_status.o xattr.o xattr_user.o \
- xattr_trusted.o inline.o readpage.o sysfs.o
+ext4-y := balloc.o bitmap.o block_validity.o dir.o ext4_jbd2.o extents.o \
+ extents_status.o file.o fsmap.o fsync.o hash.o ialloc.o \
+ indirect.o inline.o inode.o ioctl.o mballoc.o migrate.o \
+ mmp.o move_extent.o namei.o page-io.o readpage.o resize.o \
+ super.o symlink.o sysfs.o xattr.o xattr_trusted.o xattr_user.o
ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index fb69ee2388db..8e8046104f4d 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2356,17 +2356,16 @@ extern int ext4_find_dest_de(struct inode *dir, struct inode *inode,
void *buf, int buf_size,
struct ext4_filename *fname,
struct ext4_dir_entry_2 **dest_de);
-int ext4_insert_dentry(struct inode *dir,
- struct inode *inode,
- struct ext4_dir_entry_2 *de,
- int buf_size,
- struct ext4_filename *fname);
+void ext4_insert_dentry(struct inode *inode,
+ struct ext4_dir_entry_2 *de,
+ int buf_size,
+ struct ext4_filename *fname);
static inline void ext4_update_dx_flag(struct inode *inode)
{
if (!ext4_has_feature_dir_index(inode->i_sb))
ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
}
-static unsigned char ext4_filetype_table[] = {
+static const unsigned char ext4_filetype_table[] = {
DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
};
@@ -2477,7 +2476,6 @@ extern int ext4_truncate(struct inode *);
extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length);
extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
extern void ext4_set_inode_flags(struct inode *);
-extern void ext4_get_inode_flags(struct ext4_inode_info *);
extern int ext4_alloc_da_blocks(struct inode *inode);
extern void ext4_set_aops(struct inode *inode);
extern int ext4_writepage_trans_blocks(struct inode *);
@@ -3051,7 +3049,7 @@ extern int ext4_handle_dirty_dirent_node(handle_t *handle,
struct inode *inode,
struct buffer_head *bh);
#define S_SHIFT 12
-static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = {
+static const unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = {
[S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE,
[S_IFDIR >> S_SHIFT] = EXT4_FT_DIR,
[S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV,
diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c
new file mode 100644
index 000000000000..b19436098837
--- /dev/null
+++ b/fs/ext4/fsmap.c
@@ -0,0 +1,722 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "ext4.h"
+#include <linux/fsmap.h>
+#include "fsmap.h"
+#include "mballoc.h"
+#include <linux/sort.h>
+#include <linux/list_sort.h>
+#include <trace/events/ext4.h>
+
+/* Convert an ext4_fsmap to an fsmap. */
+void ext4_fsmap_from_internal(struct super_block *sb, struct fsmap *dest,
+ struct ext4_fsmap *src)
+{
+ dest->fmr_device = src->fmr_device;
+ dest->fmr_flags = src->fmr_flags;
+ dest->fmr_physical = src->fmr_physical << sb->s_blocksize_bits;
+ dest->fmr_owner = src->fmr_owner;
+ dest->fmr_offset = 0;
+ dest->fmr_length = src->fmr_length << sb->s_blocksize_bits;
+ dest->fmr_reserved[0] = 0;
+ dest->fmr_reserved[1] = 0;
+ dest->fmr_reserved[2] = 0;
+}
+
+/* Convert an fsmap to an ext4_fsmap. */
+void ext4_fsmap_to_internal(struct super_block *sb, struct ext4_fsmap *dest,
+ struct fsmap *src)
+{
+ dest->fmr_device = src->fmr_device;
+ dest->fmr_flags = src->fmr_flags;
+ dest->fmr_physical = src->fmr_physical >> sb->s_blocksize_bits;
+ dest->fmr_owner = src->fmr_owner;
+ dest->fmr_length = src->fmr_length >> sb->s_blocksize_bits;
+}
+
+/* getfsmap query state */
+struct ext4_getfsmap_info {
+ struct ext4_fsmap_head *gfi_head;
+ ext4_fsmap_format_t gfi_formatter; /* formatting fn */
+ void *gfi_format_arg;/* format buffer */
+ ext4_fsblk_t gfi_next_fsblk; /* next fsblock we expect */
+ u32 gfi_dev; /* device id */
+ ext4_group_t gfi_agno; /* bg number, if applicable */
+ struct ext4_fsmap gfi_low; /* low rmap key */
+ struct ext4_fsmap gfi_high; /* high rmap key */
+ struct ext4_fsmap gfi_lastfree; /* free ext at end of last bg */
+ struct list_head gfi_meta_list; /* fixed metadata list */
+ bool gfi_last; /* last extent? */
+};
+
+/* Associate a device with a getfsmap handler. */
+struct ext4_getfsmap_dev {
+ int (*gfd_fn)(struct super_block *sb,
+ struct ext4_fsmap *keys,
+ struct ext4_getfsmap_info *info);
+ u32 gfd_dev;
+};
+
+/* Compare two getfsmap device handlers. */
+static int ext4_getfsmap_dev_compare(const void *p1, const void *p2)
+{
+ const struct ext4_getfsmap_dev *d1 = p1;
+ const struct ext4_getfsmap_dev *d2 = p2;
+
+ return d1->gfd_dev - d2->gfd_dev;
+}
+
+/* Compare a record against our starting point */
+static bool ext4_getfsmap_rec_before_low_key(struct ext4_getfsmap_info *info,
+ struct ext4_fsmap *rec)
+{
+ return rec->fmr_physical < info->gfi_low.fmr_physical;
+}
+
+/*
+ * Format a reverse mapping for getfsmap, having translated rm_startblock
+ * into the appropriate daddr units.
+ */
+static int ext4_getfsmap_helper(struct super_block *sb,
+ struct ext4_getfsmap_info *info,
+ struct ext4_fsmap *rec)
+{
+ struct ext4_fsmap fmr;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ ext4_fsblk_t rec_fsblk = rec->fmr_physical;
+ ext4_group_t agno;
+ ext4_grpblk_t cno;
+ int error;
+
+ if (fatal_signal_pending(current))
+ return -EINTR;
+
+ /*
+ * Filter out records that start before our startpoint, if the
+ * caller requested that.
+ */
+ if (ext4_getfsmap_rec_before_low_key(info, rec)) {
+ rec_fsblk += rec->fmr_length;
+ if (info->gfi_next_fsblk < rec_fsblk)
+ info->gfi_next_fsblk = rec_fsblk;
+ return EXT4_QUERY_RANGE_CONTINUE;
+ }
+
+ /* Are we just counting mappings? */
+ if (info->gfi_head->fmh_count == 0) {
+ if (rec_fsblk > info->gfi_next_fsblk)
+ info->gfi_head->fmh_entries++;
+
+ if (info->gfi_last)
+ return EXT4_QUERY_RANGE_CONTINUE;
+
+ info->gfi_head->fmh_entries++;
+
+ rec_fsblk += rec->fmr_length;
+ if (info->gfi_next_fsblk < rec_fsblk)
+ info->gfi_next_fsblk = rec_fsblk;
+ return EXT4_QUERY_RANGE_CONTINUE;
+ }
+
+ /*
+ * If the record starts past the last physical block we saw,
+ * then we've found a gap. Report the gap as being owned by
+ * whatever the caller specified is the missing owner.
+ */
+ if (rec_fsblk > info->gfi_next_fsblk) {
+ if (info->gfi_head->fmh_entries >= info->gfi_head->fmh_count)
+ return EXT4_QUERY_RANGE_ABORT;
+
+ ext4_get_group_no_and_offset(sb, info->gfi_next_fsblk,
+ &agno, &cno);
+ trace_ext4_fsmap_mapping(sb, info->gfi_dev, agno,
+ EXT4_C2B(sbi, cno),
+ rec_fsblk - info->gfi_next_fsblk,
+ EXT4_FMR_OWN_UNKNOWN);
+
+ fmr.fmr_device = info->gfi_dev;
+ fmr.fmr_physical = info->gfi_next_fsblk;
+ fmr.fmr_owner = EXT4_FMR_OWN_UNKNOWN;
+ fmr.fmr_length = rec_fsblk - info->gfi_next_fsblk;
+ fmr.fmr_flags = FMR_OF_SPECIAL_OWNER;
+ error = info->gfi_formatter(&fmr, info->gfi_format_arg);
+ if (error)
+ return error;
+ info->gfi_head->fmh_entries++;
+ }
+
+ if (info->gfi_last)
+ goto out;
+
+ /* Fill out the extent we found */
+ if (info->gfi_head->fmh_entries >= info->gfi_head->fmh_count)
+ return EXT4_QUERY_RANGE_ABORT;
+
+ ext4_get_group_no_and_offset(sb, rec_fsblk, &agno, &cno);
+ trace_ext4_fsmap_mapping(sb, info->gfi_dev, agno, EXT4_C2B(sbi, cno),
+ rec->fmr_length, rec->fmr_owner);
+
+ fmr.fmr_device = info->gfi_dev;
+ fmr.fmr_physical = rec_fsblk;
+ fmr.fmr_owner = rec->fmr_owner;
+ fmr.fmr_flags = FMR_OF_SPECIAL_OWNER;
+ fmr.fmr_length = rec->fmr_length;
+ error = info->gfi_formatter(&fmr, info->gfi_format_arg);
+ if (error)
+ return error;
+ info->gfi_head->fmh_entries++;
+
+out:
+ rec_fsblk += rec->fmr_length;
+ if (info->gfi_next_fsblk < rec_fsblk)
+ info->gfi_next_fsblk = rec_fsblk;
+ return EXT4_QUERY_RANGE_CONTINUE;
+}
+
+static inline ext4_fsblk_t ext4_fsmap_next_pblk(struct ext4_fsmap *fmr)
+{
+ return fmr->fmr_physical + fmr->fmr_length;
+}
+
+/* Transform a blockgroup's free record into a fsmap */
+static int ext4_getfsmap_datadev_helper(struct super_block *sb,
+ ext4_group_t agno, ext4_grpblk_t start,
+ ext4_grpblk_t len, void *priv)
+{
+ struct ext4_fsmap irec;
+ struct ext4_getfsmap_info *info = priv;
+ struct ext4_fsmap *p;
+ struct ext4_fsmap *tmp;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ ext4_fsblk_t fsb;
+ ext4_fsblk_t fslen;
+ int error;
+
+ fsb = (EXT4_C2B(sbi, start) + ext4_group_first_block_no(sb, agno));
+ fslen = EXT4_C2B(sbi, len);
+
+ /* If the retained free extent record is set... */
+ if (info->gfi_lastfree.fmr_owner) {
+ /* ...and abuts this one, lengthen it and return. */
+ if (ext4_fsmap_next_pblk(&info->gfi_lastfree) == fsb) {
+ info->gfi_lastfree.fmr_length += fslen;
+ return 0;
+ }
+
+ /*
+ * There's a gap between the two free extents; emit the
+ * retained extent prior to merging the meta_list.
+ */
+ error = ext4_getfsmap_helper(sb, info, &info->gfi_lastfree);
+ if (error)
+ return error;
+ info->gfi_lastfree.fmr_owner = 0;
+ }
+
+ /* Merge in any relevant extents from the meta_list */
+ list_for_each_entry_safe(p, tmp, &info->gfi_meta_list, fmr_list) {
+ if (p->fmr_physical + p->fmr_length <= info->gfi_next_fsblk) {
+ list_del(&p->fmr_list);
+ kfree(p);
+ } else if (p->fmr_physical < fsb) {
+ error = ext4_getfsmap_helper(sb, info, p);
+ if (error)
+ return error;
+
+ list_del(&p->fmr_list);
+ kfree(p);
+ }
+ }
+
+ irec.fmr_device = 0;
+ irec.fmr_physical = fsb;
+ irec.fmr_length = fslen;
+ irec.fmr_owner = EXT4_FMR_OWN_FREE;
+ irec.fmr_flags = 0;
+
+ /* If this is a free extent at the end of a bg, buffer it. */
+ if (ext4_fsmap_next_pblk(&irec) ==
+ ext4_group_first_block_no(sb, agno + 1)) {
+ info->gfi_lastfree = irec;
+ return 0;
+ }
+
+ /* Otherwise, emit it */
+ return ext4_getfsmap_helper(sb, info, &irec);
+}
+
+/* Execute a getfsmap query against the log device. */
+static int ext4_getfsmap_logdev(struct super_block *sb, struct ext4_fsmap *keys,
+ struct ext4_getfsmap_info *info)
+{
+ journal_t *journal = EXT4_SB(sb)->s_journal;
+ struct ext4_fsmap irec;
+
+ /* Set up search keys */
+ info->gfi_low = keys[0];
+ info->gfi_low.fmr_length = 0;
+
+ memset(&info->gfi_high, 0xFF, sizeof(info->gfi_high));
+
+ trace_ext4_fsmap_low_key(sb, info->gfi_dev, 0,
+ info->gfi_low.fmr_physical,
+ info->gfi_low.fmr_length,
+ info->gfi_low.fmr_owner);
+
+ trace_ext4_fsmap_high_key(sb, info->gfi_dev, 0,
+ info->gfi_high.fmr_physical,
+ info->gfi_high.fmr_length,
+ info->gfi_high.fmr_owner);
+
+ if (keys[0].fmr_physical > 0)
+ return 0;
+
+ /* Fabricate an rmap entry for the external log device. */
+ irec.fmr_physical = journal->j_blk_offset;
+ irec.fmr_length = journal->j_maxlen;
+ irec.fmr_owner = EXT4_FMR_OWN_LOG;
+ irec.fmr_flags = 0;
+
+ return ext4_getfsmap_helper(sb, info, &irec);
+}
+
+/* Helper to fill out an ext4_fsmap. */
+static inline int ext4_getfsmap_fill(struct list_head *meta_list,
+ ext4_fsblk_t fsb, ext4_fsblk_t len,
+ uint64_t owner)
+{
+ struct ext4_fsmap *fsm;
+
+ fsm = kmalloc(sizeof(*fsm), GFP_NOFS);
+ if (!fsm)
+ return -ENOMEM;
+ fsm->fmr_device = 0;
+ fsm->fmr_flags = 0;
+ fsm->fmr_physical = fsb;
+ fsm->fmr_owner = owner;
+ fsm->fmr_length = len;
+ list_add_tail(&fsm->fmr_list, meta_list);
+
+ return 0;
+}
+
+/*
+ * This function returns the number of file system metadata blocks at
+ * the beginning of a block group, including the reserved gdt blocks.
+ */
+static unsigned int ext4_getfsmap_find_sb(struct super_block *sb,
+ ext4_group_t agno,
+ struct list_head *meta_list)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ ext4_fsblk_t fsb = ext4_group_first_block_no(sb, agno);
+ ext4_fsblk_t len;
+ unsigned long first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg);
+ unsigned long metagroup = agno / EXT4_DESC_PER_BLOCK(sb);
+ int error;
+
+ /* Record the superblock. */
+ if (ext4_bg_has_super(sb, agno)) {
+ error = ext4_getfsmap_fill(meta_list, fsb, 1, EXT4_FMR_OWN_FS);
+ if (error)
+ return error;
+ fsb++;
+ }
+
+ /* Record the group descriptors. */
+ len = ext4_bg_num_gdb(sb, agno);
+ if (!len)
+ return 0;
+ error = ext4_getfsmap_fill(meta_list, fsb, len,
+ EXT4_FMR_OWN_GDT);
+ if (error)
+ return error;
+ fsb += len;
+
+ /* Reserved GDT blocks */
+ if (!ext4_has_feature_meta_bg(sb) || metagroup < first_meta_bg) {
+ len = le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks);
+ error = ext4_getfsmap_fill(meta_list, fsb, len,
+ EXT4_FMR_OWN_RESV_GDT);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
+/* Compare two fsmap items. */
+static int ext4_getfsmap_compare(void *priv,
+ struct list_head *a,
+ struct list_head *b)
+{
+ struct ext4_fsmap *fa;
+ struct ext4_fsmap *fb;
+
+ fa = container_of(a, struct ext4_fsmap, fmr_list);
+ fb = container_of(b, struct ext4_fsmap, fmr_list);
+ if (fa->fmr_physical < fb->fmr_physical)
+ return -1;
+ else if (fa->fmr_physical > fb->fmr_physical)
+ return 1;
+ return 0;
+}
+
+/* Merge adjacent extents of fixed metadata. */
+static void ext4_getfsmap_merge_fixed_metadata(struct list_head *meta_list)
+{
+ struct ext4_fsmap *p;
+ struct ext4_fsmap *prev = NULL;
+ struct ext4_fsmap *tmp;
+
+ list_for_each_entry_safe(p, tmp, meta_list, fmr_list) {
+ if (!prev) {
+ prev = p;
+ continue;
+ }
+
+ if (prev->fmr_owner == p->fmr_owner &&
+ prev->fmr_physical + prev->fmr_length == p->fmr_physical) {
+ prev->fmr_length += p->fmr_length;
+ list_del(&p->fmr_list);
+ kfree(p);
+ } else
+ prev = p;
+ }
+}
+
+/* Free a list of fixed metadata. */
+static void ext4_getfsmap_free_fixed_metadata(struct list_head *meta_list)
+{
+ struct ext4_fsmap *p;
+ struct ext4_fsmap *tmp;
+
+ list_for_each_entry_safe(p, tmp, meta_list, fmr_list) {
+ list_del(&p->fmr_list);
+ kfree(p);
+ }
+}
+
+/* Find all the fixed metadata in the filesystem. */
+int ext4_getfsmap_find_fixed_metadata(struct super_block *sb,
+ struct list_head *meta_list)
+{
+ struct ext4_group_desc *gdp;
+ ext4_group_t agno;
+ int error;
+
+ INIT_LIST_HEAD(meta_list);
+
+ /* Collect everything. */
+ for (agno = 0; agno < EXT4_SB(sb)->s_groups_count; agno++) {
+ gdp = ext4_get_group_desc(sb, agno, NULL);
+ if (!gdp) {
+ error = -EFSCORRUPTED;
+ goto err;
+ }
+
+ /* Superblock & GDT */
+ error = ext4_getfsmap_find_sb(sb, agno, meta_list);
+ if (error)
+ goto err;
+
+ /* Block bitmap */
+ error = ext4_getfsmap_fill(meta_list,
+ ext4_block_bitmap(sb, gdp), 1,
+ EXT4_FMR_OWN_BLKBM);
+ if (error)
+ goto err;
+
+ /* Inode bitmap */
+ error = ext4_getfsmap_fill(meta_list,
+ ext4_inode_bitmap(sb, gdp), 1,
+ EXT4_FMR_OWN_INOBM);
+ if (error)
+ goto err;
+
+ /* Inodes */
+ error = ext4_getfsmap_fill(meta_list,
+ ext4_inode_table(sb, gdp),
+ EXT4_SB(sb)->s_itb_per_group,
+ EXT4_FMR_OWN_INODES);
+ if (error)
+ goto err;
+ }
+
+ /* Sort the list */
+ list_sort(NULL, meta_list, ext4_getfsmap_compare);
+
+ /* Merge adjacent extents */
+ ext4_getfsmap_merge_fixed_metadata(meta_list);
+
+ return 0;
+err:
+ ext4_getfsmap_free_fixed_metadata(meta_list);
+ return error;
+}
+
+/* Execute a getfsmap query against the buddy bitmaps */
+static int ext4_getfsmap_datadev(struct super_block *sb,
+ struct ext4_fsmap *keys,
+ struct ext4_getfsmap_info *info)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ ext4_fsblk_t start_fsb;
+ ext4_fsblk_t end_fsb;
+ ext4_fsblk_t eofs;
+ ext4_group_t start_ag;
+ ext4_group_t end_ag;
+ ext4_grpblk_t first_cluster;
+ ext4_grpblk_t last_cluster;
+ int error = 0;
+
+ eofs = ext4_blocks_count(sbi->s_es);
+ if (keys[0].fmr_physical >= eofs)
+ return 0;
+ if (keys[1].fmr_physical >= eofs)
+ keys[1].fmr_physical = eofs - 1;
+ start_fsb = keys[0].fmr_physical;
+ end_fsb = keys[1].fmr_physical;
+
+ /* Determine first and last group to examine based on start and end */
+ ext4_get_group_no_and_offset(sb, start_fsb, &start_ag, &first_cluster);
+ ext4_get_group_no_and_offset(sb, end_fsb, &end_ag, &last_cluster);
+
+ /*
+ * Convert the fsmap low/high keys to bg based keys. Initialize
+ * low to the fsmap low key and max out the high key to the end
+ * of the bg.
+ */
+ info->gfi_low = keys[0];
+ info->gfi_low.fmr_physical = EXT4_C2B(sbi, first_cluster);
+ info->gfi_low.fmr_length = 0;
+
+ memset(&info->gfi_high, 0xFF, sizeof(info->gfi_high));
+
+ /* Assemble a list of all the fixed-location metadata. */
+ error = ext4_getfsmap_find_fixed_metadata(sb, &info->gfi_meta_list);
+ if (error)
+ goto err;
+
+ /* Query each bg */
+ for (info->gfi_agno = start_ag;
+ info->gfi_agno <= end_ag;
+ info->gfi_agno++) {
+ /*
+ * Set the bg high key from the fsmap high key if this
+ * is the last bg that we're querying.
+ */
+ if (info->gfi_agno == end_ag) {
+ info->gfi_high = keys[1];
+ info->gfi_high.fmr_physical = EXT4_C2B(sbi,
+ last_cluster);
+ info->gfi_high.fmr_length = 0;
+ }
+
+ trace_ext4_fsmap_low_key(sb, info->gfi_dev, info->gfi_agno,
+ info->gfi_low.fmr_physical,
+ info->gfi_low.fmr_length,
+ info->gfi_low.fmr_owner);
+
+ trace_ext4_fsmap_high_key(sb, info->gfi_dev, info->gfi_agno,
+ info->gfi_high.fmr_physical,
+ info->gfi_high.fmr_length,
+ info->gfi_high.fmr_owner);
+
+ error = ext4_mballoc_query_range(sb, info->gfi_agno,
+ EXT4_B2C(sbi, info->gfi_low.fmr_physical),
+ EXT4_B2C(sbi, info->gfi_high.fmr_physical),
+ ext4_getfsmap_datadev_helper, info);
+ if (error)
+ goto err;
+
+ /*
+ * Set the bg low key to the start of the bg prior to
+ * moving on to the next bg.
+ */
+ if (info->gfi_agno == start_ag)
+ memset(&info->gfi_low, 0, sizeof(info->gfi_low));
+ }
+
+ /* Do we have a retained free extent? */
+ if (info->gfi_lastfree.fmr_owner) {
+ error = ext4_getfsmap_helper(sb, info, &info->gfi_lastfree);
+ if (error)
+ goto err;
+ }
+
+ /* Report any gaps at the end of the bg */
+ info->gfi_last = true;
+ error = ext4_getfsmap_datadev_helper(sb, end_ag, last_cluster, 0, info);
+ if (error)
+ goto err;
+
+err:
+ ext4_getfsmap_free_fixed_metadata(&info->gfi_meta_list);
+ return error;
+}
+
+/* Do we recognize the device? */
+static bool ext4_getfsmap_is_valid_device(struct super_block *sb,
+ struct ext4_fsmap *fm)
+{
+ if (fm->fmr_device == 0 || fm->fmr_device == UINT_MAX ||
+ fm->fmr_device == new_encode_dev(sb->s_bdev->bd_dev))
+ return true;
+ if (EXT4_SB(sb)->journal_bdev &&
+ fm->fmr_device == new_encode_dev(EXT4_SB(sb)->journal_bdev->bd_dev))
+ return true;
+ return false;
+}
+
+/* Ensure that the low key is less than the high key. */
+static bool ext4_getfsmap_check_keys(struct ext4_fsmap *low_key,
+ struct ext4_fsmap *high_key)
+{
+ if (low_key->fmr_device > high_key->fmr_device)
+ return false;
+ if (low_key->fmr_device < high_key->fmr_device)
+ return true;
+
+ if (low_key->fmr_physical > high_key->fmr_physical)
+ return false;
+ if (low_key->fmr_physical < high_key->fmr_physical)
+ return true;
+
+ if (low_key->fmr_owner > high_key->fmr_owner)
+ return false;
+ if (low_key->fmr_owner < high_key->fmr_owner)
+ return true;
+
+ return false;
+}
+
+#define EXT4_GETFSMAP_DEVS 2
+/*
+ * Get filesystem's extents as described in head, and format for
+ * output. Calls formatter to fill the user's buffer until all
+ * extents are mapped, until the passed-in head->fmh_count slots have
+ * been filled, or until the formatter short-circuits the loop, if it
+ * is tracking filled-in extents on its own.
+ *
+ * Key to Confusion
+ * ----------------
+ * There are multiple levels of keys and counters at work here:
+ * _fsmap_head.fmh_keys -- low and high fsmap keys passed in;
+ * these reflect fs-wide block addrs.
+ * dkeys -- fmh_keys used to query each device;
+ * these are fmh_keys but w/ the low key
+ * bumped up by fmr_length.
+ * _getfsmap_info.gfi_next_fsblk-- next fs block we expect to see; this
+ * is how we detect gaps in the fsmap
+ * records and report them.
+ * _getfsmap_info.gfi_low/high -- per-bg low/high keys computed from
+ * dkeys; used to query the free space.
+ */
+int ext4_getfsmap(struct super_block *sb, struct ext4_fsmap_head *head,
+ ext4_fsmap_format_t formatter, void *arg)
+{
+ struct ext4_fsmap dkeys[2]; /* per-dev keys */
+ struct ext4_getfsmap_dev handlers[EXT4_GETFSMAP_DEVS];
+ struct ext4_getfsmap_info info = {0};
+ int i;
+ int error = 0;
+
+ if (head->fmh_iflags & ~FMH_IF_VALID)
+ return -EINVAL;
+ if (!ext4_getfsmap_is_valid_device(sb, &head->fmh_keys[0]) ||
+ !ext4_getfsmap_is_valid_device(sb, &head->fmh_keys[1]))
+ return -EINVAL;
+
+ head->fmh_entries = 0;
+
+ /* Set up our device handlers. */
+ memset(handlers, 0, sizeof(handlers));
+ handlers[0].gfd_dev = new_encode_dev(sb->s_bdev->bd_dev);
+ handlers[0].gfd_fn = ext4_getfsmap_datadev;
+ if (EXT4_SB(sb)->journal_bdev) {
+ handlers[1].gfd_dev = new_encode_dev(
+ EXT4_SB(sb)->journal_bdev->bd_dev);
+ handlers[1].gfd_fn = ext4_getfsmap_logdev;
+ }
+
+ sort(handlers, EXT4_GETFSMAP_DEVS, sizeof(struct ext4_getfsmap_dev),
+ ext4_getfsmap_dev_compare, NULL);
+
+ /*
+ * To continue where we left off, we allow userspace to use the
+ * last mapping from a previous call as the low key of the next.
+ * This is identified by a non-zero length in the low key. We
+ * have to increment the low key in this scenario to ensure we
+ * don't return the same mapping again, and instead return the
+ * very next mapping.
+ *
+ * Bump the physical offset as there can be no other mapping for
+ * the same physical block range.
+ */
+ dkeys[0] = head->fmh_keys[0];
+ dkeys[0].fmr_physical += dkeys[0].fmr_length;
+ dkeys[0].fmr_owner = 0;
+ dkeys[0].fmr_length = 0;
+ memset(&dkeys[1], 0xFF, sizeof(struct ext4_fsmap));
+
+ if (!ext4_getfsmap_check_keys(dkeys, &head->fmh_keys[1]))
+ return -EINVAL;
+
+ info.gfi_next_fsblk = head->fmh_keys[0].fmr_physical +
+ head->fmh_keys[0].fmr_length;
+ info.gfi_formatter = formatter;
+ info.gfi_format_arg = arg;
+ info.gfi_head = head;
+
+ /* For each device we support... */
+ for (i = 0; i < EXT4_GETFSMAP_DEVS; i++) {
+ /* Is this device within the range the user asked for? */
+ if (!handlers[i].gfd_fn)
+ continue;
+ if (head->fmh_keys[0].fmr_device > handlers[i].gfd_dev)
+ continue;
+ if (head->fmh_keys[1].fmr_device < handlers[i].gfd_dev)
+ break;
+
+ /*
+ * If this device number matches the high key, we have
+ * to pass the high key to the handler to limit the
+ * query results. If the device number exceeds the
+ * low key, zero out the low key so that we get
+ * everything from the beginning.
+ */
+ if (handlers[i].gfd_dev == head->fmh_keys[1].fmr_device)
+ dkeys[1] = head->fmh_keys[1];
+ if (handlers[i].gfd_dev > head->fmh_keys[0].fmr_device)
+ memset(&dkeys[0], 0, sizeof(struct ext4_fsmap));
+
+ info.gfi_dev = handlers[i].gfd_dev;
+ info.gfi_last = false;
+ info.gfi_agno = -1;
+ error = handlers[i].gfd_fn(sb, dkeys, &info);
+ if (error)
+ break;
+ info.gfi_next_fsblk = 0;
+ }
+
+ head->fmh_oflags = FMH_OF_DEV_T;
+ return error;
+}
diff --git a/fs/ext4/fsmap.h b/fs/ext4/fsmap.h
new file mode 100644
index 000000000000..9a2cd367cc66
--- /dev/null
+++ b/fs/ext4/fsmap.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#ifndef __EXT4_FSMAP_H__
+#define __EXT4_FSMAP_H__
+
+struct fsmap;
+
+/* internal fsmap representation */
+struct ext4_fsmap {
+ struct list_head fmr_list;
+ dev_t fmr_device; /* device id */
+ uint32_t fmr_flags; /* mapping flags */
+ uint64_t fmr_physical; /* device offset of segment */
+ uint64_t fmr_owner; /* owner id */
+ uint64_t fmr_length; /* length of segment, blocks */
+};
+
+struct ext4_fsmap_head {
+ uint32_t fmh_iflags; /* control flags */
+ uint32_t fmh_oflags; /* output flags */
+ unsigned int fmh_count; /* # of entries in array incl. input */
+ unsigned int fmh_entries; /* # of entries filled in (output). */
+
+ struct ext4_fsmap fmh_keys[2]; /* low and high keys */
+};
+
+void ext4_fsmap_from_internal(struct super_block *sb, struct fsmap *dest,
+ struct ext4_fsmap *src);
+void ext4_fsmap_to_internal(struct super_block *sb, struct ext4_fsmap *dest,
+ struct fsmap *src);
+
+/* fsmap to userspace formatter - copy to user & advance pointer */
+typedef int (*ext4_fsmap_format_t)(struct ext4_fsmap *, void *);
+
+int ext4_getfsmap(struct super_block *sb, struct ext4_fsmap_head *head,
+ ext4_fsmap_format_t formatter, void *arg);
+
+#define EXT4_QUERY_RANGE_ABORT 1
+#define EXT4_QUERY_RANGE_CONTINUE 0
+
+/* fmr_owner special values for FS_IOC_GETFSMAP; some share w/ XFS */
+#define EXT4_FMR_OWN_FREE FMR_OWN_FREE /* free space */
+#define EXT4_FMR_OWN_UNKNOWN FMR_OWN_UNKNOWN /* unknown owner */
+#define EXT4_FMR_OWN_FS FMR_OWNER('X', 1) /* static fs metadata */
+#define EXT4_FMR_OWN_LOG FMR_OWNER('X', 2) /* journalling log */
+#define EXT4_FMR_OWN_INODES FMR_OWNER('X', 5) /* inodes */
+#define EXT4_FMR_OWN_GDT FMR_OWNER('f', 1) /* group descriptors */
+#define EXT4_FMR_OWN_RESV_GDT FMR_OWNER('f', 2) /* reserved gdt blocks */
+#define EXT4_FMR_OWN_BLKBM FMR_OWNER('f', 3) /* inode bitmap */
+#define EXT4_FMR_OWN_INOBM FMR_OWNER('f', 4) /* block bitmap */
+
+#endif /* __EXT4_FSMAP_H__ */
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 17bc043308f3..98ac2f1f23b3 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1098,6 +1098,17 @@ got:
if (err)
goto fail_drop;
+ /*
+ * Since the encryption xattr will always be unique, create it first so
+ * that it's less likely to end up in an external xattr block and
+ * prevent its deduplication.
+ */
+ if (encrypt) {
+ err = fscrypt_inherit_context(dir, inode, handle, true);
+ if (err)
+ goto fail_free_drop;
+ }
+
err = ext4_init_acl(handle, inode, dir);
if (err)
goto fail_free_drop;
@@ -1119,12 +1130,6 @@ got:
ei->i_datasync_tid = handle->h_transaction->t_tid;
}
- if (encrypt) {
- err = fscrypt_inherit_context(dir, inode, handle, true);
- if (err)
- goto fail_free_drop;
- }
-
err = ext4_mark_inode_dirty(handle, inode);
if (err) {
ext4_std_error(sb, err);
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 375fb1c05d49..d5dea4c293ef 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -1034,7 +1034,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle,
err = ext4_journal_get_write_access(handle, iloc->bh);
if (err)
return err;
- ext4_insert_dentry(dir, inode, de, inline_size, fname);
+ ext4_insert_dentry(inode, de, inline_size, fname);
ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index b9ffa9f4191f..5834c4d76be8 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1643,6 +1643,7 @@ struct mpage_da_data {
*/
struct ext4_map_blocks map;
struct ext4_io_submit io_submit; /* IO submission data */
+ unsigned int do_map:1;
};
static void mpage_release_unused_pages(struct mpage_da_data *mpd,
@@ -2179,6 +2180,9 @@ static bool mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk,
/* First block in the extent? */
if (map->m_len == 0) {
+ /* We cannot map unless handle is started... */
+ if (!mpd->do_map)
+ return false;
map->m_lblk = lblk;
map->m_len = 1;
map->m_flags = bh->b_state & BH_FLAGS;
@@ -2231,6 +2235,9 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd,
/* Found extent to map? */
if (mpd->map.m_len)
return 0;
+ /* Buffer needs mapping and handle is not started? */
+ if (!mpd->do_map)
+ return 0;
/* Everything mapped so far and we hit EOF */
break;
}
@@ -2747,6 +2754,29 @@ retry:
tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page);
done = false;
blk_start_plug(&plug);
+
+ /*
+ * First writeback pages that don't need mapping - we can avoid
+ * starting a transaction unnecessarily and also avoid being blocked
+ * in the block layer on device congestion while having transaction
+ * started.
+ */
+ mpd.do_map = 0;
+ mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
+ if (!mpd.io_submit.io_end) {
+ ret = -ENOMEM;
+ goto unplug;
+ }
+ ret = mpage_prepare_extent_to_map(&mpd);
+ /* Submit prepared bio */
+ ext4_io_submit(&mpd.io_submit);
+ ext4_put_io_end_defer(mpd.io_submit.io_end);
+ mpd.io_submit.io_end = NULL;
+ /* Unlock pages we didn't use */
+ mpage_release_unused_pages(&mpd, false);
+ if (ret < 0)
+ goto unplug;
+
while (!done && mpd.first_page <= mpd.last_page) {
/* For each extent of pages we use new io_end */
mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
@@ -2775,8 +2805,10 @@ retry:
wbc->nr_to_write, inode->i_ino, ret);
/* Release allocated io_end */
ext4_put_io_end(mpd.io_submit.io_end);
+ mpd.io_submit.io_end = NULL;
break;
}
+ mpd.do_map = 1;
trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc);
ret = mpage_prepare_extent_to_map(&mpd);
@@ -2807,6 +2839,7 @@ retry:
if (!ext4_handle_valid(handle) || handle->h_sync == 0) {
ext4_journal_stop(handle);
handle = NULL;
+ mpd.do_map = 0;
}
/* Submit prepared bio */
ext4_io_submit(&mpd.io_submit);
@@ -2824,6 +2857,7 @@ retry:
ext4_journal_stop(handle);
} else
ext4_put_io_end(mpd.io_submit.io_end);
+ mpd.io_submit.io_end = NULL;
if (ret == -ENOSPC && sbi->s_journal) {
/*
@@ -2839,6 +2873,7 @@ retry:
if (ret)
break;
}
+unplug:
blk_finish_plug(&plug);
if (!ret && !cycled && wbc->nr_to_write > 0) {
cycled = 1;
@@ -3305,6 +3340,7 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
unsigned flags, struct iomap *iomap)
{
+ struct block_device *bdev;
unsigned int blkbits = inode->i_blkbits;
unsigned long first_block = offset >> blkbits;
unsigned long last_block = (offset + length - 1) >> blkbits;
@@ -3373,7 +3409,12 @@ retry:
}
iomap->flags = 0;
- iomap->bdev = inode->i_sb->s_bdev;
+ bdev = inode->i_sb->s_bdev;
+ iomap->bdev = bdev;
+ if (blk_queue_dax(bdev->bd_queue))
+ iomap->dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
+ else
+ iomap->dax_dev = NULL;
iomap->offset = first_block << blkbits;
if (ret == 0) {
@@ -3406,6 +3447,7 @@ static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
int blkbits = inode->i_blkbits;
bool truncate = false;
+ put_dax(iomap->dax_dev);
if (!(flags & IOMAP_WRITE) || (flags & IOMAP_FAULT))
return 0;
@@ -4502,31 +4544,6 @@ void ext4_set_inode_flags(struct inode *inode)
S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX);
}
-/* Propagate flags from i_flags to EXT4_I(inode)->i_flags */
-void ext4_get_inode_flags(struct ext4_inode_info *ei)
-{
- unsigned int vfs_fl;
- unsigned long old_fl, new_fl;
-
- do {
- vfs_fl = ei->vfs_inode.i_flags;
- old_fl = ei->i_flags;
- new_fl = old_fl & ~(EXT4_SYNC_FL|EXT4_APPEND_FL|
- EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|
- EXT4_DIRSYNC_FL);
- if (vfs_fl & S_SYNC)
- new_fl |= EXT4_SYNC_FL;
- if (vfs_fl & S_APPEND)
- new_fl |= EXT4_APPEND_FL;
- if (vfs_fl & S_IMMUTABLE)
- new_fl |= EXT4_IMMUTABLE_FL;
- if (vfs_fl & S_NOATIME)
- new_fl |= EXT4_NOATIME_FL;
- if (vfs_fl & S_DIRSYNC)
- new_fl |= EXT4_DIRSYNC_FL;
- } while (cmpxchg(&ei->i_flags, old_fl, new_fl) != old_fl);
-}
-
static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
struct ext4_inode_info *ei)
{
@@ -4963,7 +4980,6 @@ static int ext4_do_update_inode(handle_t *handle,
if (ext4_test_inode_state(inode, EXT4_STATE_NEW))
memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
- ext4_get_inode_flags(ei);
raw_inode->i_mode = cpu_to_le16(inode->i_mode);
i_uid = i_uid_read(inode);
i_gid = i_gid_read(inode);
@@ -5874,6 +5890,11 @@ int ext4_page_mkwrite(struct vm_fault *vmf)
file_update_time(vma->vm_file);
down_read(&EXT4_I(inode)->i_mmap_sem);
+
+ ret = ext4_convert_inline_data(inode);
+ if (ret)
+ goto out_ret;
+
/* Delalloc case is easy... */
if (test_opt(inode->i_sb, DELALLOC) &&
!ext4_should_journal_data(inode) &&
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index a4273ddb9922..0c21e22acd74 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -19,6 +19,9 @@
#include <linux/delay.h>
#include "ext4_jbd2.h"
#include "ext4.h"
+#include <linux/fsmap.h>
+#include "fsmap.h"
+#include <trace/events/ext4.h>
/**
* Swap memory between @a and @b for @len bytes.
@@ -443,7 +446,7 @@ static inline unsigned long ext4_xflags_to_iflags(__u32 xflags)
return iflags;
}
-int ext4_shutdown(struct super_block *sb, unsigned long arg)
+static int ext4_shutdown(struct super_block *sb, unsigned long arg)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
__u32 flags;
@@ -489,6 +492,90 @@ int ext4_shutdown(struct super_block *sb, unsigned long arg)
return 0;
}
+struct getfsmap_info {
+ struct super_block *gi_sb;
+ struct fsmap_head __user *gi_data;
+ unsigned int gi_idx;
+ __u32 gi_last_flags;
+};
+
+static int ext4_getfsmap_format(struct ext4_fsmap *xfm, void *priv)
+{
+ struct getfsmap_info *info = priv;
+ struct fsmap fm;
+
+ trace_ext4_getfsmap_mapping(info->gi_sb, xfm);
+
+ info->gi_last_flags = xfm->fmr_flags;
+ ext4_fsmap_from_internal(info->gi_sb, &fm, xfm);
+ if (copy_to_user(&info->gi_data->fmh_recs[info->gi_idx++], &fm,
+ sizeof(struct fsmap)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int ext4_ioc_getfsmap(struct super_block *sb,
+ struct fsmap_head __user *arg)
+{
+ struct getfsmap_info info = {0};
+ struct ext4_fsmap_head xhead = {0};
+ struct fsmap_head head;
+ bool aborted = false;
+ int error;
+
+ if (copy_from_user(&head, arg, sizeof(struct fsmap_head)))
+ return -EFAULT;
+ if (memchr_inv(head.fmh_reserved, 0, sizeof(head.fmh_reserved)) ||
+ memchr_inv(head.fmh_keys[0].fmr_reserved, 0,
+ sizeof(head.fmh_keys[0].fmr_reserved)) ||
+ memchr_inv(head.fmh_keys[1].fmr_reserved, 0,
+ sizeof(head.fmh_keys[1].fmr_reserved)))
+ return -EINVAL;
+ /*
+ * ext4 doesn't report file extents at all, so the only valid
+ * file offsets are the magic ones (all zeroes or all ones).
+ */
+ if (head.fmh_keys[0].fmr_offset ||
+ (head.fmh_keys[1].fmr_offset != 0 &&
+ head.fmh_keys[1].fmr_offset != -1ULL))
+ return -EINVAL;
+
+ xhead.fmh_iflags = head.fmh_iflags;
+ xhead.fmh_count = head.fmh_count;
+ ext4_fsmap_to_internal(sb, &xhead.fmh_keys[0], &head.fmh_keys[0]);
+ ext4_fsmap_to_internal(sb, &xhead.fmh_keys[1], &head.fmh_keys[1]);
+
+ trace_ext4_getfsmap_low_key(sb, &xhead.fmh_keys[0]);
+ trace_ext4_getfsmap_high_key(sb, &xhead.fmh_keys[1]);
+
+ info.gi_sb = sb;
+ info.gi_data = arg;
+ error = ext4_getfsmap(sb, &xhead, ext4_getfsmap_format, &info);
+ if (error == EXT4_QUERY_RANGE_ABORT) {
+ error = 0;
+ aborted = true;
+ } else if (error)
+ return error;
+
+ /* If we didn't abort, set the "last" flag in the last fmx */
+ if (!aborted && info.gi_idx) {
+ info.gi_last_flags |= FMR_OF_LAST;
+ if (copy_to_user(&info.gi_data->fmh_recs[info.gi_idx - 1].fmr_flags,
+ &info.gi_last_flags,
+ sizeof(info.gi_last_flags)))
+ return -EFAULT;
+ }
+
+ /* copy back header */
+ head.fmh_entries = xhead.fmh_entries;
+ head.fmh_oflags = xhead.fmh_oflags;
+ if (copy_to_user(arg, &head, sizeof(struct fsmap_head)))
+ return -EFAULT;
+
+ return 0;
+}
+
long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
struct inode *inode = file_inode(filp);
@@ -499,8 +586,9 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
ext4_debug("cmd = %u, arg = %lu\n", cmd, arg);
switch (cmd) {
+ case FS_IOC_GETFSMAP:
+ return ext4_ioc_getfsmap(sb, (void __user *)arg);
case EXT4_IOC_GETFLAGS:
- ext4_get_inode_flags(ei);
flags = ei->i_flags & EXT4_FL_USER_VISIBLE;
return put_user(flags, (int __user *) arg);
case EXT4_IOC_SETFLAGS: {
@@ -888,7 +976,6 @@ resizefs_out:
struct fsxattr fa;
memset(&fa, 0, sizeof(struct fsxattr));
- ext4_get_inode_flags(ei);
fa.fsx_xflags = ext4_iflags_to_xflags(ei->i_flags & EXT4_FL_USER_VISIBLE);
if (ext4_has_feature_project(inode->i_sb)) {
@@ -1009,6 +1096,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case EXT4_IOC_GET_ENCRYPTION_PWSALT:
case EXT4_IOC_GET_ENCRYPTION_POLICY:
case EXT4_IOC_SHUTDOWN:
+ case FS_IOC_GETFSMAP:
break;
default:
return -ENOIOCTLCMD;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 354dc1a894c2..5083bce20ac4 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -357,7 +357,7 @@ static struct kmem_cache *ext4_free_data_cachep;
#define NR_GRPINFO_CACHES 8
static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES];
-static const char *ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = {
+static const char * const ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = {
"ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k",
"ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k",
"ext4_groupinfo_64k", "ext4_groupinfo_128k"
@@ -2393,7 +2393,7 @@ int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups)
return 0;
size = roundup_pow_of_two(sizeof(*sbi->s_group_info) * size);
- new_groupinfo = ext4_kvzalloc(size, GFP_KERNEL);
+ new_groupinfo = kvzalloc(size, GFP_KERNEL);
if (!new_groupinfo) {
ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group");
return -ENOMEM;
@@ -5277,3 +5277,52 @@ out:
range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits;
return ret;
}
+
+/* Iterate all the free extents in the group. */
+int
+ext4_mballoc_query_range(
+ struct super_block *sb,
+ ext4_group_t group,
+ ext4_grpblk_t start,
+ ext4_grpblk_t end,
+ ext4_mballoc_query_range_fn formatter,
+ void *priv)
+{
+ void *bitmap;
+ ext4_grpblk_t next;
+ struct ext4_buddy e4b;
+ int error;
+
+ error = ext4_mb_load_buddy(sb, group, &e4b);
+ if (error)
+ return error;
+ bitmap = e4b.bd_bitmap;
+
+ ext4_lock_group(sb, group);
+
+ start = (e4b.bd_info->bb_first_free > start) ?
+ e4b.bd_info->bb_first_free : start;
+ if (end >= EXT4_CLUSTERS_PER_GROUP(sb))
+ end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;
+
+ while (start <= end) {
+ start = mb_find_next_zero_bit(bitmap, end + 1, start);
+ if (start > end)
+ break;
+ next = mb_find_next_bit(bitmap, end + 1, start);
+
+ ext4_unlock_group(sb, group);
+ error = formatter(sb, group, start, next - start, priv);
+ if (error)
+ goto out_unload;
+ ext4_lock_group(sb, group);
+
+ start = next + 1;
+ }
+
+ ext4_unlock_group(sb, group);
+out_unload:
+ ext4_mb_unload_buddy(&e4b);
+
+ return error;
+}
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 1aba469f8220..2bed62084a8c 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -199,4 +199,21 @@ static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
return ext4_group_first_block_no(sb, fex->fe_group) +
(fex->fe_start << EXT4_SB(sb)->s_cluster_bits);
}
+
+typedef int (*ext4_mballoc_query_range_fn)(
+ struct super_block *sb,
+ ext4_group_t agno,
+ ext4_grpblk_t start,
+ ext4_grpblk_t len,
+ void *priv);
+
+int
+ext4_mballoc_query_range(
+ struct super_block *sb,
+ ext4_group_t agno,
+ ext4_grpblk_t start,
+ ext4_grpblk_t end,
+ ext4_mballoc_query_range_fn formatter,
+ void *priv);
+
#endif
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 07e5e1405771..b81f7d46f344 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1237,37 +1237,24 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block)
}
/*
- * NOTE! unlike strncmp, ext4_match returns 1 for success, 0 for failure.
+ * Test whether a directory entry matches the filename being searched for.
*
- * `len <= EXT4_NAME_LEN' is guaranteed by caller.
- * `de != NULL' is guaranteed by caller.
+ * Return: %true if the directory entry matches, otherwise %false.
*/
-static inline int ext4_match(struct ext4_filename *fname,
- struct ext4_dir_entry_2 *de)
+static inline bool ext4_match(const struct ext4_filename *fname,
+ const struct ext4_dir_entry_2 *de)
{
- const void *name = fname_name(fname);
- u32 len = fname_len(fname);
+ struct fscrypt_name f;
if (!de->inode)
- return 0;
+ return false;
+ f.usr_fname = fname->usr_fname;
+ f.disk_name = fname->disk_name;
#ifdef CONFIG_EXT4_FS_ENCRYPTION
- if (unlikely(!name)) {
- if (fname->usr_fname->name[0] == '_') {
- int ret;
- if (de->name_len < 16)
- return 0;
- ret = memcmp(de->name + de->name_len - 16,
- fname->crypto_buf.name + 8, 16);
- return (ret == 0) ? 1 : 0;
- }
- name = fname->crypto_buf.name;
- len = fname->crypto_buf.len;
- }
+ f.crypto_buf = fname->crypto_buf;
#endif
- if (de->name_len != len)
- return 0;
- return (memcmp(de->name, name, len) == 0) ? 1 : 0;
+ return fscrypt_match_name(&f, de->name, de->name_len);
}
/*
@@ -1281,48 +1268,31 @@ int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size,
struct ext4_dir_entry_2 * de;
char * dlimit;
int de_len;
- int res;
de = (struct ext4_dir_entry_2 *)search_buf;
dlimit = search_buf + buf_size;
while ((char *) de < dlimit) {
/* this code is executed quadratically often */
/* do minimal checking `by hand' */
- if ((char *) de + de->name_len <= dlimit) {
- res = ext4_match(fname, de);
- if (res < 0) {
- res = -1;
- goto return_result;
- }
- if (res > 0) {
- /* found a match - just to be sure, do
- * a full check */
- if (ext4_check_dir_entry(dir, NULL, de, bh,
- bh->b_data,
- bh->b_size, offset)) {
- res = -1;
- goto return_result;
- }
- *res_dir = de;
- res = 1;
- goto return_result;
- }
-
+ if ((char *) de + de->name_len <= dlimit &&
+ ext4_match(fname, de)) {
+ /* found a match - just to be sure, do
+ * a full check */
+ if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data,
+ bh->b_size, offset))
+ return -1;
+ *res_dir = de;
+ return 1;
}
/* prevent looping on a bad block */
de_len = ext4_rec_len_from_disk(de->rec_len,
dir->i_sb->s_blocksize);
- if (de_len <= 0) {
- res = -1;
- goto return_result;
- }
+ if (de_len <= 0)
+ return -1;
offset += de_len;
de = (struct ext4_dir_entry_2 *) ((char *) de + de_len);
}
-
- res = 0;
-return_result:
- return res;
+ return 0;
}
static int is_dx_internal_node(struct inode *dir, ext4_lblk_t block,
@@ -1616,16 +1586,9 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
if (!IS_ERR(inode) && ext4_encrypted_inode(dir) &&
(S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) &&
!fscrypt_has_permitted_context(dir, inode)) {
- int nokey = ext4_encrypted_inode(inode) &&
- !fscrypt_has_encryption_key(inode);
- if (nokey) {
- iput(inode);
- return ERR_PTR(-ENOKEY);
- }
ext4_warning(inode->i_sb,
"Inconsistent encryption contexts: %lu/%lu",
- (unsigned long) dir->i_ino,
- (unsigned long) inode->i_ino);
+ dir->i_ino, inode->i_ino);
iput(inode);
return ERR_PTR(-EPERM);
}
@@ -1833,24 +1796,15 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode,
int nlen, rlen;
unsigned int offset = 0;
char *top;
- int res;
de = (struct ext4_dir_entry_2 *)buf;
top = buf + buf_size - reclen;
while ((char *) de <= top) {
if (ext4_check_dir_entry(dir, NULL, de, bh,
- buf, buf_size, offset)) {
- res = -EFSCORRUPTED;
- goto return_result;
- }
- /* Provide crypto context and crypto buffer to ext4 match */
- res = ext4_match(fname, de);
- if (res < 0)
- goto return_result;
- if (res > 0) {
- res = -EEXIST;
- goto return_result;
- }
+ buf, buf_size, offset))
+ return -EFSCORRUPTED;
+ if (ext4_match(fname, de))
+ return -EEXIST;
nlen = EXT4_DIR_REC_LEN(de->name_len);
rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
if ((de->inode ? rlen - nlen : rlen) >= reclen)
@@ -1858,22 +1812,17 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode,
de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
offset += rlen;
}
-
if ((char *) de > top)
- res = -ENOSPC;
- else {
- *dest_de = de;
- res = 0;
- }
-return_result:
- return res;
+ return -ENOSPC;
+
+ *dest_de = de;
+ return 0;
}
-int ext4_insert_dentry(struct inode *dir,
- struct inode *inode,
- struct ext4_dir_entry_2 *de,
- int buf_size,
- struct ext4_filename *fname)
+void ext4_insert_dentry(struct inode *inode,
+ struct ext4_dir_entry_2 *de,
+ int buf_size,
+ struct ext4_filename *fname)
{
int nlen, rlen;
@@ -1892,7 +1841,6 @@ int ext4_insert_dentry(struct inode *dir,
ext4_set_de_type(inode->i_sb, de, inode->i_mode);
de->name_len = fname_len(fname);
memcpy(de->name, fname_name(fname), fname_len(fname));
- return 0;
}
/*
@@ -1928,11 +1876,8 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
return err;
}
- /* By now the buffer is marked for journaling. Due to crypto operations,
- * the following function call may fail */
- err = ext4_insert_dentry(dir, inode, de, blocksize, fname);
- if (err < 0)
- return err;
+ /* By now the buffer is marked for journaling */
+ ext4_insert_dentry(inode, de, blocksize, fname);
/*
* XXX shouldn't update any times until successful
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 208241b06662..1a82138ba739 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -297,8 +297,17 @@ static void ext4_end_bio(struct bio *bio)
{
ext4_io_end_t *io_end = bio->bi_private;
sector_t bi_sector = bio->bi_iter.bi_sector;
+ char b[BDEVNAME_SIZE];
- BUG_ON(!io_end);
+ if (WARN_ONCE(!io_end, "io_end is NULL: %s: sector %Lu len %u err %d\n",
+ bdevname(bio->bi_bdev, b),
+ (long long) bio->bi_iter.bi_sector,
+ (unsigned) bio_sectors(bio),
+ bio->bi_error)) {
+ ext4_finish_bio(bio);
+ bio_put(bio);
+ return;
+ }
bio->bi_end_io = NULL;
if (bio->bi_error) {
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index a9448db1cf7e..c90edf09b0c3 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -49,6 +49,7 @@
#include "xattr.h"
#include "acl.h"
#include "mballoc.h"
+#include "fsmap.h"
#define CREATE_TRACE_POINTS
#include <trace/events/ext4.h>
@@ -839,6 +840,28 @@ static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi)
}
}
+#ifdef CONFIG_QUOTA
+static int ext4_quota_off(struct super_block *sb, int type);
+
+static inline void ext4_quota_off_umount(struct super_block *sb)
+{
+ int type;
+
+ if (ext4_has_feature_quota(sb)) {
+ dquot_disable(sb, -1,
+ DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
+ } else {
+ /* Use our quota_off function to clear inode flags etc. */
+ for (type = 0; type < EXT4_MAXQUOTAS; type++)
+ ext4_quota_off(sb, type);
+ }
+}
+#else
+static inline void ext4_quota_off_umount(struct super_block *sb)
+{
+}
+#endif
+
static void ext4_put_super(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -847,7 +870,7 @@ static void ext4_put_super(struct super_block *sb)
int i, err;
ext4_unregister_li_request(sb);
- dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
+ ext4_quota_off_umount(sb);
flush_workqueue(sbi->rsv_conversion_wq);
destroy_workqueue(sbi->rsv_conversion_wq);
@@ -1208,7 +1231,7 @@ static const struct fscrypt_operations ext4_cryptops = {
#endif
#ifdef CONFIG_QUOTA
-static char *quotatypes[] = INITQFNAMES;
+static const char * const quotatypes[] = INITQFNAMES;
#define QTYPE2NAME(t) (quotatypes[t])
static int ext4_write_dquot(struct dquot *dquot);
@@ -1218,7 +1241,6 @@ static int ext4_mark_dquot_dirty(struct dquot *dquot);
static int ext4_write_info(struct super_block *sb, int type);
static int ext4_quota_on(struct super_block *sb, int type, int format_id,
const struct path *path);
-static int ext4_quota_off(struct super_block *sb, int type);
static int ext4_quota_on_mount(struct super_block *sb, int type);
static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
size_t len, loff_t off);
@@ -1422,7 +1444,8 @@ static ext4_fsblk_t get_sb_block(void **data)
}
#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
-static char deprecated_msg[] = "Mount option \"%s\" will be removed by %s\n"
+static const char deprecated_msg[] =
+ "Mount option \"%s\" will be removed by %s\n"
"Contact linux-ext4@vger.kernel.org if you think we should keep it.\n";
#ifdef CONFIG_QUOTA
@@ -2132,7 +2155,7 @@ int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup)
return 0;
size = roundup_pow_of_two(size * sizeof(struct flex_groups));
- new_groups = ext4_kvzalloc(size, GFP_KERNEL);
+ new_groups = kvzalloc(size, GFP_KERNEL);
if (!new_groups) {
ext4_msg(sb, KERN_ERR, "not enough memory for %d flex groups",
size / (int) sizeof(struct flex_groups));
@@ -3866,7 +3889,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount;
}
}
- sbi->s_group_desc = ext4_kvmalloc(db_count *
+ sbi->s_group_desc = kvmalloc(db_count *
sizeof(struct buffer_head *),
GFP_KERNEL);
if (sbi->s_group_desc == NULL) {
@@ -3877,6 +3900,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
bgl_lock_init(sbi->s_blockgroup_lock);
+ /* Pre-read the descriptors into the buffer cache */
+ for (i = 0; i < db_count; i++) {
+ block = descriptor_loc(sb, logical_sb_block, i);
+ sb_breadahead(sb, block);
+ }
+
for (i = 0; i < db_count; i++) {
block = descriptor_loc(sb, logical_sb_block, i);
sbi->s_group_desc[i] = sb_bread_unmovable(sb, block);
@@ -4629,7 +4658,7 @@ static int ext4_commit_super(struct super_block *sb, int sync)
if (sync) {
unlock_buffer(sbh);
error = __sync_dirty_buffer(sbh,
- test_opt(sb, BARRIER) ? REQ_FUA : REQ_SYNC);
+ REQ_SYNC | (test_opt(sb, BARRIER) ? REQ_FUA : 0));
if (error)
return error;
@@ -5344,11 +5373,33 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
if (err)
return err;
}
+
lockdep_set_quota_inode(path->dentry->d_inode, I_DATA_SEM_QUOTA);
err = dquot_quota_on(sb, type, format_id, path);
- if (err)
+ if (err) {
lockdep_set_quota_inode(path->dentry->d_inode,
I_DATA_SEM_NORMAL);
+ } else {
+ struct inode *inode = d_inode(path->dentry);
+ handle_t *handle;
+
+ /*
+ * Set inode flags to prevent userspace from messing with quota
+ * files. If this fails, we return success anyway since quotas
+ * are already enabled and this is not a hard failure.
+ */
+ inode_lock(inode);
+ handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1);
+ if (IS_ERR(handle))
+ goto unlock_inode;
+ EXT4_I(inode)->i_flags |= EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL;
+ inode_set_flags(inode, S_NOATIME | S_IMMUTABLE,
+ S_NOATIME | S_IMMUTABLE);
+ ext4_mark_inode_dirty(handle, inode);
+ ext4_journal_stop(handle);
+ unlock_inode:
+ inode_unlock(inode);
+ }
return err;
}
@@ -5422,24 +5473,39 @@ static int ext4_quota_off(struct super_block *sb, int type)
{
struct inode *inode = sb_dqopt(sb)->files[type];
handle_t *handle;
+ int err;
/* Force all delayed allocation blocks to be allocated.
* Caller already holds s_umount sem */
if (test_opt(sb, DELALLOC))
sync_filesystem(sb);
- if (!inode)
+ if (!inode || !igrab(inode))
goto out;
- /* Update modification times of quota files when userspace can
- * start looking at them */
+ err = dquot_quota_off(sb, type);
+ if (err)
+ goto out_put;
+
+ inode_lock(inode);
+ /*
+ * Update modification times of quota files when userspace can
+ * start looking at them. If we fail, we return success anyway since
+ * this is not a hard failure and quotas are already disabled.
+ */
handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1);
if (IS_ERR(handle))
- goto out;
+ goto out_unlock;
+ EXT4_I(inode)->i_flags &= ~(EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL);
+ inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE);
inode->i_mtime = inode->i_ctime = current_time(inode);
ext4_mark_inode_dirty(handle, inode);
ext4_journal_stop(handle);
-
+out_unlock:
+ inode_unlock(inode);
+out_put:
+ iput(inode);
+ return err;
out:
return dquot_quota_off(sb, type);
}
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index 42145be5c6b4..d74dc5f81a04 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -34,7 +34,7 @@ typedef enum {
ptr_ext4_super_block_offset,
} attr_ptr_t;
-static const char *proc_dirname = "fs/ext4";
+static const char proc_dirname[] = "fs/ext4";
static struct proc_dir_entry *ext4_proc_root;
struct ext4_attr {
@@ -375,7 +375,7 @@ static const struct file_operations ext4_seq_##name##_fops = { \
PROC_FILE_SHOW_DEFN(es_shrinker_info);
PROC_FILE_SHOW_DEFN(options);
-static struct ext4_proc_files {
+static const struct ext4_proc_files {
const char *name;
const struct file_operations *fops;
} proc_files[] = {
@@ -388,7 +388,7 @@ static struct ext4_proc_files {
int ext4_register_sysfs(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct ext4_proc_files *p;
+ const struct ext4_proc_files *p;
int err;
sbi->s_kobj.kset = &ext4_kset;
@@ -412,7 +412,7 @@ int ext4_register_sysfs(struct super_block *sb)
void ext4_unregister_sysfs(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct ext4_proc_files *p;
+ const struct ext4_proc_files *p;
if (sbi->s_proc) {
for (p = proc_files; p->name; p++)
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 996e7900d4c8..8fb7ce14e6eb 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -78,10 +78,8 @@ static struct buffer_head *ext4_xattr_cache_find(struct inode *,
struct mb_cache_entry **);
static void ext4_xattr_rehash(struct ext4_xattr_header *,
struct ext4_xattr_entry *);
-static int ext4_xattr_list(struct dentry *dentry, char *buffer,
- size_t buffer_size);
-static const struct xattr_handler *ext4_xattr_handler_map[] = {
+static const struct xattr_handler * const ext4_xattr_handler_map[] = {
[EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler,
#ifdef CONFIG_EXT4_FS_POSIX_ACL
[EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler,
@@ -163,20 +161,9 @@ ext4_xattr_handler(int name_index)
return handler;
}
-/*
- * Inode operation listxattr()
- *
- * d_inode(dentry)->i_mutex: don't care
- */
-ssize_t
-ext4_listxattr(struct dentry *dentry, char *buffer, size_t size)
-{
- return ext4_xattr_list(dentry, buffer, size);
-}
-
static int
-ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end,
- void *value_start)
+ext4_xattr_check_entries(struct ext4_xattr_entry *entry, void *end,
+ void *value_start)
{
struct ext4_xattr_entry *e = entry;
@@ -230,8 +217,8 @@ ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh)
return -EFSCORRUPTED;
if (!ext4_xattr_block_csum_verify(inode, bh))
return -EFSBADCRC;
- error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size,
- bh->b_data);
+ error = ext4_xattr_check_entries(BFIRST(bh), bh->b_data + bh->b_size,
+ bh->b_data);
if (!error)
set_buffer_verified(bh);
return error;
@@ -246,7 +233,7 @@ __xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header,
if (end - (void *)header < sizeof(*header) + sizeof(u32) ||
(header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)))
goto errout;
- error = ext4_xattr_check_names(IFIRST(header), end, IFIRST(header));
+ error = ext4_xattr_check_entries(IFIRST(header), end, IFIRST(header));
errout:
if (error)
__ext4_error_inode(inode, function, line, 0,
@@ -257,20 +244,9 @@ errout:
#define xattr_check_inode(inode, header, end) \
__xattr_check_inode((inode), (header), (end), __func__, __LINE__)
-static inline int
-ext4_xattr_check_entry(struct ext4_xattr_entry *entry, size_t size)
-{
- size_t value_size = le32_to_cpu(entry->e_value_size);
-
- if (entry->e_value_block != 0 || value_size > size ||
- le16_to_cpu(entry->e_value_offs) + value_size > size)
- return -EFSCORRUPTED;
- return 0;
-}
-
static int
ext4_xattr_find_entry(struct ext4_xattr_entry **pentry, int name_index,
- const char *name, size_t size, int sorted)
+ const char *name, int sorted)
{
struct ext4_xattr_entry *entry;
size_t name_len;
@@ -290,8 +266,6 @@ ext4_xattr_find_entry(struct ext4_xattr_entry **pentry, int name_index,
break;
}
*pentry = entry;
- if (!cmp && ext4_xattr_check_entry(entry, size))
- return -EFSCORRUPTED;
return cmp ? -ENODATA : 0;
}
@@ -319,7 +293,6 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
ea_bdebug(bh, "b_count=%d, refcount=%d",
atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
if (ext4_xattr_check_block(inode, bh)) {
-bad_block:
EXT4_ERROR_INODE(inode, "bad block %llu",
EXT4_I(inode)->i_file_acl);
error = -EFSCORRUPTED;
@@ -327,9 +300,7 @@ bad_block:
}
ext4_xattr_cache_insert(ext4_mb_cache, bh);
entry = BFIRST(bh);
- error = ext4_xattr_find_entry(&entry, name_index, name, bh->b_size, 1);
- if (error == -EFSCORRUPTED)
- goto bad_block;
+ error = ext4_xattr_find_entry(&entry, name_index, name, 1);
if (error)
goto cleanup;
size = le32_to_cpu(entry->e_value_size);
@@ -366,13 +337,12 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
return error;
raw_inode = ext4_raw_inode(&iloc);
header = IHDR(inode, raw_inode);
- entry = IFIRST(header);
end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
error = xattr_check_inode(inode, header, end);
if (error)
goto cleanup;
- error = ext4_xattr_find_entry(&entry, name_index, name,
- end - (void *)entry, 0);
+ entry = IFIRST(header);
+ error = ext4_xattr_find_entry(&entry, name_index, name, 0);
if (error)
goto cleanup;
size = le32_to_cpu(entry->e_value_size);
@@ -519,7 +489,9 @@ cleanup:
}
/*
- * ext4_xattr_list()
+ * Inode operation listxattr()
+ *
+ * d_inode(dentry)->i_rwsem: don't care
*
* Copy a list of attribute names into the buffer
* provided, or compute the buffer size required.
@@ -528,8 +500,8 @@ cleanup:
* Returns a negative error number on failure, or the number of bytes
* used / required on success.
*/
-static int
-ext4_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
+ssize_t
+ext4_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
{
int ret, ret2;
@@ -804,7 +776,7 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i,
bs->s.end = bs->bh->b_data + bs->bh->b_size;
bs->s.here = bs->s.first;
error = ext4_xattr_find_entry(&bs->s.here, i->name_index,
- i->name, bs->bh->b_size, 1);
+ i->name, 1);
if (error && error != -ENODATA)
goto cleanup;
bs->s.not_found = error;
@@ -1076,8 +1048,7 @@ int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
return error;
/* Find the named attribute. */
error = ext4_xattr_find_entry(&is->s.here, i->name_index,
- i->name, is->s.end -
- (void *)is->s.base, 0);
+ i->name, 0);
if (error && error != -ENODATA)
return error;
is->s.not_found = error;
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 0339daf4ca02..ea9c317b5916 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -275,10 +275,11 @@ static int f2fs_write_meta_pages(struct address_space *mapping,
get_pages(sbi, F2FS_DIRTY_META) < nr_pages_to_skip(sbi, META))
goto skip_write;
- trace_f2fs_writepages(mapping->host, wbc, META);
+ /* if locked failed, cp will flush dirty pages instead */
+ if (!mutex_trylock(&sbi->cp_mutex))
+ goto skip_write;
- /* if mounting is failed, skip writing node pages */
- mutex_lock(&sbi->cp_mutex);
+ trace_f2fs_writepages(mapping->host, wbc, META);
diff = nr_pages_to_write(sbi, META, wbc);
written = sync_meta_pages(sbi, META, wbc->nr_to_write);
mutex_unlock(&sbi->cp_mutex);
@@ -567,7 +568,7 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
if (ni.blk_addr != NULL_ADDR) {
set_sbi_flag(sbi, SBI_NEED_FSCK);
f2fs_msg(sbi->sb, KERN_WARNING,
- "%s: orphan failed (ino=%x), run fsck to fix.",
+ "%s: orphan failed (ino=%x) by kernel, retry mount.",
__func__, ino);
return -EIO;
}
@@ -677,7 +678,7 @@ static int get_checkpoint_version(struct f2fs_sb_info *sbi, block_t cp_addr,
*cp_block = (struct f2fs_checkpoint *)page_address(*cp_page);
crc_offset = le32_to_cpu((*cp_block)->checksum_offset);
- if (crc_offset >= blk_size) {
+ if (crc_offset > (blk_size - sizeof(__le32))) {
f2fs_msg(sbi->sb, KERN_WARNING,
"invalid crc_offset: %zu", crc_offset);
return -EINVAL;
@@ -816,7 +817,9 @@ static void __add_dirty_inode(struct inode *inode, enum inode_type type)
return;
set_inode_flag(inode, flag);
- list_add_tail(&F2FS_I(inode)->dirty_list, &sbi->inode_list[type]);
+ if (!f2fs_is_volatile_file(inode))
+ list_add_tail(&F2FS_I(inode)->dirty_list,
+ &sbi->inode_list[type]);
stat_inc_dirty_inode(sbi, type);
}
@@ -941,6 +944,19 @@ int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi)
return 0;
}
+static void __prepare_cp_block(struct f2fs_sb_info *sbi)
+{
+ struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+ nid_t last_nid = nm_i->next_scan_nid;
+
+ next_free_nid(sbi, &last_nid);
+ ckpt->valid_block_count = cpu_to_le64(valid_user_blocks(sbi));
+ ckpt->valid_node_count = cpu_to_le32(valid_node_count(sbi));
+ ckpt->valid_inode_count = cpu_to_le32(valid_inode_count(sbi));
+ ckpt->next_free_nid = cpu_to_le32(last_nid);
+}
+
/*
* Freeze all the FS-operations for checkpoint.
*/
@@ -964,21 +980,26 @@ retry_flush_dents:
err = sync_dirty_inodes(sbi, DIR_INODE);
if (err)
goto out;
+ cond_resched();
goto retry_flush_dents;
}
+ /*
+ * POR: we should ensure that there are no dirty node pages
+ * until finishing nat/sit flush. inode->i_blocks can be updated.
+ */
+ down_write(&sbi->node_change);
+
if (get_pages(sbi, F2FS_DIRTY_IMETA)) {
+ up_write(&sbi->node_change);
f2fs_unlock_all(sbi);
err = f2fs_sync_inode_meta(sbi);
if (err)
goto out;
+ cond_resched();
goto retry_flush_dents;
}
- /*
- * POR: we should ensure that there are no dirty node pages
- * until finishing nat/sit flush.
- */
retry_flush_nodes:
down_write(&sbi->node_write);
@@ -986,11 +1007,20 @@ retry_flush_nodes:
up_write(&sbi->node_write);
err = sync_node_pages(sbi, &wbc);
if (err) {
+ up_write(&sbi->node_change);
f2fs_unlock_all(sbi);
goto out;
}
+ cond_resched();
goto retry_flush_nodes;
}
+
+ /*
+ * sbi->node_change is used only for AIO write_begin path which produces
+ * dirty node blocks and some checkpoint values by block allocation.
+ */
+ __prepare_cp_block(sbi);
+ up_write(&sbi->node_change);
out:
blk_finish_plug(&plug);
return err;
@@ -1024,16 +1054,20 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc)
spin_lock(&sbi->cp_lock);
- if (cpc->reason == CP_UMOUNT && ckpt->cp_pack_total_block_count >
+ if ((cpc->reason & CP_UMOUNT) &&
+ le32_to_cpu(ckpt->cp_pack_total_block_count) >
sbi->blocks_per_seg - NM_I(sbi)->nat_bits_blocks)
disable_nat_bits(sbi, false);
- if (cpc->reason == CP_UMOUNT)
+ if (cpc->reason & CP_TRIMMED)
+ __set_ckpt_flags(ckpt, CP_TRIMMED_FLAG);
+
+ if (cpc->reason & CP_UMOUNT)
__set_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
else
__clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
- if (cpc->reason == CP_FASTBOOT)
+ if (cpc->reason & CP_FASTBOOT)
__set_ckpt_flags(ckpt, CP_FASTBOOT_FLAG);
else
__clear_ckpt_flags(ckpt, CP_FASTBOOT_FLAG);
@@ -1057,7 +1091,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
struct f2fs_nm_info *nm_i = NM_I(sbi);
unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num;
- nid_t last_nid = nm_i->next_scan_nid;
block_t start_blk;
unsigned int data_sum_blocks, orphan_blocks;
__u32 crc32 = 0;
@@ -1074,14 +1107,11 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
return -EIO;
}
- next_free_nid(sbi, &last_nid);
-
/*
* modify checkpoint
* version number is already updated
*/
ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi));
- ckpt->valid_block_count = cpu_to_le64(valid_user_blocks(sbi));
ckpt->free_segment_count = cpu_to_le32(free_segments(sbi));
for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) {
ckpt->cur_node_segno[i] =
@@ -1100,10 +1130,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
curseg_alloc_type(sbi, i + CURSEG_HOT_DATA);
}
- ckpt->valid_node_count = cpu_to_le32(valid_node_count(sbi));
- ckpt->valid_inode_count = cpu_to_le32(valid_inode_count(sbi));
- ckpt->next_free_nid = cpu_to_le32(last_nid);
-
/* 2 cp + n data seg summary + orphan inode blocks */
data_sum_blocks = npages_for_summary_flush(sbi, false);
spin_lock(&sbi->cp_lock);
@@ -1143,7 +1169,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
/* write nat bits */
if (enabled_nat_bits(sbi, cpc)) {
__u64 cp_ver = cur_cp_version(ckpt);
- unsigned int i;
block_t blk;
cp_ver |= ((__u64)crc32 << 32);
@@ -1250,8 +1275,8 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
mutex_lock(&sbi->cp_mutex);
if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) &&
- (cpc->reason == CP_FASTBOOT || cpc->reason == CP_SYNC ||
- (cpc->reason == CP_DISCARD && !sbi->discard_blks)))
+ ((cpc->reason & CP_FASTBOOT) || (cpc->reason & CP_SYNC) ||
+ ((cpc->reason & CP_DISCARD) && !sbi->discard_blks)))
goto out;
if (unlikely(f2fs_cp_error(sbi))) {
err = -EIO;
@@ -1273,7 +1298,7 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
f2fs_flush_merged_bios(sbi);
/* this is the case of multiple fstrims without any changes */
- if (cpc->reason == CP_DISCARD) {
+ if (cpc->reason & CP_DISCARD) {
if (!exist_trim_candidates(sbi, cpc)) {
unblock_operations(sbi);
goto out;
@@ -1311,7 +1336,7 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
unblock_operations(sbi);
stat_inc_cp_count(sbi->stat_info);
- if (cpc->reason == CP_RECOVERY)
+ if (cpc->reason & CP_RECOVERY)
f2fs_msg(sbi->sb, KERN_NOTICE,
"checkpoint: version = %llx", ckpt_ver);
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 1602b4bccae6..7c0f6bdf817d 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -309,7 +309,7 @@ static void __f2fs_submit_merged_bio(struct f2fs_sb_info *sbi,
if (type >= META_FLUSH) {
io->fio.type = META_FLUSH;
io->fio.op = REQ_OP_WRITE;
- io->fio.op_flags = REQ_META | REQ_PRIO;
+ io->fio.op_flags = REQ_META | REQ_PRIO | REQ_SYNC;
if (!test_opt(sbi, NOBARRIER))
io->fio.op_flags |= REQ_PREFLUSH | REQ_FUA;
}
@@ -341,7 +341,7 @@ void f2fs_flush_merged_bios(struct f2fs_sb_info *sbi)
/*
* Fill the locked page with data located in the block address.
- * Return unlocked page.
+ * A caller needs to unlock the page on failure.
*/
int f2fs_submit_page_bio(struct f2fs_io_info *fio)
{
@@ -362,6 +362,9 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio)
bio_set_op_attrs(bio, fio->op, fio->op_flags);
__submit_bio(fio->sbi, bio, fio->type);
+
+ if (!is_read_io(fio->op))
+ inc_page_count(fio->sbi, WB_DATA_TYPE(fio->page));
return 0;
}
@@ -787,6 +790,21 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from)
return err;
}
+static inline void __do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock)
+{
+ if (flag == F2FS_GET_BLOCK_PRE_AIO) {
+ if (lock)
+ down_read(&sbi->node_change);
+ else
+ up_read(&sbi->node_change);
+ } else {
+ if (lock)
+ f2fs_lock_op(sbi);
+ else
+ f2fs_unlock_op(sbi);
+ }
+}
+
/*
* f2fs_map_blocks() now supported readahead/bmap/rw direct_IO with
* f2fs_map_blocks structure.
@@ -829,7 +847,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
next_dnode:
if (create)
- f2fs_lock_op(sbi);
+ __do_map_lock(sbi, flag, true);
/* When reading holes, we need its node page */
set_new_dnode(&dn, inode, NULL, NULL, 0);
@@ -939,7 +957,7 @@ skip:
f2fs_put_dnode(&dn);
if (create) {
- f2fs_unlock_op(sbi);
+ __do_map_lock(sbi, flag, false);
f2fs_balance_fs(sbi, dn.node_changed);
}
goto next_dnode;
@@ -948,7 +966,7 @@ sync_out:
f2fs_put_dnode(&dn);
unlock_out:
if (create) {
- f2fs_unlock_op(sbi);
+ __do_map_lock(sbi, flag, false);
f2fs_balance_fs(sbi, dn.node_changed);
}
out:
@@ -1151,9 +1169,10 @@ static int f2fs_mpage_readpages(struct address_space *mapping,
for (page_idx = 0; nr_pages; page_idx++, nr_pages--) {
- prefetchw(&page->flags);
if (pages) {
page = list_last_entry(pages, struct page, lru);
+
+ prefetchw(&page->flags);
list_del(&page->lru);
if (add_to_page_cache_lru(page, mapping,
page->index,
@@ -1283,17 +1302,83 @@ static int f2fs_read_data_pages(struct file *file,
return f2fs_mpage_readpages(mapping, pages, NULL, nr_pages);
}
+static int encrypt_one_page(struct f2fs_io_info *fio)
+{
+ struct inode *inode = fio->page->mapping->host;
+ gfp_t gfp_flags = GFP_NOFS;
+
+ if (!f2fs_encrypted_inode(inode) || !S_ISREG(inode->i_mode))
+ return 0;
+
+ /* wait for GCed encrypted page writeback */
+ f2fs_wait_on_encrypted_page_writeback(fio->sbi, fio->old_blkaddr);
+
+retry_encrypt:
+ fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page,
+ PAGE_SIZE, 0, fio->page->index, gfp_flags);
+ if (!IS_ERR(fio->encrypted_page))
+ return 0;
+
+ /* flush pending IOs and wait for a while in the ENOMEM case */
+ if (PTR_ERR(fio->encrypted_page) == -ENOMEM) {
+ f2fs_flush_merged_bios(fio->sbi);
+ congestion_wait(BLK_RW_ASYNC, HZ/50);
+ gfp_flags |= __GFP_NOFAIL;
+ goto retry_encrypt;
+ }
+ return PTR_ERR(fio->encrypted_page);
+}
+
+static inline bool need_inplace_update(struct f2fs_io_info *fio)
+{
+ struct inode *inode = fio->page->mapping->host;
+
+ if (S_ISDIR(inode->i_mode) || f2fs_is_atomic_file(inode))
+ return false;
+ if (is_cold_data(fio->page))
+ return false;
+ if (IS_ATOMIC_WRITTEN_PAGE(fio->page))
+ return false;
+
+ return need_inplace_update_policy(inode, fio);
+}
+
+static inline bool valid_ipu_blkaddr(struct f2fs_io_info *fio)
+{
+ if (fio->old_blkaddr == NEW_ADDR)
+ return false;
+ if (fio->old_blkaddr == NULL_ADDR)
+ return false;
+ return true;
+}
+
int do_write_data_page(struct f2fs_io_info *fio)
{
struct page *page = fio->page;
struct inode *inode = page->mapping->host;
struct dnode_of_data dn;
+ struct extent_info ei = {0,0,0};
+ bool ipu_force = false;
int err = 0;
set_new_dnode(&dn, inode, NULL, NULL, 0);
+ if (need_inplace_update(fio) &&
+ f2fs_lookup_extent_cache(inode, page->index, &ei)) {
+ fio->old_blkaddr = ei.blk + page->index - ei.fofs;
+
+ if (valid_ipu_blkaddr(fio)) {
+ ipu_force = true;
+ fio->need_lock = false;
+ goto got_it;
+ }
+ }
+
+ if (fio->need_lock)
+ f2fs_lock_op(fio->sbi);
+
err = get_dnode_of_data(&dn, page->index, LOOKUP_NODE);
if (err)
- return err;
+ goto out;
fio->old_blkaddr = dn.data_blkaddr;
@@ -1302,31 +1387,10 @@ int do_write_data_page(struct f2fs_io_info *fio)
ClearPageUptodate(page);
goto out_writepage;
}
-
- if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) {
- gfp_t gfp_flags = GFP_NOFS;
-
- /* wait for GCed encrypted page writeback */
- f2fs_wait_on_encrypted_page_writeback(F2FS_I_SB(inode),
- fio->old_blkaddr);
-retry_encrypt:
- fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page,
- PAGE_SIZE, 0,
- fio->page->index,
- gfp_flags);
- if (IS_ERR(fio->encrypted_page)) {
- err = PTR_ERR(fio->encrypted_page);
- if (err == -ENOMEM) {
- /* flush pending ios and wait for a while */
- f2fs_flush_merged_bios(F2FS_I_SB(inode));
- congestion_wait(BLK_RW_ASYNC, HZ/50);
- gfp_flags |= __GFP_NOFAIL;
- err = 0;
- goto retry_encrypt;
- }
- goto out_writepage;
- }
- }
+got_it:
+ err = encrypt_one_page(fio);
+ if (err)
+ goto out_writepage;
set_page_writeback(page);
@@ -1334,22 +1398,27 @@ retry_encrypt:
* If current allocation needs SSR,
* it had better in-place writes for updated data.
*/
- if (unlikely(fio->old_blkaddr != NEW_ADDR &&
- !is_cold_data(page) &&
- !IS_ATOMIC_WRITTEN_PAGE(page) &&
- need_inplace_update(inode))) {
- rewrite_data_page(fio);
+ if (ipu_force || (valid_ipu_blkaddr(fio) && need_inplace_update(fio))) {
+ f2fs_put_dnode(&dn);
+ if (fio->need_lock)
+ f2fs_unlock_op(fio->sbi);
+ err = rewrite_data_page(fio);
+ trace_f2fs_do_write_data_page(fio->page, IPU);
set_inode_flag(inode, FI_UPDATE_WRITE);
- trace_f2fs_do_write_data_page(page, IPU);
- } else {
- write_data_page(&dn, fio);
- trace_f2fs_do_write_data_page(page, OPU);
- set_inode_flag(inode, FI_APPEND_WRITE);
- if (page->index == 0)
- set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN);
+ return err;
}
+
+ /* LFS mode write path */
+ write_data_page(&dn, fio);
+ trace_f2fs_do_write_data_page(page, OPU);
+ set_inode_flag(inode, FI_APPEND_WRITE);
+ if (page->index == 0)
+ set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN);
out_writepage:
f2fs_put_dnode(&dn);
+out:
+ if (fio->need_lock)
+ f2fs_unlock_op(fio->sbi);
return err;
}
@@ -1370,9 +1439,11 @@ static int __write_data_page(struct page *page, bool *submitted,
.type = DATA,
.op = REQ_OP_WRITE,
.op_flags = wbc_to_write_flags(wbc),
+ .old_blkaddr = NULL_ADDR,
.page = page,
.encrypted_page = NULL,
.submitted = false,
+ .need_lock = true,
};
trace_f2fs_writepage(page, DATA);
@@ -1408,6 +1479,7 @@ write:
/* Dentry blocks are controlled by checkpoint */
if (S_ISDIR(inode->i_mode)) {
+ fio.need_lock = false;
err = do_write_data_page(&fio);
goto done;
}
@@ -1416,6 +1488,8 @@ write:
need_balance_fs = true;
else if (has_not_enough_free_secs(sbi, 0, 0))
goto redirty_out;
+ else
+ set_inode_flag(inode, FI_HOT_DATA);
err = -EAGAIN;
if (f2fs_has_inline_data(inode)) {
@@ -1423,12 +1497,12 @@ write:
if (!err)
goto out;
}
- f2fs_lock_op(sbi);
+
if (err == -EAGAIN)
err = do_write_data_page(&fio);
if (F2FS_I(inode)->last_disk_size < psize)
F2FS_I(inode)->last_disk_size = psize;
- f2fs_unlock_op(sbi);
+
done:
if (err && err != -ENOENT)
goto redirty_out;
@@ -1441,12 +1515,14 @@ out:
if (wbc->for_reclaim) {
f2fs_submit_merged_bio_cond(sbi, inode, 0, page->index,
DATA, WRITE);
+ clear_inode_flag(inode, FI_HOT_DATA);
remove_dirty_inode(inode);
submitted = NULL;
}
unlock_page(page);
- f2fs_balance_fs(sbi, need_balance_fs);
+ if (!S_ISDIR(inode->i_mode))
+ f2fs_balance_fs(sbi, need_balance_fs);
if (unlikely(f2fs_cp_error(sbi))) {
f2fs_submit_merged_bio(sbi, DATA, WRITE);
@@ -1495,6 +1571,12 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
pagevec_init(&pvec, 0);
+ if (get_dirty_pages(mapping->host) <=
+ SM_I(F2FS_M_SB(mapping))->min_hot_blocks)
+ set_inode_flag(mapping->host, FI_HOT_DATA);
+ else
+ clear_inode_flag(mapping->host, FI_HOT_DATA);
+
if (wbc->range_cyclic) {
writeback_index = mapping->writeback_index; /* prev offset */
index = writeback_index;
@@ -1580,8 +1662,10 @@ continue_unlock:
last_idx = page->index;
}
- if (--wbc->nr_to_write <= 0 &&
- wbc->sync_mode == WB_SYNC_NONE) {
+ /* give a priority to WB_SYNC threads */
+ if ((atomic_read(&F2FS_M_SB(mapping)->wb_sync_req) ||
+ --wbc->nr_to_write <= 0) &&
+ wbc->sync_mode == WB_SYNC_NONE) {
done = 1;
break;
}
@@ -1637,9 +1721,18 @@ static int f2fs_write_data_pages(struct address_space *mapping,
trace_f2fs_writepages(mapping->host, wbc, DATA);
+ /* to avoid spliting IOs due to mixed WB_SYNC_ALL and WB_SYNC_NONE */
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ atomic_inc(&sbi->wb_sync_req);
+ else if (atomic_read(&sbi->wb_sync_req))
+ goto skip_write;
+
blk_start_plug(&plug);
ret = f2fs_write_cache_pages(mapping, wbc);
blk_finish_plug(&plug);
+
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ atomic_dec(&sbi->wb_sync_req);
/*
* if some pages were truncated, we cannot guarantee its mapping->host
* to detect pending bios.
@@ -1687,7 +1780,7 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi,
if (f2fs_has_inline_data(inode) ||
(pos & PAGE_MASK) >= i_size_read(inode)) {
- f2fs_lock_op(sbi);
+ __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true);
locked = true;
}
restart:
@@ -1723,7 +1816,8 @@ restart:
err = get_dnode_of_data(&dn, index, LOOKUP_NODE);
if (err || dn.data_blkaddr == NULL_ADDR) {
f2fs_put_dnode(&dn);
- f2fs_lock_op(sbi);
+ __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO,
+ true);
locked = true;
goto restart;
}
@@ -1737,7 +1831,7 @@ out:
f2fs_put_dnode(&dn);
unlock_out:
if (locked)
- f2fs_unlock_op(sbi);
+ __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false);
return err;
}
@@ -1951,7 +2045,7 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset,
/* This is atomic written page, keep Private */
if (IS_ATOMIC_WRITTEN_PAGE(page))
- return;
+ return drop_inmem_page(inode, page);
set_page_private(page, 0);
ClearPagePrivate(page);
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index ee2d0a485fc3..87f449845f5f 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -51,15 +51,26 @@ static void update_general_status(struct f2fs_sb_info *sbi)
si->ndirty_all = sbi->ndirty_inode[DIRTY_META];
si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES);
si->aw_cnt = atomic_read(&sbi->aw_cnt);
+ si->vw_cnt = atomic_read(&sbi->vw_cnt);
si->max_aw_cnt = atomic_read(&sbi->max_aw_cnt);
+ si->max_vw_cnt = atomic_read(&sbi->max_vw_cnt);
si->nr_wb_cp_data = get_pages(sbi, F2FS_WB_CP_DATA);
si->nr_wb_data = get_pages(sbi, F2FS_WB_DATA);
- if (SM_I(sbi) && SM_I(sbi)->fcc_info)
- si->nr_flush =
- atomic_read(&SM_I(sbi)->fcc_info->submit_flush);
- if (SM_I(sbi) && SM_I(sbi)->dcc_info)
- si->nr_discard =
- atomic_read(&SM_I(sbi)->dcc_info->submit_discard);
+ if (SM_I(sbi) && SM_I(sbi)->fcc_info) {
+ si->nr_flushed =
+ atomic_read(&SM_I(sbi)->fcc_info->issued_flush);
+ si->nr_flushing =
+ atomic_read(&SM_I(sbi)->fcc_info->issing_flush);
+ }
+ if (SM_I(sbi) && SM_I(sbi)->dcc_info) {
+ si->nr_discarded =
+ atomic_read(&SM_I(sbi)->dcc_info->issued_discard);
+ si->nr_discarding =
+ atomic_read(&SM_I(sbi)->dcc_info->issing_discard);
+ si->nr_discard_cmd =
+ atomic_read(&SM_I(sbi)->dcc_info->discard_cmd_cnt);
+ si->undiscard_blks = SM_I(sbi)->dcc_info->undiscard_blks;
+ }
si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg;
si->rsvd_segs = reserved_segments(sbi);
si->overp_segs = overprovision_segments(sbi);
@@ -86,6 +97,7 @@ static void update_general_status(struct f2fs_sb_info *sbi)
si->sits = MAIN_SEGS(sbi);
si->dirty_sits = SIT_I(sbi)->dirty_sentries;
si->free_nids = NM_I(sbi)->nid_cnt[FREE_NID_LIST];
+ si->avail_nids = NM_I(sbi)->available_nids;
si->alloc_nids = NM_I(sbi)->nid_cnt[ALLOC_NID_LIST];
si->bg_gc = sbi->bg_gc;
si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg)
@@ -99,8 +111,8 @@ static void update_general_status(struct f2fs_sb_info *sbi)
for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_NODE; i++) {
struct curseg_info *curseg = CURSEG_I(sbi, i);
si->curseg[i] = curseg->segno;
- si->cursec[i] = curseg->segno / sbi->segs_per_sec;
- si->curzone[i] = si->cursec[i] / sbi->secs_per_zone;
+ si->cursec[i] = GET_SEC_FROM_SEG(sbi, curseg->segno);
+ si->curzone[i] = GET_ZONE_FROM_SEC(sbi, si->cursec[i]);
}
for (i = 0; i < 2; i++) {
@@ -124,10 +136,10 @@ static void update_sit_info(struct f2fs_sb_info *sbi)
bimodal = 0;
total_vblocks = 0;
- blks_per_sec = sbi->segs_per_sec * sbi->blocks_per_seg;
+ blks_per_sec = BLKS_PER_SEC(sbi);
hblks_per_sec = blks_per_sec / 2;
for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) {
- vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec);
+ vblocks = get_valid_blocks(sbi, segno, true);
dist = abs(vblocks - hblks_per_sec);
bimodal += dist * dist;
@@ -156,7 +168,11 @@ static void update_mem_info(struct f2fs_sb_info *sbi)
if (si->base_mem)
goto get_cache;
- si->base_mem = sizeof(struct f2fs_sb_info) + sbi->sb->s_blocksize;
+ /* build stat */
+ si->base_mem = sizeof(struct f2fs_stat_info);
+
+ /* build superblock */
+ si->base_mem += sizeof(struct f2fs_sb_info) + sbi->sb->s_blocksize;
si->base_mem += 2 * sizeof(struct f2fs_inode_info);
si->base_mem += sizeof(*sbi->ckpt);
si->base_mem += sizeof(struct percpu_counter) * NR_COUNT_TYPE;
@@ -208,8 +224,11 @@ get_cache:
/* build merge flush thread */
if (SM_I(sbi)->fcc_info)
si->cache_mem += sizeof(struct flush_cmd_control);
- if (SM_I(sbi)->dcc_info)
+ if (SM_I(sbi)->dcc_info) {
si->cache_mem += sizeof(struct discard_cmd_control);
+ si->cache_mem += sizeof(struct discard_cmd) *
+ atomic_read(&SM_I(sbi)->dcc_info->discard_cmd_cnt);
+ }
/* free nids */
si->cache_mem += (NM_I(sbi)->nid_cnt[FREE_NID_LIST] +
@@ -330,11 +349,16 @@ static int stat_show(struct seq_file *s, void *v)
seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n",
si->ext_tree, si->zombie_tree, si->ext_node);
seq_puts(s, "\nBalancing F2FS Async:\n");
- seq_printf(s, " - IO (CP: %4d, Data: %4d, Flush: %4d, Discard: %4d)\n",
+ seq_printf(s, " - IO (CP: %4d, Data: %4d, Flush: (%4d %4d), "
+ "Discard: (%4d %4d)) cmd: %4d undiscard:%4u\n",
si->nr_wb_cp_data, si->nr_wb_data,
- si->nr_flush, si->nr_discard);
- seq_printf(s, " - inmem: %4d, atomic IO: %4d (Max. %4d)\n",
- si->inmem_pages, si->aw_cnt, si->max_aw_cnt);
+ si->nr_flushing, si->nr_flushed,
+ si->nr_discarding, si->nr_discarded,
+ si->nr_discard_cmd, si->undiscard_blks);
+ seq_printf(s, " - inmem: %4d, atomic IO: %4d (Max. %4d), "
+ "volatile IO: %4d (Max. %4d)\n",
+ si->inmem_pages, si->aw_cnt, si->max_aw_cnt,
+ si->vw_cnt, si->max_vw_cnt);
seq_printf(s, " - nodes: %4d in %4d\n",
si->ndirty_node, si->node_pages);
seq_printf(s, " - dents: %4d in dirs:%4d (%4d)\n",
@@ -347,8 +371,8 @@ static int stat_show(struct seq_file *s, void *v)
si->ndirty_imeta);
seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n",
si->dirty_nats, si->nats, si->dirty_sits, si->sits);
- seq_printf(s, " - free_nids: %9d, alloc_nids: %9d\n",
- si->free_nids, si->alloc_nids);
+ seq_printf(s, " - free_nids: %9d/%9d\n - alloc_nids: %9d\n",
+ si->free_nids, si->avail_nids, si->alloc_nids);
seq_puts(s, "\nDistribution of User Blocks:");
seq_puts(s, " [ valid | invalid | free ]\n");
seq_puts(s, " [");
@@ -434,7 +458,9 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
atomic_set(&sbi->inplace_count, 0);
atomic_set(&sbi->aw_cnt, 0);
+ atomic_set(&sbi->vw_cnt, 0);
atomic_set(&sbi->max_aw_cnt, 0);
+ atomic_set(&sbi->max_vw_cnt, 0);
mutex_lock(&f2fs_stat_mutex);
list_add_tail(&si->stat_list, &f2fs_stat_list);
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 8d5c62b07b28..94756f55a97e 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -94,7 +94,7 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page,
dentry_blk = (struct f2fs_dentry_block *)kmap(dentry_page);
- make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1);
+ make_dentry_ptr_block(NULL, &d, dentry_blk);
de = find_target_dentry(fname, namehash, max_slots, &d);
if (de)
*res_page = dentry_page;
@@ -111,8 +111,6 @@ struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname,
struct f2fs_dir_entry *de;
unsigned long bit_pos = 0;
int max_len = 0;
- struct fscrypt_str de_name = FSTR_INIT(NULL, 0);
- struct fscrypt_str *name = &fname->disk_name;
if (max_slots)
*max_slots = 0;
@@ -130,17 +128,9 @@ struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname,
continue;
}
- /* encrypted case */
- de_name.name = d->filename[bit_pos];
- de_name.len = le16_to_cpu(de->name_len);
-
- /* show encrypted name */
- if (fname->hash) {
- if (de->hash_code == cpu_to_le32(fname->hash))
- goto found;
- } else if (de_name.len == name->len &&
- de->hash_code == namehash &&
- !memcmp(de_name.name, name->name, name->len))
+ if (de->hash_code == namehash &&
+ fscrypt_match_name(fname, d->filename[bit_pos],
+ le16_to_cpu(de->name_len)))
goto found;
if (max_slots && max_len > *max_slots)
@@ -170,12 +160,7 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir,
struct f2fs_dir_entry *de = NULL;
bool room = false;
int max_slots;
- f2fs_hash_t namehash;
-
- if(fname->hash)
- namehash = cpu_to_le32(fname->hash);
- else
- namehash = f2fs_dentry_hash(&name);
+ f2fs_hash_t namehash = f2fs_dentry_hash(&name, fname);
nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level);
nblock = bucket_blocks(level);
@@ -207,13 +192,9 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir,
f2fs_put_page(dentry_page, 0);
}
- /* This is to increase the speed of f2fs_create */
- if (!de && room) {
- F2FS_I(dir)->task = current;
- if (F2FS_I(dir)->chash != namehash) {
- F2FS_I(dir)->chash = namehash;
- F2FS_I(dir)->clevel = level;
- }
+ if (!de && room && F2FS_I(dir)->chash != namehash) {
+ F2FS_I(dir)->chash = namehash;
+ F2FS_I(dir)->clevel = level;
}
return de;
@@ -254,6 +235,9 @@ struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir,
break;
}
out:
+ /* This is to increase the speed of f2fs_create */
+ if (!de)
+ F2FS_I(dir)->task = current;
return de;
}
@@ -337,24 +321,6 @@ static void init_dent_inode(const struct qstr *name, struct page *ipage)
set_page_dirty(ipage);
}
-int update_dent_inode(struct inode *inode, struct inode *to,
- const struct qstr *name)
-{
- struct page *page;
-
- if (file_enc_name(to))
- return 0;
-
- page = get_node_page(F2FS_I_SB(inode), inode->i_ino);
- if (IS_ERR(page))
- return PTR_ERR(page);
-
- init_dent_inode(name, page);
- f2fs_put_page(page, 1);
-
- return 0;
-}
-
void do_make_empty_dir(struct inode *inode, struct inode *parent,
struct f2fs_dentry_ptr *d)
{
@@ -384,7 +350,7 @@ static int make_empty_dir(struct inode *inode,
dentry_blk = kmap_atomic(dentry_page);
- make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1);
+ make_dentry_ptr_block(NULL, &d, dentry_blk);
do_make_empty_dir(inode, parent, &d);
kunmap_atomic(dentry_blk);
@@ -438,8 +404,11 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir,
set_cold_node(inode, page);
}
- if (new_name)
+ if (new_name) {
init_dent_inode(new_name, page);
+ if (f2fs_encrypted_inode(dir))
+ file_set_enc_name(inode);
+ }
/*
* This file should be checkpointed during fsync.
@@ -542,7 +511,7 @@ int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name,
level = 0;
slots = GET_DENTRY_SLOTS(new_name->len);
- dentry_hash = f2fs_dentry_hash(new_name);
+ dentry_hash = f2fs_dentry_hash(new_name, NULL);
current_depth = F2FS_I(dir)->i_current_depth;
if (F2FS_I(dir)->chash == dentry_hash) {
@@ -599,11 +568,9 @@ add_dentry:
err = PTR_ERR(page);
goto fail;
}
- if (f2fs_encrypted_inode(dir))
- file_set_enc_name(inode);
}
- make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1);
+ make_dentry_ptr_block(NULL, &d, dentry_blk);
f2fs_update_dentry(ino, mode, &d, new_name, dentry_hash, bit_pos);
set_page_dirty(dentry_page);
@@ -911,7 +878,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
dentry_blk = kmap(dentry_page);
- make_dentry_ptr(inode, &d, (void *)dentry_blk, 1);
+ make_dentry_ptr_block(inode, &d, dentry_blk);
err = f2fs_fill_dentries(ctx, &d,
n * NR_DENTRY_IN_BLOCK, &fstr);
diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index c6934f014e0f..2f98d7039701 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -18,6 +18,179 @@
#include "node.h"
#include <trace/events/f2fs.h>
+static struct rb_entry *__lookup_rb_tree_fast(struct rb_entry *cached_re,
+ unsigned int ofs)
+{
+ if (cached_re) {
+ if (cached_re->ofs <= ofs &&
+ cached_re->ofs + cached_re->len > ofs) {
+ return cached_re;
+ }
+ }
+ return NULL;
+}
+
+static struct rb_entry *__lookup_rb_tree_slow(struct rb_root *root,
+ unsigned int ofs)
+{
+ struct rb_node *node = root->rb_node;
+ struct rb_entry *re;
+
+ while (node) {
+ re = rb_entry(node, struct rb_entry, rb_node);
+
+ if (ofs < re->ofs)
+ node = node->rb_left;
+ else if (ofs >= re->ofs + re->len)
+ node = node->rb_right;
+ else
+ return re;
+ }
+ return NULL;
+}
+
+struct rb_entry *__lookup_rb_tree(struct rb_root *root,
+ struct rb_entry *cached_re, unsigned int ofs)
+{
+ struct rb_entry *re;
+
+ re = __lookup_rb_tree_fast(cached_re, ofs);
+ if (!re)
+ return __lookup_rb_tree_slow(root, ofs);
+
+ return re;
+}
+
+struct rb_node **__lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi,
+ struct rb_root *root, struct rb_node **parent,
+ unsigned int ofs)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_entry *re;
+
+ while (*p) {
+ *parent = *p;
+ re = rb_entry(*parent, struct rb_entry, rb_node);
+
+ if (ofs < re->ofs)
+ p = &(*p)->rb_left;
+ else if (ofs >= re->ofs + re->len)
+ p = &(*p)->rb_right;
+ else
+ f2fs_bug_on(sbi, 1);
+ }
+
+ return p;
+}
+
+/*
+ * lookup rb entry in position of @ofs in rb-tree,
+ * if hit, return the entry, otherwise, return NULL
+ * @prev_ex: extent before ofs
+ * @next_ex: extent after ofs
+ * @insert_p: insert point for new extent at ofs
+ * in order to simpfy the insertion after.
+ * tree must stay unchanged between lookup and insertion.
+ */
+struct rb_entry *__lookup_rb_tree_ret(struct rb_root *root,
+ struct rb_entry *cached_re,
+ unsigned int ofs,
+ struct rb_entry **prev_entry,
+ struct rb_entry **next_entry,
+ struct rb_node ***insert_p,
+ struct rb_node **insert_parent,
+ bool force)
+{
+ struct rb_node **pnode = &root->rb_node;
+ struct rb_node *parent = NULL, *tmp_node;
+ struct rb_entry *re = cached_re;
+
+ *insert_p = NULL;
+ *insert_parent = NULL;
+ *prev_entry = NULL;
+ *next_entry = NULL;
+
+ if (RB_EMPTY_ROOT(root))
+ return NULL;
+
+ if (re) {
+ if (re->ofs <= ofs && re->ofs + re->len > ofs)
+ goto lookup_neighbors;
+ }
+
+ while (*pnode) {
+ parent = *pnode;
+ re = rb_entry(*pnode, struct rb_entry, rb_node);
+
+ if (ofs < re->ofs)
+ pnode = &(*pnode)->rb_left;
+ else if (ofs >= re->ofs + re->len)
+ pnode = &(*pnode)->rb_right;
+ else
+ goto lookup_neighbors;
+ }
+
+ *insert_p = pnode;
+ *insert_parent = parent;
+
+ re = rb_entry(parent, struct rb_entry, rb_node);
+ tmp_node = parent;
+ if (parent && ofs > re->ofs)
+ tmp_node = rb_next(parent);
+ *next_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node);
+
+ tmp_node = parent;
+ if (parent && ofs < re->ofs)
+ tmp_node = rb_prev(parent);
+ *prev_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node);
+ return NULL;
+
+lookup_neighbors:
+ if (ofs == re->ofs || force) {
+ /* lookup prev node for merging backward later */
+ tmp_node = rb_prev(&re->rb_node);
+ *prev_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node);
+ }
+ if (ofs == re->ofs + re->len - 1 || force) {
+ /* lookup next node for merging frontward later */
+ tmp_node = rb_next(&re->rb_node);
+ *next_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node);
+ }
+ return re;
+}
+
+bool __check_rb_tree_consistence(struct f2fs_sb_info *sbi,
+ struct rb_root *root)
+{
+#ifdef CONFIG_F2FS_CHECK_FS
+ struct rb_node *cur = rb_first(root), *next;
+ struct rb_entry *cur_re, *next_re;
+
+ if (!cur)
+ return true;
+
+ while (cur) {
+ next = rb_next(cur);
+ if (!next)
+ return true;
+
+ cur_re = rb_entry(cur, struct rb_entry, rb_node);
+ next_re = rb_entry(next, struct rb_entry, rb_node);
+
+ if (cur_re->ofs + cur_re->len > next_re->ofs) {
+ f2fs_msg(sbi->sb, KERN_INFO, "inconsistent rbtree, "
+ "cur(%u, %u) next(%u, %u)",
+ cur_re->ofs, cur_re->len,
+ next_re->ofs, next_re->len);
+ return false;
+ }
+
+ cur = next;
+ }
+#endif
+ return true;
+}
+
static struct kmem_cache *extent_tree_slab;
static struct kmem_cache *extent_node_slab;
@@ -102,36 +275,6 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode)
return et;
}
-static struct extent_node *__lookup_extent_tree(struct f2fs_sb_info *sbi,
- struct extent_tree *et, unsigned int fofs)
-{
- struct rb_node *node = et->root.rb_node;
- struct extent_node *en = et->cached_en;
-
- if (en) {
- struct extent_info *cei = &en->ei;
-
- if (cei->fofs <= fofs && cei->fofs + cei->len > fofs) {
- stat_inc_cached_node_hit(sbi);
- return en;
- }
- }
-
- while (node) {
- en = rb_entry(node, struct extent_node, rb_node);
-
- if (fofs < en->ei.fofs) {
- node = node->rb_left;
- } else if (fofs >= en->ei.fofs + en->ei.len) {
- node = node->rb_right;
- } else {
- stat_inc_rbtree_node_hit(sbi);
- return en;
- }
- }
- return NULL;
-}
-
static struct extent_node *__init_extent_tree(struct f2fs_sb_info *sbi,
struct extent_tree *et, struct extent_info *ei)
{
@@ -237,17 +380,24 @@ static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs,
goto out;
}
- en = __lookup_extent_tree(sbi, et, pgofs);
- if (en) {
- *ei = en->ei;
- spin_lock(&sbi->extent_lock);
- if (!list_empty(&en->list)) {
- list_move_tail(&en->list, &sbi->extent_list);
- et->cached_en = en;
- }
- spin_unlock(&sbi->extent_lock);
- ret = true;
+ en = (struct extent_node *)__lookup_rb_tree(&et->root,
+ (struct rb_entry *)et->cached_en, pgofs);
+ if (!en)
+ goto out;
+
+ if (en == et->cached_en)
+ stat_inc_cached_node_hit(sbi);
+ else
+ stat_inc_rbtree_node_hit(sbi);
+
+ *ei = en->ei;
+ spin_lock(&sbi->extent_lock);
+ if (!list_empty(&en->list)) {
+ list_move_tail(&en->list, &sbi->extent_list);
+ et->cached_en = en;
}
+ spin_unlock(&sbi->extent_lock);
+ ret = true;
out:
stat_inc_total_hit(sbi);
read_unlock(&et->lock);
@@ -256,83 +406,6 @@ out:
return ret;
}
-
-/*
- * lookup extent at @fofs, if hit, return the extent
- * if not, return NULL and
- * @prev_ex: extent before fofs
- * @next_ex: extent after fofs
- * @insert_p: insert point for new extent at fofs
- * in order to simpfy the insertion after.
- * tree must stay unchanged between lookup and insertion.
- */
-static struct extent_node *__lookup_extent_tree_ret(struct extent_tree *et,
- unsigned int fofs,
- struct extent_node **prev_ex,
- struct extent_node **next_ex,
- struct rb_node ***insert_p,
- struct rb_node **insert_parent)
-{
- struct rb_node **pnode = &et->root.rb_node;
- struct rb_node *parent = NULL, *tmp_node;
- struct extent_node *en = et->cached_en;
-
- *insert_p = NULL;
- *insert_parent = NULL;
- *prev_ex = NULL;
- *next_ex = NULL;
-
- if (RB_EMPTY_ROOT(&et->root))
- return NULL;
-
- if (en) {
- struct extent_info *cei = &en->ei;
-
- if (cei->fofs <= fofs && cei->fofs + cei->len > fofs)
- goto lookup_neighbors;
- }
-
- while (*pnode) {
- parent = *pnode;
- en = rb_entry(*pnode, struct extent_node, rb_node);
-
- if (fofs < en->ei.fofs)
- pnode = &(*pnode)->rb_left;
- else if (fofs >= en->ei.fofs + en->ei.len)
- pnode = &(*pnode)->rb_right;
- else
- goto lookup_neighbors;
- }
-
- *insert_p = pnode;
- *insert_parent = parent;
-
- en = rb_entry(parent, struct extent_node, rb_node);
- tmp_node = parent;
- if (parent && fofs > en->ei.fofs)
- tmp_node = rb_next(parent);
- *next_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node);
-
- tmp_node = parent;
- if (parent && fofs < en->ei.fofs)
- tmp_node = rb_prev(parent);
- *prev_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node);
- return NULL;
-
-lookup_neighbors:
- if (fofs == en->ei.fofs) {
- /* lookup prev node for merging backward later */
- tmp_node = rb_prev(&en->rb_node);
- *prev_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node);
- }
- if (fofs == en->ei.fofs + en->ei.len - 1) {
- /* lookup next node for merging frontward later */
- tmp_node = rb_next(&en->rb_node);
- *next_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node);
- }
- return en;
-}
-
static struct extent_node *__try_merge_extent_node(struct inode *inode,
struct extent_tree *et, struct extent_info *ei,
struct extent_node *prev_ex,
@@ -387,17 +460,7 @@ static struct extent_node *__insert_extent_tree(struct inode *inode,
goto do_insert;
}
- while (*p) {
- parent = *p;
- en = rb_entry(parent, struct extent_node, rb_node);
-
- if (ei->fofs < en->ei.fofs)
- p = &(*p)->rb_left;
- else if (ei->fofs >= en->ei.fofs + en->ei.len)
- p = &(*p)->rb_right;
- else
- f2fs_bug_on(sbi, 1);
- }
+ p = __lookup_rb_tree_for_insert(sbi, &et->root, &parent, ei->fofs);
do_insert:
en = __attach_extent_node(sbi, et, ei, parent, p);
if (!en)
@@ -447,8 +510,11 @@ static void f2fs_update_extent_tree_range(struct inode *inode,
__drop_largest_extent(inode, fofs, len);
/* 1. lookup first extent node in range [fofs, fofs + len - 1] */
- en = __lookup_extent_tree_ret(et, fofs, &prev_en, &next_en,
- &insert_p, &insert_parent);
+ en = (struct extent_node *)__lookup_rb_tree_ret(&et->root,
+ (struct rb_entry *)et->cached_en, fofs,
+ (struct rb_entry **)&prev_en,
+ (struct rb_entry **)&next_en,
+ &insert_p, &insert_parent, false);
if (!en)
en = next_en;
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 0a6e115562f6..2185c7a040a1 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -50,6 +50,7 @@ enum {
FAULT_BLOCK,
FAULT_DIR_DEPTH,
FAULT_EVICT_INODE,
+ FAULT_TRUNCATE,
FAULT_IO,
FAULT_CHECKPOINT,
FAULT_MAX,
@@ -62,7 +63,7 @@ struct f2fs_fault_info {
};
extern char *fault_name[FAULT_MAX];
-#define IS_FAULT_SET(fi, type) (fi->inject_type & (1 << (type)))
+#define IS_FAULT_SET(fi, type) ((fi)->inject_type & (1 << (type)))
#endif
/*
@@ -88,9 +89,9 @@ extern char *fault_name[FAULT_MAX];
#define F2FS_MOUNT_ADAPTIVE 0x00020000
#define F2FS_MOUNT_LFS 0x00040000
-#define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option)
-#define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option)
-#define test_opt(sbi, option) (sbi->mount_opt.opt & F2FS_MOUNT_##option)
+#define clear_opt(sbi, option) ((sbi)->mount_opt.opt &= ~F2FS_MOUNT_##option)
+#define set_opt(sbi, option) ((sbi)->mount_opt.opt |= F2FS_MOUNT_##option)
+#define test_opt(sbi, option) ((sbi)->mount_opt.opt & F2FS_MOUNT_##option)
#define ver_after(a, b) (typecheck(unsigned long long, a) && \
typecheck(unsigned long long, b) && \
@@ -124,22 +125,20 @@ enum {
SIT_BITMAP
};
-enum {
- CP_UMOUNT,
- CP_FASTBOOT,
- CP_SYNC,
- CP_RECOVERY,
- CP_DISCARD,
-};
+#define CP_UMOUNT 0x00000001
+#define CP_FASTBOOT 0x00000002
+#define CP_SYNC 0x00000004
+#define CP_RECOVERY 0x00000008
+#define CP_DISCARD 0x00000010
+#define CP_TRIMMED 0x00000020
#define DEF_BATCHED_TRIM_SECTIONS 2048
#define BATCHED_TRIM_SEGMENTS(sbi) \
- (SM_I(sbi)->trim_sections * (sbi)->segs_per_sec)
+ (GET_SEG_FROM_SEC(sbi, SM_I(sbi)->trim_sections))
#define BATCHED_TRIM_BLOCKS(sbi) \
(BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg)
-#define MAX_DISCARD_BLOCKS(sbi) \
- ((1 << (sbi)->log_blocks_per_seg) * (sbi)->segs_per_sec)
-#define DISCARD_ISSUE_RATE 8
+#define MAX_DISCARD_BLOCKS(sbi) BLKS_PER_SEC(sbi)
+#define DISCARD_ISSUE_RATE 8
#define DEF_CP_INTERVAL 60 /* 60 secs */
#define DEF_IDLE_INTERVAL 5 /* 5 secs */
@@ -181,37 +180,63 @@ struct inode_entry {
struct inode *inode; /* vfs inode pointer */
};
-/* for the list of blockaddresses to be discarded */
+/* for the bitmap indicate blocks to be discarded */
struct discard_entry {
struct list_head list; /* list head */
- block_t blkaddr; /* block address to be discarded */
- int len; /* # of consecutive blocks of the discard */
+ block_t start_blkaddr; /* start blockaddr of current segment */
+ unsigned char discard_map[SIT_VBLOCK_MAP_SIZE]; /* segment discard bitmap */
};
+/* max discard pend list number */
+#define MAX_PLIST_NUM 512
+#define plist_idx(blk_num) ((blk_num) >= MAX_PLIST_NUM ? \
+ (MAX_PLIST_NUM - 1) : (blk_num - 1))
+
enum {
D_PREP,
D_SUBMIT,
D_DONE,
};
+struct discard_info {
+ block_t lstart; /* logical start address */
+ block_t len; /* length */
+ block_t start; /* actual start address in dev */
+};
+
struct discard_cmd {
+ struct rb_node rb_node; /* rb node located in rb-tree */
+ union {
+ struct {
+ block_t lstart; /* logical start address */
+ block_t len; /* length */
+ block_t start; /* actual start address in dev */
+ };
+ struct discard_info di; /* discard info */
+
+ };
struct list_head list; /* command list */
struct completion wait; /* compleation */
- block_t lstart; /* logical start address */
- block_t len; /* length */
- struct bio *bio; /* bio */
- int state; /* state */
+ struct block_device *bdev; /* bdev */
+ unsigned short ref; /* reference count */
+ unsigned char state; /* state */
+ int error; /* bio error */
};
struct discard_cmd_control {
struct task_struct *f2fs_issue_discard; /* discard thread */
- struct list_head discard_entry_list; /* 4KB discard entry list */
- int nr_discards; /* # of discards in the list */
- struct list_head discard_cmd_list; /* discard cmd list */
+ struct list_head entry_list; /* 4KB discard entry list */
+ struct list_head pend_list[MAX_PLIST_NUM];/* store pending entries */
+ struct list_head wait_list; /* store on-flushing entries */
wait_queue_head_t discard_wait_queue; /* waiting queue for wake-up */
struct mutex cmd_lock;
- int max_discards; /* max. discards to be issued */
- atomic_t submit_discard; /* # of issued discard */
+ unsigned int nr_discards; /* # of discards in the list */
+ unsigned int max_discards; /* max. discards to be issued */
+ unsigned int undiscard_blks; /* # of undiscard blocks */
+ atomic_t issued_discard; /* # of issued discard */
+ atomic_t issing_discard; /* # of issing discard */
+ atomic_t discard_cmd_cnt; /* # of cached cmd count */
+ struct rb_root root; /* root of discard rb-tree */
};
/* for the list of fsync inodes, used only during recovery */
@@ -222,13 +247,13 @@ struct fsync_inode_entry {
block_t last_dentry; /* block address locating the last dentry */
};
-#define nats_in_cursum(jnl) (le16_to_cpu(jnl->n_nats))
-#define sits_in_cursum(jnl) (le16_to_cpu(jnl->n_sits))
+#define nats_in_cursum(jnl) (le16_to_cpu((jnl)->n_nats))
+#define sits_in_cursum(jnl) (le16_to_cpu((jnl)->n_sits))
-#define nat_in_journal(jnl, i) (jnl->nat_j.entries[i].ne)
-#define nid_in_journal(jnl, i) (jnl->nat_j.entries[i].nid)
-#define sit_in_journal(jnl, i) (jnl->sit_j.entries[i].se)
-#define segno_in_journal(jnl, i) (jnl->sit_j.entries[i].segno)
+#define nat_in_journal(jnl, i) ((jnl)->nat_j.entries[i].ne)
+#define nid_in_journal(jnl, i) ((jnl)->nat_j.entries[i].nid)
+#define sit_in_journal(jnl, i) ((jnl)->sit_j.entries[i].se)
+#define segno_in_journal(jnl, i) ((jnl)->sit_j.entries[i].segno)
#define MAX_NAT_JENTRIES(jnl) (NAT_JOURNAL_ENTRIES - nats_in_cursum(jnl))
#define MAX_SIT_JENTRIES(jnl) (SIT_JOURNAL_ENTRIES - sits_in_cursum(jnl))
@@ -270,11 +295,14 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal,
#define F2FS_IOC_START_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 3)
#define F2FS_IOC_RELEASE_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 4)
#define F2FS_IOC_ABORT_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 5)
-#define F2FS_IOC_GARBAGE_COLLECT _IO(F2FS_IOCTL_MAGIC, 6)
+#define F2FS_IOC_GARBAGE_COLLECT _IOW(F2FS_IOCTL_MAGIC, 6, __u32)
#define F2FS_IOC_WRITE_CHECKPOINT _IO(F2FS_IOCTL_MAGIC, 7)
-#define F2FS_IOC_DEFRAGMENT _IO(F2FS_IOCTL_MAGIC, 8)
+#define F2FS_IOC_DEFRAGMENT _IOWR(F2FS_IOCTL_MAGIC, 8, \
+ struct f2fs_defragment)
#define F2FS_IOC_MOVE_RANGE _IOWR(F2FS_IOCTL_MAGIC, 9, \
struct f2fs_move_range)
+#define F2FS_IOC_FLUSH_DEVICE _IOW(F2FS_IOCTL_MAGIC, 10, \
+ struct f2fs_flush_device)
#define F2FS_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY
#define F2FS_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY
@@ -311,6 +339,11 @@ struct f2fs_move_range {
u64 len; /* size to move */
};
+struct f2fs_flush_device {
+ u32 dev_num; /* device number to flush */
+ u32 segments; /* # of segments to flush */
+};
+
/*
* For INODE and NODE manager
*/
@@ -323,26 +356,24 @@ struct f2fs_dentry_ptr {
int max;
};
-static inline void make_dentry_ptr(struct inode *inode,
- struct f2fs_dentry_ptr *d, void *src, int type)
+static inline void make_dentry_ptr_block(struct inode *inode,
+ struct f2fs_dentry_ptr *d, struct f2fs_dentry_block *t)
{
d->inode = inode;
+ d->max = NR_DENTRY_IN_BLOCK;
+ d->bitmap = &t->dentry_bitmap;
+ d->dentry = t->dentry;
+ d->filename = t->filename;
+}
- if (type == 1) {
- struct f2fs_dentry_block *t = (struct f2fs_dentry_block *)src;
-
- d->max = NR_DENTRY_IN_BLOCK;
- d->bitmap = &t->dentry_bitmap;
- d->dentry = t->dentry;
- d->filename = t->filename;
- } else {
- struct f2fs_inline_dentry *t = (struct f2fs_inline_dentry *)src;
-
- d->max = NR_INLINE_DENTRY;
- d->bitmap = &t->dentry_bitmap;
- d->dentry = t->dentry;
- d->filename = t->filename;
- }
+static inline void make_dentry_ptr_inline(struct inode *inode,
+ struct f2fs_dentry_ptr *d, struct f2fs_inline_dentry *t)
+{
+ d->inode = inode;
+ d->max = NR_INLINE_DENTRY;
+ d->bitmap = &t->dentry_bitmap;
+ d->dentry = t->dentry;
+ d->filename = t->filename;
}
/*
@@ -374,16 +405,30 @@ enum {
/* number of extent info in extent cache we try to shrink */
#define EXTENT_CACHE_SHRINK_NUMBER 128
+struct rb_entry {
+ struct rb_node rb_node; /* rb node located in rb-tree */
+ unsigned int ofs; /* start offset of the entry */
+ unsigned int len; /* length of the entry */
+};
+
struct extent_info {
unsigned int fofs; /* start offset in a file */
- u32 blk; /* start block address of the extent */
unsigned int len; /* length of the extent */
+ u32 blk; /* start block address of the extent */
};
struct extent_node {
- struct rb_node rb_node; /* rb node located in rb-tree */
+ struct rb_node rb_node;
+ union {
+ struct {
+ unsigned int fofs;
+ unsigned int len;
+ u32 blk;
+ };
+ struct extent_info ei; /* extent info */
+
+ };
struct list_head list; /* node in global extent list of sbi */
- struct extent_info ei; /* extent info */
struct extent_tree *et; /* extent tree pointer */
};
@@ -500,6 +545,24 @@ static inline void set_extent_info(struct extent_info *ei, unsigned int fofs,
ei->len = len;
}
+static inline bool __is_discard_mergeable(struct discard_info *back,
+ struct discard_info *front)
+{
+ return back->lstart + back->len == front->lstart;
+}
+
+static inline bool __is_discard_back_mergeable(struct discard_info *cur,
+ struct discard_info *back)
+{
+ return __is_discard_mergeable(back, cur);
+}
+
+static inline bool __is_discard_front_mergeable(struct discard_info *cur,
+ struct discard_info *front)
+{
+ return __is_discard_mergeable(cur, front);
+}
+
static inline bool __is_extent_mergeable(struct extent_info *back,
struct extent_info *front)
{
@@ -562,7 +625,6 @@ struct f2fs_nm_info {
unsigned char (*free_nid_bitmap)[NAT_ENTRY_BITMAP_SIZE];
unsigned char *nat_block_bitmap;
unsigned short *free_nid_count; /* free nid count of NAT block */
- spinlock_t free_nid_lock; /* protect updating of nid count */
/* for checkpoint */
char *nat_bitmap; /* NAT bitmap pointer */
@@ -641,7 +703,8 @@ struct flush_cmd {
struct flush_cmd_control {
struct task_struct *f2fs_issue_flush; /* flush thread */
wait_queue_head_t flush_wait_queue; /* waiting queue for wake-up */
- atomic_t submit_flush; /* # of issued flushes */
+ atomic_t issued_flush; /* # of issued flushes */
+ atomic_t issing_flush; /* # of issing flushes */
struct llist_head issue_list; /* list for command issue */
struct llist_node *dispatch_list; /* list for command dispatch */
};
@@ -672,6 +735,7 @@ struct f2fs_sm_info {
unsigned int ipu_policy; /* in-place-update policy */
unsigned int min_ipu_util; /* in-place-update threshold */
unsigned int min_fsync_blocks; /* threshold for fsync */
+ unsigned int min_hot_blocks; /* threshold for hot block allocation */
/* for flush command control */
struct flush_cmd_control *fcc_info;
@@ -722,6 +786,7 @@ enum page_type {
META_FLUSH,
INMEM, /* the below types are used by tracepoints only. */
INMEM_DROP,
+ INMEM_INVALIDATE,
INMEM_REVOKE,
IPU,
OPU,
@@ -737,9 +802,10 @@ struct f2fs_io_info {
struct page *page; /* page to be written */
struct page *encrypted_page; /* encrypted page */
bool submitted; /* indicate IO submission */
+ bool need_lock; /* indicate we need to lock cp_rwsem */
};
-#define is_read_io(rw) (rw == READ)
+#define is_read_io(rw) ((rw) == READ)
struct f2fs_bio_info {
struct f2fs_sb_info *sbi; /* f2fs superblock */
struct bio *bio; /* bios to merge */
@@ -827,6 +893,7 @@ struct f2fs_sb_info {
struct mutex cp_mutex; /* checkpoint procedure lock */
struct rw_semaphore cp_rwsem; /* blocking FS operations */
struct rw_semaphore node_write; /* locking node writes */
+ struct rw_semaphore node_change; /* locking node change */
wait_queue_head_t cp_wait;
unsigned long last_time[MAX_TIME]; /* to store time in jiffies */
long interval_time[MAX_TIME]; /* to store thresholds */
@@ -879,6 +946,9 @@ struct f2fs_sb_info {
/* # of allocated blocks */
struct percpu_counter alloc_valid_block_count;
+ /* writeback control */
+ atomic_t wb_sync_req; /* count # of WB_SYNC threads */
+
/* valid inode count */
struct percpu_counter total_valid_inode_count;
@@ -912,11 +982,12 @@ struct f2fs_sb_info {
atomic_t inline_inode; /* # of inline_data inodes */
atomic_t inline_dir; /* # of inline_dentry inodes */
atomic_t aw_cnt; /* # of atomic writes */
+ atomic_t vw_cnt; /* # of volatile writes */
atomic_t max_aw_cnt; /* max # of atomic writes */
+ atomic_t max_vw_cnt; /* max # of volatile writes */
int bg_gc; /* background gc calls */
unsigned int ndirty_inode[NR_INODE_TYPE]; /* # of dirty inodes */
#endif
- unsigned int last_victim[2]; /* last victim segment # */
spinlock_t stat_lock; /* lock for stat operations */
/* For sysfs suppport */
@@ -971,8 +1042,8 @@ static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type)
* and the return value is in kbytes. s is of struct f2fs_sb_info.
*/
#define BD_PART_WRITTEN(s) \
-(((u64)part_stat_read(s->sb->s_bdev->bd_part, sectors[1]) - \
- s->sectors_written_start) >> 1)
+(((u64)part_stat_read((s)->sb->s_bdev->bd_part, sectors[1]) - \
+ (s)->sectors_written_start) >> 1)
static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type)
{
@@ -1193,7 +1264,7 @@ static inline bool enabled_nat_bits(struct f2fs_sb_info *sbi,
{
bool set = is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG);
- return (cpc) ? (cpc->reason == CP_UMOUNT) && set : set;
+ return (cpc) ? (cpc->reason & CP_UMOUNT) && set : set;
}
static inline void f2fs_lock_op(struct f2fs_sb_info *sbi)
@@ -1229,7 +1300,7 @@ static inline int __get_cp_reason(struct f2fs_sb_info *sbi)
static inline bool __remain_node_summaries(int reason)
{
- return (reason == CP_UMOUNT || reason == CP_FASTBOOT);
+ return (reason & (CP_UMOUNT | CP_FASTBOOT));
}
static inline bool __exist_node_summaries(struct f2fs_sb_info *sbi)
@@ -1707,6 +1778,7 @@ enum {
FI_DO_DEFRAG, /* indicate defragment is running */
FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */
FI_NO_PREALLOC, /* indicate skipped preallocated blocks */
+ FI_HOT_DATA, /* indicate file is hot */
};
static inline void __mark_inode_dirty_flag(struct inode *inode,
@@ -1869,12 +1941,6 @@ static inline int f2fs_has_inline_data(struct inode *inode)
return is_inode_flag_set(inode, FI_INLINE_DATA);
}
-static inline void f2fs_clear_inline_inode(struct inode *inode)
-{
- clear_inode_flag(inode, FI_INLINE_DATA);
- clear_inode_flag(inode, FI_DATA_EXIST);
-}
-
static inline int f2fs_exist_data(struct inode *inode)
{
return is_inode_flag_set(inode, FI_DATA_EXIST);
@@ -2005,36 +2071,10 @@ static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi,
return kmalloc(size, flags);
}
-static inline void *f2fs_kvmalloc(size_t size, gfp_t flags)
-{
- void *ret;
-
- ret = kmalloc(size, flags | __GFP_NOWARN);
- if (!ret)
- ret = __vmalloc(size, flags, PAGE_KERNEL);
- return ret;
-}
-
-static inline void *f2fs_kvzalloc(size_t size, gfp_t flags)
-{
- void *ret;
-
- ret = kzalloc(size, flags | __GFP_NOWARN);
- if (!ret)
- ret = __vmalloc(size, flags | __GFP_ZERO, PAGE_KERNEL);
- return ret;
-}
-
#define get_inode_mode(i) \
((is_inode_flag_set(i, FI_ACL_MODE)) ? \
(F2FS_I(i)->i_acl_mode) : ((i)->i_mode))
-/* get offset of first page in next direct node */
-#define PGOFS_OF_NEXT_DNODE(pgofs, inode) \
- ((pgofs < ADDRS_PER_INODE(inode)) ? ADDRS_PER_INODE(inode) : \
- (pgofs - ADDRS_PER_INODE(inode) + ADDRS_PER_BLOCK) / \
- ADDRS_PER_BLOCK * ADDRS_PER_BLOCK + ADDRS_PER_INODE(inode))
-
/*
* file.c
*/
@@ -2096,8 +2136,6 @@ ino_t f2fs_inode_by_name(struct inode *dir, const struct qstr *qstr,
struct page **page);
void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
struct page *page, struct inode *inode);
-int update_dent_inode(struct inode *inode, struct inode *to,
- const struct qstr *name);
void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d,
const struct qstr *name, f2fs_hash_t name_hash,
unsigned int bit_pos);
@@ -2133,7 +2171,8 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi);
/*
* hash.c
*/
-f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info);
+f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info,
+ struct fscrypt_name *fname);
/*
* node.c
@@ -2184,6 +2223,7 @@ void destroy_node_manager_caches(void);
*/
void register_inmem_page(struct inode *inode, struct page *page);
void drop_inmem_pages(struct inode *inode);
+void drop_inmem_page(struct inode *inode, struct page *page);
int commit_inmem_pages(struct inode *inode);
void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need);
void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi);
@@ -2193,7 +2233,7 @@ void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free);
void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr);
bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr);
void refresh_sit_entry(struct f2fs_sb_info *sbi, block_t old, block_t new);
-void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr);
+void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi);
void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc);
void release_discard_addrs(struct f2fs_sb_info *sbi);
int npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra);
@@ -2205,7 +2245,7 @@ void update_meta_page(struct f2fs_sb_info *sbi, void *src, block_t blk_addr);
void write_meta_page(struct f2fs_sb_info *sbi, struct page *page);
void write_node_page(unsigned int nid, struct f2fs_io_info *fio);
void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio);
-void rewrite_data_page(struct f2fs_io_info *fio);
+int rewrite_data_page(struct f2fs_io_info *fio);
void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
block_t old_blkaddr, block_t new_blkaddr,
bool recover_curseg, bool recover_newaddr);
@@ -2310,7 +2350,8 @@ int f2fs_migrate_page(struct address_space *mapping, struct page *newpage,
int start_gc_thread(struct f2fs_sb_info *sbi);
void stop_gc_thread(struct f2fs_sb_info *sbi);
block_t start_bidx_of_node(unsigned int node_ofs, struct inode *inode);
-int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background);
+int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background,
+ unsigned int segno);
void build_gc_manager(struct f2fs_sb_info *sbi);
/*
@@ -2334,11 +2375,15 @@ struct f2fs_stat_info {
int ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, ndirty_imeta;
int inmem_pages;
unsigned int ndirty_dirs, ndirty_files, ndirty_all;
- int nats, dirty_nats, sits, dirty_sits, free_nids, alloc_nids;
+ int nats, dirty_nats, sits, dirty_sits;
+ int free_nids, avail_nids, alloc_nids;
int total_count, utilization;
- int bg_gc, nr_wb_cp_data, nr_wb_data, nr_flush, nr_discard;
+ int bg_gc, nr_wb_cp_data, nr_wb_data;
+ int nr_flushing, nr_flushed, nr_discarding, nr_discarded;
+ int nr_discard_cmd;
+ unsigned int undiscard_blks;
int inline_xattr, inline_inode, inline_dir, append, update, orphans;
- int aw_cnt, max_aw_cnt;
+ int aw_cnt, max_aw_cnt, vw_cnt, max_vw_cnt;
unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks;
unsigned int bimodal, avg_vblocks;
int util_free, util_valid, util_invalid;
@@ -2421,11 +2466,22 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
if (cur > max) \
atomic_set(&F2FS_I_SB(inode)->max_aw_cnt, cur); \
} while (0)
+#define stat_inc_volatile_write(inode) \
+ (atomic_inc(&F2FS_I_SB(inode)->vw_cnt))
+#define stat_dec_volatile_write(inode) \
+ (atomic_dec(&F2FS_I_SB(inode)->vw_cnt))
+#define stat_update_max_volatile_write(inode) \
+ do { \
+ int cur = atomic_read(&F2FS_I_SB(inode)->vw_cnt); \
+ int max = atomic_read(&F2FS_I_SB(inode)->max_vw_cnt); \
+ if (cur > max) \
+ atomic_set(&F2FS_I_SB(inode)->max_vw_cnt, cur); \
+ } while (0)
#define stat_inc_seg_count(sbi, type, gc_type) \
do { \
struct f2fs_stat_info *si = F2FS_STAT(sbi); \
- (si)->tot_segs++; \
- if (type == SUM_TYPE_DATA) { \
+ si->tot_segs++; \
+ if ((type) == SUM_TYPE_DATA) { \
si->data_segs++; \
si->bg_data_segs += (gc_type == BG_GC) ? 1 : 0; \
} else { \
@@ -2435,14 +2491,14 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
} while (0)
#define stat_inc_tot_blk_count(si, blks) \
- (si->tot_blks += (blks))
+ ((si)->tot_blks += (blks))
#define stat_inc_data_blk_count(sbi, blks, gc_type) \
do { \
struct f2fs_stat_info *si = F2FS_STAT(sbi); \
stat_inc_tot_blk_count(si, blks); \
si->data_blks += (blks); \
- si->bg_data_blks += (gc_type == BG_GC) ? (blks) : 0; \
+ si->bg_data_blks += ((gc_type) == BG_GC) ? (blks) : 0; \
} while (0)
#define stat_inc_node_blk_count(sbi, blks, gc_type) \
@@ -2450,7 +2506,7 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
struct f2fs_stat_info *si = F2FS_STAT(sbi); \
stat_inc_tot_blk_count(si, blks); \
si->node_blks += (blks); \
- si->bg_node_blks += (gc_type == BG_GC) ? (blks) : 0; \
+ si->bg_node_blks += ((gc_type) == BG_GC) ? (blks) : 0; \
} while (0)
int f2fs_build_stats(struct f2fs_sb_info *sbi);
@@ -2458,32 +2514,35 @@ void f2fs_destroy_stats(struct f2fs_sb_info *sbi);
int __init f2fs_create_root_stats(void);
void f2fs_destroy_root_stats(void);
#else
-#define stat_inc_cp_count(si)
-#define stat_inc_bg_cp_count(si)
-#define stat_inc_call_count(si)
-#define stat_inc_bggc_count(si)
-#define stat_inc_dirty_inode(sbi, type)
-#define stat_dec_dirty_inode(sbi, type)
-#define stat_inc_total_hit(sb)
-#define stat_inc_rbtree_node_hit(sb)
-#define stat_inc_largest_node_hit(sbi)
-#define stat_inc_cached_node_hit(sbi)
-#define stat_inc_inline_xattr(inode)
-#define stat_dec_inline_xattr(inode)
-#define stat_inc_inline_inode(inode)
-#define stat_dec_inline_inode(inode)
-#define stat_inc_inline_dir(inode)
-#define stat_dec_inline_dir(inode)
-#define stat_inc_atomic_write(inode)
-#define stat_dec_atomic_write(inode)
-#define stat_update_max_atomic_write(inode)
-#define stat_inc_seg_type(sbi, curseg)
-#define stat_inc_block_count(sbi, curseg)
-#define stat_inc_inplace_blocks(sbi)
-#define stat_inc_seg_count(sbi, type, gc_type)
-#define stat_inc_tot_blk_count(si, blks)
-#define stat_inc_data_blk_count(sbi, blks, gc_type)
-#define stat_inc_node_blk_count(sbi, blks, gc_type)
+#define stat_inc_cp_count(si) do { } while (0)
+#define stat_inc_bg_cp_count(si) do { } while (0)
+#define stat_inc_call_count(si) do { } while (0)
+#define stat_inc_bggc_count(si) do { } while (0)
+#define stat_inc_dirty_inode(sbi, type) do { } while (0)
+#define stat_dec_dirty_inode(sbi, type) do { } while (0)
+#define stat_inc_total_hit(sb) do { } while (0)
+#define stat_inc_rbtree_node_hit(sb) do { } while (0)
+#define stat_inc_largest_node_hit(sbi) do { } while (0)
+#define stat_inc_cached_node_hit(sbi) do { } while (0)
+#define stat_inc_inline_xattr(inode) do { } while (0)
+#define stat_dec_inline_xattr(inode) do { } while (0)
+#define stat_inc_inline_inode(inode) do { } while (0)
+#define stat_dec_inline_inode(inode) do { } while (0)
+#define stat_inc_inline_dir(inode) do { } while (0)
+#define stat_dec_inline_dir(inode) do { } while (0)
+#define stat_inc_atomic_write(inode) do { } while (0)
+#define stat_dec_atomic_write(inode) do { } while (0)
+#define stat_update_max_atomic_write(inode) do { } while (0)
+#define stat_inc_volatile_write(inode) do { } while (0)
+#define stat_dec_volatile_write(inode) do { } while (0)
+#define stat_update_max_volatile_write(inode) do { } while (0)
+#define stat_inc_seg_type(sbi, curseg) do { } while (0)
+#define stat_inc_block_count(sbi, curseg) do { } while (0)
+#define stat_inc_inplace_blocks(sbi) do { } while (0)
+#define stat_inc_seg_count(sbi, type, gc_type) do { } while (0)
+#define stat_inc_tot_blk_count(si, blks) do { } while (0)
+#define stat_inc_data_blk_count(sbi, blks, gc_type) do { } while (0)
+#define stat_inc_node_blk_count(sbi, blks, gc_type) do { } while (0)
static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; }
static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { }
@@ -2509,7 +2568,7 @@ extern struct kmem_cache *inode_entry_slab;
bool f2fs_may_inline_data(struct inode *inode);
bool f2fs_may_inline_dentry(struct inode *inode);
void read_inline_data(struct page *page, struct page *ipage);
-bool truncate_inline_inode(struct page *ipage, u64 from);
+void truncate_inline_inode(struct inode *inode, struct page *ipage, u64 from);
int f2fs_read_inline_data(struct inode *inode, struct page *page);
int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page);
int f2fs_convert_inline_inode(struct inode *inode);
@@ -2544,6 +2603,18 @@ void f2fs_leave_shrinker(struct f2fs_sb_info *sbi);
/*
* extent_cache.c
*/
+struct rb_entry *__lookup_rb_tree(struct rb_root *root,
+ struct rb_entry *cached_re, unsigned int ofs);
+struct rb_node **__lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi,
+ struct rb_root *root, struct rb_node **parent,
+ unsigned int ofs);
+struct rb_entry *__lookup_rb_tree_ret(struct rb_root *root,
+ struct rb_entry *cached_re, unsigned int ofs,
+ struct rb_entry **prev_entry, struct rb_entry **next_entry,
+ struct rb_node ***insert_p, struct rb_node **insert_parent,
+ bool force);
+bool __check_rb_tree_consistence(struct f2fs_sb_info *sbi,
+ struct rb_root *root);
unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink);
bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext);
void f2fs_drop_extent_tree(struct inode *inode);
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 5f7317875a67..61af721329fa 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -116,11 +116,6 @@ static int get_parent_ino(struct inode *inode, nid_t *pino)
if (!dentry)
return 0;
- if (update_dent_inode(inode, inode, &dentry->d_name)) {
- dput(dentry);
- return 0;
- }
-
*pino = parent_ino(dentry);
dput(dentry);
return 1;
@@ -528,7 +523,7 @@ static int truncate_partial_data_page(struct inode *inode, u64 from,
page = get_lock_data_page(inode, index, true);
if (IS_ERR(page))
- return 0;
+ return PTR_ERR(page) == -ENOENT ? 0 : PTR_ERR(page);
truncate_out:
f2fs_wait_on_page_writeback(page, DATA, true);
zero_user(page, offset, PAGE_SIZE - offset);
@@ -566,9 +561,7 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock)
}
if (f2fs_has_inline_data(inode)) {
- truncate_inline_inode(ipage, from);
- if (from == 0)
- clear_inode_flag(inode, FI_DATA_EXIST);
+ truncate_inline_inode(inode, ipage, from);
f2fs_put_page(ipage, 1);
truncate_page = true;
goto out;
@@ -617,6 +610,12 @@ int f2fs_truncate(struct inode *inode)
trace_f2fs_truncate(inode);
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ if (time_to_inject(F2FS_I_SB(inode), FAULT_TRUNCATE)) {
+ f2fs_show_injection_info(FAULT_TRUNCATE);
+ return -EIO;
+ }
+#endif
/* we should check inline_data size */
if (!f2fs_may_inline_data(inode)) {
err = f2fs_convert_inline_inode(inode);
@@ -1012,11 +1011,11 @@ static int __exchange_data_block(struct inode *src_inode,
while (len) {
olen = min((pgoff_t)4 * ADDRS_PER_BLOCK, len);
- src_blkaddr = f2fs_kvzalloc(sizeof(block_t) * olen, GFP_KERNEL);
+ src_blkaddr = kvzalloc(sizeof(block_t) * olen, GFP_KERNEL);
if (!src_blkaddr)
return -ENOMEM;
- do_replace = f2fs_kvzalloc(sizeof(int) * olen, GFP_KERNEL);
+ do_replace = kvzalloc(sizeof(int) * olen, GFP_KERNEL);
if (!do_replace) {
kvfree(src_blkaddr);
return -ENOMEM;
@@ -1188,8 +1187,6 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
if (ret)
return ret;
- if (offset + len > new_size)
- new_size = offset + len;
new_size = max_t(loff_t, new_size, offset + len);
} else {
if (off_start) {
@@ -1257,8 +1254,9 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
int ret = 0;
new_size = i_size_read(inode) + len;
- if (new_size > inode->i_sb->s_maxbytes)
- return -EFBIG;
+ ret = inode_newsize_ok(inode, new_size);
+ if (ret)
+ return ret;
if (offset >= i_size_read(inode))
return -EINVAL;
@@ -1428,6 +1426,7 @@ static int f2fs_release_file(struct inode *inode, struct file *filp)
drop_inmem_pages(inode);
if (f2fs_is_volatile_file(inode)) {
clear_inode_flag(inode, FI_VOLATILE_FILE);
+ stat_dec_volatile_write(inode);
set_inode_flag(inode, FI_DROP_CACHE);
filemap_fdatawrite(inode->i_mapping);
clear_inode_flag(inode, FI_DROP_CACHE);
@@ -1474,10 +1473,10 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg)
if (ret)
return ret;
- flags = f2fs_mask_flags(inode->i_mode, flags);
-
inode_lock(inode);
+ flags = f2fs_mask_flags(inode->i_mode, flags);
+
oldflags = fi->i_flags;
if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
@@ -1491,10 +1490,11 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg)
flags = flags & FS_FL_USER_MODIFIABLE;
flags |= oldflags & ~FS_FL_USER_MODIFIABLE;
fi->i_flags = flags;
- inode_unlock(inode);
inode->i_ctime = current_time(inode);
f2fs_set_inode_flags(inode);
+
+ inode_unlock(inode);
out:
mnt_drop_write_file(filp);
return ret;
@@ -1515,6 +1515,9 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
if (!inode_owner_or_capable(inode))
return -EACCES;
+ if (!S_ISREG(inode->i_mode))
+ return -EINVAL;
+
ret = mnt_want_write_file(filp);
if (ret)
return ret;
@@ -1529,20 +1532,25 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
goto out;
set_inode_flag(inode, FI_ATOMIC_FILE);
+ set_inode_flag(inode, FI_HOT_DATA);
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
if (!get_dirty_pages(inode))
- goto out;
+ goto inc_stat;
f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING,
"Unexpected flush for atomic writes: ino=%lu, npages=%u",
inode->i_ino, get_dirty_pages(inode));
ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX);
- if (ret)
+ if (ret) {
clear_inode_flag(inode, FI_ATOMIC_FILE);
-out:
+ goto out;
+ }
+
+inc_stat:
stat_inc_atomic_write(inode);
stat_update_max_atomic_write(inode);
+out:
inode_unlock(inode);
mnt_drop_write_file(filp);
return ret;
@@ -1592,6 +1600,9 @@ static int f2fs_ioc_start_volatile_write(struct file *filp)
if (!inode_owner_or_capable(inode))
return -EACCES;
+ if (!S_ISREG(inode->i_mode))
+ return -EINVAL;
+
ret = mnt_want_write_file(filp);
if (ret)
return ret;
@@ -1605,6 +1616,9 @@ static int f2fs_ioc_start_volatile_write(struct file *filp)
if (ret)
goto out;
+ stat_inc_volatile_write(inode);
+ stat_update_max_volatile_write(inode);
+
set_inode_flag(inode, FI_VOLATILE_FILE);
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
out:
@@ -1660,6 +1674,7 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp)
drop_inmem_pages(inode);
if (f2fs_is_volatile_file(inode)) {
clear_inode_flag(inode, FI_VOLATILE_FILE);
+ stat_dec_volatile_write(inode);
ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true);
}
@@ -1841,7 +1856,7 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg)
mutex_lock(&sbi->gc_mutex);
}
- ret = f2fs_gc(sbi, sync, true);
+ ret = f2fs_gc(sbi, sync, true, NULL_SEGNO);
out:
mnt_drop_write_file(filp);
return ret;
@@ -1879,13 +1894,12 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
pgoff_t pg_start, pg_end;
unsigned int blk_per_seg = sbi->blocks_per_seg;
unsigned int total = 0, sec_num;
- unsigned int pages_per_sec = sbi->segs_per_sec * blk_per_seg;
block_t blk_end = 0;
bool fragmented = false;
int err;
/* if in-place-update policy is enabled, don't waste time here */
- if (need_inplace_update(inode))
+ if (need_inplace_update_policy(inode, NULL))
return -EINVAL;
pg_start = range->start >> PAGE_SHIFT;
@@ -1943,7 +1957,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
map.m_lblk = pg_start;
map.m_len = pg_end - pg_start;
- sec_num = (map.m_len + pages_per_sec - 1) / pages_per_sec;
+ sec_num = (map.m_len + BLKS_PER_SEC(sbi) - 1) / BLKS_PER_SEC(sbi);
/*
* make sure there are enough free section for LFS allocation, this can
@@ -2020,42 +2034,40 @@ static int f2fs_ioc_defragment(struct file *filp, unsigned long arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!S_ISREG(inode->i_mode))
+ if (!S_ISREG(inode->i_mode) || f2fs_is_atomic_file(inode))
return -EINVAL;
- err = mnt_want_write_file(filp);
- if (err)
- return err;
-
- if (f2fs_readonly(sbi->sb)) {
- err = -EROFS;
- goto out;
- }
+ if (f2fs_readonly(sbi->sb))
+ return -EROFS;
if (copy_from_user(&range, (struct f2fs_defragment __user *)arg,
- sizeof(range))) {
- err = -EFAULT;
- goto out;
- }
+ sizeof(range)))
+ return -EFAULT;
/* verify alignment of offset & size */
- if (range.start & (F2FS_BLKSIZE - 1) ||
- range.len & (F2FS_BLKSIZE - 1)) {
- err = -EINVAL;
- goto out;
- }
+ if (range.start & (F2FS_BLKSIZE - 1) || range.len & (F2FS_BLKSIZE - 1))
+ return -EINVAL;
+
+ if (unlikely((range.start + range.len) >> PAGE_SHIFT >
+ sbi->max_file_blocks))
+ return -EINVAL;
+
+ err = mnt_want_write_file(filp);
+ if (err)
+ return err;
err = f2fs_defragment_range(sbi, filp, &range);
+ mnt_drop_write_file(filp);
+
f2fs_update_time(sbi, REQ_TIME);
if (err < 0)
- goto out;
+ return err;
if (copy_to_user((struct f2fs_defragment __user *)arg, &range,
sizeof(range)))
- err = -EFAULT;
-out:
- mnt_drop_write_file(filp);
- return err;
+ return -EFAULT;
+
+ return 0;
}
static int f2fs_move_file_range(struct file *file_in, loff_t pos_in,
@@ -2189,6 +2201,8 @@ static int f2fs_ioc_move_range(struct file *filp, unsigned long arg)
range.pos_out, range.len);
mnt_drop_write_file(filp);
+ if (err)
+ goto err_out;
if (copy_to_user((struct f2fs_move_range __user *)arg,
&range, sizeof(range)))
@@ -2198,6 +2212,69 @@ err_out:
return err;
}
+static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct sit_info *sm = SIT_I(sbi);
+ unsigned int start_segno = 0, end_segno = 0;
+ unsigned int dev_start_segno = 0, dev_end_segno = 0;
+ struct f2fs_flush_device range;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (f2fs_readonly(sbi->sb))
+ return -EROFS;
+
+ if (copy_from_user(&range, (struct f2fs_flush_device __user *)arg,
+ sizeof(range)))
+ return -EFAULT;
+
+ if (sbi->s_ndevs <= 1 || sbi->s_ndevs - 1 <= range.dev_num ||
+ sbi->segs_per_sec != 1) {
+ f2fs_msg(sbi->sb, KERN_WARNING,
+ "Can't flush %u in %d for segs_per_sec %u != 1\n",
+ range.dev_num, sbi->s_ndevs,
+ sbi->segs_per_sec);
+ return -EINVAL;
+ }
+
+ ret = mnt_want_write_file(filp);
+ if (ret)
+ return ret;
+
+ if (range.dev_num != 0)
+ dev_start_segno = GET_SEGNO(sbi, FDEV(range.dev_num).start_blk);
+ dev_end_segno = GET_SEGNO(sbi, FDEV(range.dev_num).end_blk);
+
+ start_segno = sm->last_victim[FLUSH_DEVICE];
+ if (start_segno < dev_start_segno || start_segno >= dev_end_segno)
+ start_segno = dev_start_segno;
+ end_segno = min(start_segno + range.segments, dev_end_segno);
+
+ while (start_segno < end_segno) {
+ if (!mutex_trylock(&sbi->gc_mutex)) {
+ ret = -EBUSY;
+ goto out;
+ }
+ sm->last_victim[GC_CB] = end_segno + 1;
+ sm->last_victim[GC_GREEDY] = end_segno + 1;
+ sm->last_victim[ALLOC_NEXT] = end_segno + 1;
+ ret = f2fs_gc(sbi, true, true, start_segno);
+ if (ret == -EAGAIN)
+ ret = 0;
+ else if (ret < 0)
+ break;
+ start_segno++;
+ }
+out:
+ mnt_drop_write_file(filp);
+ return ret;
+}
+
+
long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
switch (cmd) {
@@ -2235,6 +2312,8 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return f2fs_ioc_defragment(filp, arg);
case F2FS_IOC_MOVE_RANGE:
return f2fs_ioc_move_range(filp, arg);
+ case F2FS_IOC_FLUSH_DEVICE:
+ return f2fs_ioc_flush_device(filp, arg);
default:
return -ENOTTY;
}
@@ -2302,8 +2381,8 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case F2FS_IOC_GARBAGE_COLLECT:
case F2FS_IOC_WRITE_CHECKPOINT:
case F2FS_IOC_DEFRAGMENT:
- break;
case F2FS_IOC_MOVE_RANGE:
+ case F2FS_IOC_FLUSH_DEVICE:
break;
default:
return -ENOIOCTLCMD;
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 418fd9881646..026522107ca3 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -84,7 +84,7 @@ static int gc_thread_func(void *data)
stat_inc_bggc_count(sbi);
/* if return value is not zero, no victim was selected */
- if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC), true))
+ if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC), true, NULL_SEGNO))
wait_ms = gc_th->no_gc_sleep_time;
trace_f2fs_background_gc(sbi->sb, wait_ms,
@@ -172,7 +172,11 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type,
if (gc_type != FG_GC && p->max_search > sbi->max_victim_search)
p->max_search = sbi->max_victim_search;
- p->offset = sbi->last_victim[p->gc_mode];
+ /* let's select beginning hot/small space first */
+ if (type == CURSEG_HOT_DATA || IS_NODESEG(type))
+ p->offset = 0;
+ else
+ p->offset = SIT_I(sbi)->last_victim[p->gc_mode];
}
static unsigned int get_max_cost(struct f2fs_sb_info *sbi,
@@ -182,7 +186,7 @@ static unsigned int get_max_cost(struct f2fs_sb_info *sbi,
if (p->alloc_mode == SSR)
return sbi->blocks_per_seg;
if (p->gc_mode == GC_GREEDY)
- return sbi->blocks_per_seg * p->ofs_unit;
+ return 2 * sbi->blocks_per_seg * p->ofs_unit;
else if (p->gc_mode == GC_CB)
return UINT_MAX;
else /* No other gc_mode */
@@ -207,7 +211,7 @@ static unsigned int check_bg_victims(struct f2fs_sb_info *sbi)
continue;
clear_bit(secno, dirty_i->victim_secmap);
- return secno * sbi->segs_per_sec;
+ return GET_SEG_FROM_SEC(sbi, secno);
}
return NULL_SEGNO;
}
@@ -215,8 +219,8 @@ static unsigned int check_bg_victims(struct f2fs_sb_info *sbi)
static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno)
{
struct sit_info *sit_i = SIT_I(sbi);
- unsigned int secno = GET_SECNO(sbi, segno);
- unsigned int start = secno * sbi->segs_per_sec;
+ unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);
+ unsigned int start = GET_SEG_FROM_SEC(sbi, secno);
unsigned long long mtime = 0;
unsigned int vblocks;
unsigned char age = 0;
@@ -225,7 +229,7 @@ static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno)
for (i = 0; i < sbi->segs_per_sec; i++)
mtime += get_seg_entry(sbi, start + i)->mtime;
- vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec);
+ vblocks = get_valid_blocks(sbi, segno, true);
mtime = div_u64(mtime, sbi->segs_per_sec);
vblocks = div_u64(vblocks, sbi->segs_per_sec);
@@ -248,7 +252,7 @@ static unsigned int get_greedy_cost(struct f2fs_sb_info *sbi,
unsigned int segno)
{
unsigned int valid_blocks =
- get_valid_blocks(sbi, segno, sbi->segs_per_sec);
+ get_valid_blocks(sbi, segno, true);
return IS_DATASEG(get_seg_entry(sbi, segno)->type) ?
valid_blocks * 2 : valid_blocks;
@@ -291,6 +295,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
unsigned int *result, int gc_type, int type, char alloc_mode)
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+ struct sit_info *sm = SIT_I(sbi);
struct victim_sel_policy p;
unsigned int secno, last_victim;
unsigned int last_segment = MAIN_SEGS(sbi);
@@ -304,10 +309,18 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
p.min_segno = NULL_SEGNO;
p.min_cost = get_max_cost(sbi, &p);
+ if (*result != NULL_SEGNO) {
+ if (IS_DATASEG(get_seg_entry(sbi, *result)->type) &&
+ get_valid_blocks(sbi, *result, false) &&
+ !sec_usage_check(sbi, GET_SEC_FROM_SEG(sbi, *result)))
+ p.min_segno = *result;
+ goto out;
+ }
+
if (p.max_search == 0)
goto out;
- last_victim = sbi->last_victim[p.gc_mode];
+ last_victim = sm->last_victim[p.gc_mode];
if (p.alloc_mode == LFS && gc_type == FG_GC) {
p.min_segno = check_bg_victims(sbi);
if (p.min_segno != NULL_SEGNO)
@@ -320,9 +333,10 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
segno = find_next_bit(p.dirty_segmap, last_segment, p.offset);
if (segno >= last_segment) {
- if (sbi->last_victim[p.gc_mode]) {
- last_segment = sbi->last_victim[p.gc_mode];
- sbi->last_victim[p.gc_mode] = 0;
+ if (sm->last_victim[p.gc_mode]) {
+ last_segment =
+ sm->last_victim[p.gc_mode];
+ sm->last_victim[p.gc_mode] = 0;
p.offset = 0;
continue;
}
@@ -339,7 +353,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
nsearched++;
}
- secno = GET_SECNO(sbi, segno);
+ secno = GET_SEC_FROM_SEG(sbi, segno);
if (sec_usage_check(sbi, secno))
goto next;
@@ -357,17 +371,18 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
}
next:
if (nsearched >= p.max_search) {
- if (!sbi->last_victim[p.gc_mode] && segno <= last_victim)
- sbi->last_victim[p.gc_mode] = last_victim + 1;
+ if (!sm->last_victim[p.gc_mode] && segno <= last_victim)
+ sm->last_victim[p.gc_mode] = last_victim + 1;
else
- sbi->last_victim[p.gc_mode] = segno + 1;
+ sm->last_victim[p.gc_mode] = segno + 1;
+ sm->last_victim[p.gc_mode] %= MAIN_SEGS(sbi);
break;
}
}
if (p.min_segno != NULL_SEGNO) {
got_it:
if (p.alloc_mode == LFS) {
- secno = GET_SECNO(sbi, p.min_segno);
+ secno = GET_SEC_FROM_SEG(sbi, p.min_segno);
if (gc_type == FG_GC)
sbi->cur_victim_sec = secno;
else
@@ -550,8 +565,10 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
get_node_info(sbi, nid, dni);
if (sum->version != dni->version) {
- f2fs_put_page(node_page, 1);
- return false;
+ f2fs_msg(sbi->sb, KERN_WARNING,
+ "%s: valid data with mismatched node version.",
+ __func__);
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
}
*nofs = ofs_of_node(node_page);
@@ -697,8 +714,10 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type,
.type = DATA,
.op = REQ_OP_WRITE,
.op_flags = REQ_SYNC,
+ .old_blkaddr = NULL_ADDR,
.page = page,
.encrypted_page = NULL,
+ .need_lock = true,
};
bool is_dirty = PageDirty(page);
int err;
@@ -890,7 +909,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
GET_SUM_BLOCK(sbi, segno));
f2fs_put_page(sum_page, 0);
- if (get_valid_blocks(sbi, segno, 1) == 0 ||
+ if (get_valid_blocks(sbi, segno, false) == 0 ||
!PageUptodate(sum_page) ||
unlikely(f2fs_cp_error(sbi)))
goto next;
@@ -905,7 +924,6 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
* - mutex_lock(sentry_lock) - change_curseg()
* - lock_page(sum_page)
*/
-
if (type == SUM_TYPE_NODE)
gc_node_segment(sbi, sum->entries, segno, gc_type);
else
@@ -924,7 +942,7 @@ next:
blk_finish_plug(&plug);
if (gc_type == FG_GC &&
- get_valid_blocks(sbi, start_segno, sbi->segs_per_sec) == 0)
+ get_valid_blocks(sbi, start_segno, true) == 0)
sec_freed = 1;
stat_inc_call_count(sbi->stat_info);
@@ -932,13 +950,14 @@ next:
return sec_freed;
}
-int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background)
+int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
+ bool background, unsigned int segno)
{
- unsigned int segno;
int gc_type = sync ? FG_GC : BG_GC;
int sec_freed = 0;
int ret = -EINVAL;
struct cp_control cpc;
+ unsigned int init_segno = segno;
struct gc_inode_list gc_list = {
.ilist = LIST_HEAD_INIT(gc_list.ilist),
.iroot = RADIX_TREE_INIT(GFP_NOFS),
@@ -959,9 +978,11 @@ gc_more:
* threshold, we can make them free by checkpoint. Then, we
* secure free segments which doesn't need fggc any more.
*/
- ret = write_checkpoint(sbi, &cpc);
- if (ret)
- goto stop;
+ if (prefree_segments(sbi)) {
+ ret = write_checkpoint(sbi, &cpc);
+ if (ret)
+ goto stop;
+ }
if (has_not_enough_free_secs(sbi, 0, 0))
gc_type = FG_GC;
}
@@ -981,13 +1002,17 @@ gc_more:
sbi->cur_victim_sec = NULL_SEGNO;
if (!sync) {
- if (has_not_enough_free_secs(sbi, sec_freed, 0))
+ if (has_not_enough_free_secs(sbi, sec_freed, 0)) {
+ segno = NULL_SEGNO;
goto gc_more;
+ }
if (gc_type == FG_GC)
ret = write_checkpoint(sbi, &cpc);
}
stop:
+ SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0;
+ SIT_I(sbi)->last_victim[FLUSH_DEVICE] = init_segno;
mutex_unlock(&sbi->gc_mutex);
put_gc_inode(&gc_list);
@@ -999,7 +1024,7 @@ stop:
void build_gc_manager(struct f2fs_sb_info *sbi)
{
- u64 main_count, resv_count, ovp_count, blocks_per_sec;
+ u64 main_count, resv_count, ovp_count;
DIRTY_I(sbi)->v_ops = &default_v_ops;
@@ -1007,8 +1032,12 @@ void build_gc_manager(struct f2fs_sb_info *sbi)
main_count = SM_I(sbi)->main_segments << sbi->log_blocks_per_seg;
resv_count = SM_I(sbi)->reserved_segments << sbi->log_blocks_per_seg;
ovp_count = SM_I(sbi)->ovp_segments << sbi->log_blocks_per_seg;
- blocks_per_sec = sbi->blocks_per_seg * sbi->segs_per_sec;
- sbi->fggc_threshold = div64_u64((main_count - ovp_count) * blocks_per_sec,
- (main_count - resv_count));
+ sbi->fggc_threshold = div64_u64((main_count - ovp_count) *
+ BLKS_PER_SEC(sbi), (main_count - resv_count));
+
+ /* give warm/cold data area from slower device */
+ if (sbi->s_ndevs && sbi->segs_per_sec == 1)
+ SIT_I(sbi)->last_victim[ALLOC_NEXT] =
+ GET_SEGNO(sbi, FDEV(0).end_blk) + 1;
}
diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c
index 71b7206c431e..eb2e031ea887 100644
--- a/fs/f2fs/hash.c
+++ b/fs/f2fs/hash.c
@@ -70,7 +70,8 @@ static void str2hashbuf(const unsigned char *msg, size_t len,
*buf++ = pad;
}
-f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info)
+f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info,
+ struct fscrypt_name *fname)
{
__u32 hash;
f2fs_hash_t f2fs_hash;
@@ -79,6 +80,10 @@ f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info)
const unsigned char *name = name_info->name;
size_t len = name_info->len;
+ /* encrypted bigname case */
+ if (fname && !fname->disk_name.name)
+ return cpu_to_le32(fname->hash);
+
if (is_dot_dotdot(name_info))
return 0;
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index e32a9e527968..e4c527c4e7d0 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -63,19 +63,21 @@ void read_inline_data(struct page *page, struct page *ipage)
SetPageUptodate(page);
}
-bool truncate_inline_inode(struct page *ipage, u64 from)
+void truncate_inline_inode(struct inode *inode, struct page *ipage, u64 from)
{
void *addr;
if (from >= MAX_INLINE_DATA)
- return false;
+ return;
addr = inline_data_addr(ipage);
f2fs_wait_on_page_writeback(ipage, NODE, true);
memset(addr + from, 0, MAX_INLINE_DATA - from);
set_page_dirty(ipage);
- return true;
+
+ if (from == 0)
+ clear_inode_flag(inode, FI_DATA_EXIST);
}
int f2fs_read_inline_data(struct inode *inode, struct page *page)
@@ -135,6 +137,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page)
/* write data page to try to make data consistent */
set_page_writeback(page);
fio.old_blkaddr = dn->data_blkaddr;
+ set_inode_flag(dn->inode, FI_HOT_DATA);
write_data_page(dn, &fio);
f2fs_wait_on_page_writeback(page, DATA, true);
if (dirty) {
@@ -146,11 +149,11 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page)
set_inode_flag(dn->inode, FI_APPEND_WRITE);
/* clear inline data and flag after data writeback */
- truncate_inline_inode(dn->inode_page, 0);
+ truncate_inline_inode(dn->inode, dn->inode_page, 0);
clear_inline_node(dn->inode_page);
clear_out:
stat_dec_inline_inode(dn->inode);
- f2fs_clear_inline_inode(dn->inode);
+ clear_inode_flag(dn->inode, FI_INLINE_DATA);
f2fs_put_dnode(dn);
return 0;
}
@@ -267,9 +270,8 @@ process_inline:
if (f2fs_has_inline_data(inode)) {
ipage = get_node_page(sbi, inode->i_ino);
f2fs_bug_on(sbi, IS_ERR(ipage));
- if (!truncate_inline_inode(ipage, 0))
- return false;
- f2fs_clear_inline_inode(inode);
+ truncate_inline_inode(inode, ipage, 0);
+ clear_inode_flag(inode, FI_INLINE_DATA);
f2fs_put_page(ipage, 1);
} else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) {
if (truncate_blocks(inode, 0, false))
@@ -296,11 +298,11 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir,
return NULL;
}
- namehash = f2fs_dentry_hash(&name);
+ namehash = f2fs_dentry_hash(&name, fname);
inline_dentry = inline_data_addr(ipage);
- make_dentry_ptr(NULL, &d, (void *)inline_dentry, 2);
+ make_dentry_ptr_inline(NULL, &d, inline_dentry);
de = find_target_dentry(fname, namehash, NULL, &d);
unlock_page(ipage);
if (de)
@@ -319,7 +321,7 @@ int make_empty_inline_dir(struct inode *inode, struct inode *parent,
dentry_blk = inline_data_addr(ipage);
- make_dentry_ptr(NULL, &d, (void *)dentry_blk, 2);
+ make_dentry_ptr_inline(NULL, &d, dentry_blk);
do_make_empty_dir(inode, parent, &d);
set_page_dirty(ipage);
@@ -380,7 +382,7 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage,
set_page_dirty(page);
/* clear inline dir and flag after data writeback */
- truncate_inline_inode(ipage, 0);
+ truncate_inline_inode(dir, ipage, 0);
stat_dec_inline_dir(dir);
clear_inode_flag(dir, FI_INLINE_DENTRY);
@@ -400,7 +402,7 @@ static int f2fs_add_inline_entries(struct inode *dir,
unsigned long bit_pos = 0;
int err = 0;
- make_dentry_ptr(NULL, &d, (void *)inline_dentry, 2);
+ make_dentry_ptr_inline(NULL, &d, inline_dentry);
while (bit_pos < d.max) {
struct f2fs_dir_entry *de;
@@ -455,7 +457,7 @@ static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage,
}
memcpy(backup_dentry, inline_dentry, MAX_INLINE_DATA);
- truncate_inline_inode(ipage, 0);
+ truncate_inline_inode(dir, ipage, 0);
unlock_page(ipage);
@@ -527,14 +529,12 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name,
err = PTR_ERR(page);
goto fail;
}
- if (f2fs_encrypted_inode(dir))
- file_set_enc_name(inode);
}
f2fs_wait_on_page_writeback(ipage, NODE, true);
- name_hash = f2fs_dentry_hash(new_name);
- make_dentry_ptr(NULL, &d, (void *)dentry_blk, 2);
+ name_hash = f2fs_dentry_hash(new_name, NULL);
+ make_dentry_ptr_inline(NULL, &d, dentry_blk);
f2fs_update_dentry(ino, mode, &d, new_name, name_hash, bit_pos);
set_page_dirty(ipage);
@@ -623,7 +623,7 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx,
inline_dentry = inline_data_addr(ipage);
- make_dentry_ptr(inode, &d, (void *)inline_dentry, 2);
+ make_dentry_ptr_inline(inode, &d, inline_dentry);
err = f2fs_fill_dentries(ctx, &d, 0, fstr);
if (!err)
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 24bb8213d974..518f49643092 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -316,7 +316,6 @@ retry:
} else if (err != -ENOENT) {
f2fs_stop_checkpoint(sbi, false);
}
- f2fs_inode_synced(inode);
return 0;
}
ret = update_inode(inode, node_page);
@@ -339,7 +338,8 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
* We need to balance fs here to prevent from producing dirty node pages
* during the urgent cleaning time when runing out of free sections.
*/
- if (update_inode_page(inode) && wbc && wbc->nr_to_write)
+ update_inode_page(inode);
+ if (wbc && wbc->nr_to_write)
f2fs_balance_fs(sbi, true);
return 0;
}
@@ -372,13 +372,6 @@ void f2fs_evict_inode(struct inode *inode)
if (inode->i_nlink || is_bad_inode(inode))
goto no_delete;
-#ifdef CONFIG_F2FS_FAULT_INJECTION
- if (time_to_inject(sbi, FAULT_EVICT_INODE)) {
- f2fs_show_injection_info(FAULT_EVICT_INODE);
- goto no_delete;
- }
-#endif
-
remove_ino_entry(sbi, inode->i_ino, APPEND_INO);
remove_ino_entry(sbi, inode->i_ino, UPDATE_INO);
@@ -389,6 +382,12 @@ retry:
if (F2FS_HAS_BLOCKS(inode))
err = f2fs_truncate(inode);
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ if (time_to_inject(sbi, FAULT_EVICT_INODE)) {
+ f2fs_show_injection_info(FAULT_EVICT_INODE);
+ err = -EIO;
+ }
+#endif
if (!err) {
f2fs_lock_op(sbi);
err = remove_inode_page(inode);
@@ -411,7 +410,10 @@ no_delete:
stat_dec_inline_dir(inode);
stat_dec_inline_inode(inode);
- invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino);
+ /* ino == 0, if f2fs_new_inode() was failed t*/
+ if (inode->i_ino)
+ invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino,
+ inode->i_ino);
if (xnid)
invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid);
if (inode->i_nlink) {
@@ -448,6 +450,7 @@ void handle_failed_inode(struct inode *inode)
* in a panic when flushing dirty inodes in gdirty_list.
*/
update_inode_page(inode);
+ f2fs_inode_synced(inode);
/* don't make bad inode, since it becomes a regular file. */
unlock_new_inode(inode);
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 98f00a3a7f50..c31b40e5f9cf 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -148,8 +148,6 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
inode->i_mapping->a_ops = &f2fs_dblock_aops;
ino = inode->i_ino;
- f2fs_balance_fs(sbi, true);
-
f2fs_lock_op(sbi);
err = f2fs_add_link(dentry, inode);
if (err)
@@ -163,6 +161,8 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
if (IS_DIRSYNC(dir))
f2fs_sync_fs(sbi->sb, 1);
+
+ f2fs_balance_fs(sbi, true);
return 0;
out:
handle_failed_inode(inode);
@@ -324,9 +324,10 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
if (f2fs_encrypted_inode(dir) &&
(S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) &&
!fscrypt_has_permitted_context(dir, inode)) {
- bool nokey = f2fs_encrypted_inode(inode) &&
- !fscrypt_has_encryption_key(inode);
- err = nokey ? -ENOKEY : -EPERM;
+ f2fs_msg(inode->i_sb, KERN_WARNING,
+ "Inconsistent encryption contexts: %lu/%lu",
+ dir->i_ino, inode->i_ino);
+ err = -EPERM;
goto err_out;
}
return d_splice_alias(inode, dentry);
@@ -423,8 +424,6 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
inode_nohighmem(inode);
inode->i_mapping->a_ops = &f2fs_dblock_aops;
- f2fs_balance_fs(sbi, true);
-
f2fs_lock_op(sbi);
err = f2fs_add_link(dentry, inode);
if (err)
@@ -487,6 +486,8 @@ err_out:
}
kfree(sd);
+
+ f2fs_balance_fs(sbi, true);
return err;
out:
handle_failed_inode(inode);
@@ -508,8 +509,6 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
inode->i_mapping->a_ops = &f2fs_dblock_aops;
mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_HIGH_ZERO);
- f2fs_balance_fs(sbi, true);
-
set_inode_flag(inode, FI_INC_LINK);
f2fs_lock_op(sbi);
err = f2fs_add_link(dentry, inode);
@@ -524,6 +523,8 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
if (IS_DIRSYNC(dir))
f2fs_sync_fs(sbi->sb, 1);
+
+ f2fs_balance_fs(sbi, true);
return 0;
out_fail:
@@ -554,8 +555,6 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
init_special_inode(inode, inode->i_mode, rdev);
inode->i_op = &f2fs_special_inode_operations;
- f2fs_balance_fs(sbi, true);
-
f2fs_lock_op(sbi);
err = f2fs_add_link(dentry, inode);
if (err)
@@ -569,6 +568,8 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
if (IS_DIRSYNC(dir))
f2fs_sync_fs(sbi->sb, 1);
+
+ f2fs_balance_fs(sbi, true);
return 0;
out:
handle_failed_inode(inode);
@@ -595,8 +596,6 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry,
inode->i_mapping->a_ops = &f2fs_dblock_aops;
}
- f2fs_balance_fs(sbi, true);
-
f2fs_lock_op(sbi);
err = acquire_orphan_inode(sbi);
if (err)
@@ -622,6 +621,8 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry,
/* link_count was changed by d_tmpfile as well. */
f2fs_unlock_op(sbi);
unlock_new_inode(inode);
+
+ f2fs_balance_fs(sbi, true);
return 0;
release_out:
@@ -720,13 +721,6 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (err)
goto put_out_dir;
- err = update_dent_inode(old_inode, new_inode,
- &new_dentry->d_name);
- if (err) {
- release_orphan_inode(sbi);
- goto put_out_dir;
- }
-
f2fs_set_link(new_dir, new_entry, new_page, old_inode);
new_inode->i_ctime = current_time(new_inode);
@@ -779,8 +773,6 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
down_write(&F2FS_I(old_inode)->i_sem);
file_lost_pino(old_inode);
- if (new_inode && file_enc_name(new_inode))
- file_set_enc_name(old_inode);
up_write(&F2FS_I(old_inode)->i_sem);
old_inode->i_ctime = current_time(old_inode);
@@ -908,8 +900,8 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
old_nlink = old_dir_entry ? -1 : 1;
new_nlink = -old_nlink;
err = -EMLINK;
- if ((old_nlink > 0 && old_inode->i_nlink >= F2FS_LINK_MAX) ||
- (new_nlink > 0 && new_inode->i_nlink >= F2FS_LINK_MAX))
+ if ((old_nlink > 0 && old_dir->i_nlink >= F2FS_LINK_MAX) ||
+ (new_nlink > 0 && new_dir->i_nlink >= F2FS_LINK_MAX))
goto out_new_dir;
}
@@ -917,18 +909,6 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
f2fs_lock_op(sbi);
- err = update_dent_inode(old_inode, new_inode, &new_dentry->d_name);
- if (err)
- goto out_unlock;
- if (file_enc_name(new_inode))
- file_set_enc_name(old_inode);
-
- err = update_dent_inode(new_inode, old_inode, &old_dentry->d_name);
- if (err)
- goto out_undo;
- if (file_enc_name(old_inode))
- file_set_enc_name(new_inode);
-
/* update ".." directory entry info of old dentry */
if (old_dir_entry)
f2fs_set_link(old_inode, old_dir_entry, old_dir_page, new_dir);
@@ -972,14 +952,6 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
f2fs_sync_fs(sbi->sb, 1);
return 0;
-out_undo:
- /*
- * Still we may fail to recover name info of f2fs_inode here
- * Drop it, once its name is set as encrypted
- */
- update_dent_inode(old_inode, old_inode, &old_dentry->d_name);
-out_unlock:
- f2fs_unlock_op(sbi);
out_new_dir:
if (new_dir_entry) {
f2fs_dentry_kunmap(new_inode, new_dir_page);
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 481aa8dc79f4..4547c5c5cd98 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -22,7 +22,7 @@
#include "trace.h"
#include <trace/events/f2fs.h>
-#define on_build_free_nids(nmi) mutex_is_locked(&nm_i->build_lock)
+#define on_build_free_nids(nmi) mutex_is_locked(&(nm_i)->build_lock)
static struct kmem_cache *nat_entry_slab;
static struct kmem_cache *free_nid_slab;
@@ -63,8 +63,9 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
int i;
for (i = 0; i <= UPDATE_INO; i++)
- mem_size += (sbi->im[i].ino_num *
- sizeof(struct ino_entry)) >> PAGE_SHIFT;
+ mem_size += sbi->im[i].ino_num *
+ sizeof(struct ino_entry);
+ mem_size >>= PAGE_SHIFT;
res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
} else if (type == EXTENT_CACHE) {
mem_size = (atomic_read(&sbi->total_ext_tree) *
@@ -177,18 +178,12 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i,
}
static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i,
- struct nat_entry *ne)
+ struct nat_entry_set *set, struct nat_entry *ne)
{
- nid_t set = NAT_BLOCK_OFFSET(ne->ni.nid);
- struct nat_entry_set *head;
-
- head = radix_tree_lookup(&nm_i->nat_set_root, set);
- if (head) {
- list_move_tail(&ne->list, &nm_i->nat_entries);
- set_nat_flag(ne, IS_DIRTY, false);
- head->entry_cnt--;
- nm_i->dirty_nat_cnt--;
- }
+ list_move_tail(&ne->list, &nm_i->nat_entries);
+ set_nat_flag(ne, IS_DIRTY, false);
+ set->entry_cnt--;
+ nm_i->dirty_nat_cnt--;
}
static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i,
@@ -381,6 +376,7 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
struct page *page = NULL;
struct f2fs_nat_entry ne;
struct nat_entry *e;
+ pgoff_t index;
int i;
ni->nid = nid;
@@ -406,17 +402,21 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
node_info_from_raw_nat(ni, &ne);
}
up_read(&curseg->journal_rwsem);
- if (i >= 0)
+ if (i >= 0) {
+ up_read(&nm_i->nat_tree_lock);
goto cache;
+ }
/* Fill node_info from nat page */
- page = get_current_nat_page(sbi, start_nid);
+ index = current_nat_addr(sbi, nid);
+ up_read(&nm_i->nat_tree_lock);
+
+ page = get_meta_page(sbi, index);
nat_blk = (struct f2fs_nat_block *)page_address(page);
ne = nat_blk->entries[nid - start_nid];
node_info_from_raw_nat(ni, &ne);
f2fs_put_page(page, 1);
cache:
- up_read(&nm_i->nat_tree_lock);
/* cache nat entry */
down_write(&nm_i->nat_tree_lock);
cache_nat_entry(sbi, nid, &ne);
@@ -1463,6 +1463,9 @@ continue_unlock:
f2fs_wait_on_page_writeback(page, NODE, true);
BUG_ON(PageWriteback(page));
+ set_fsync_mark(page, 0);
+ set_dentry_mark(page, 0);
+
if (!atomic || page == last_page) {
set_fsync_mark(page, 1);
if (IS_INODE(page)) {
@@ -1766,40 +1769,67 @@ static void __remove_nid_from_list(struct f2fs_sb_info *sbi,
static bool add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
- struct free_nid *i;
+ struct free_nid *i, *e;
struct nat_entry *ne;
- int err;
+ int err = -EINVAL;
+ bool ret = false;
/* 0 nid should not be used */
if (unlikely(nid == 0))
return false;
- if (build) {
- /* do not add allocated nids */
- ne = __lookup_nat_cache(nm_i, nid);
- if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) ||
- nat_get_blkaddr(ne) != NULL_ADDR))
- return false;
- }
-
i = f2fs_kmem_cache_alloc(free_nid_slab, GFP_NOFS);
i->nid = nid;
i->state = NID_NEW;
- if (radix_tree_preload(GFP_NOFS)) {
- kmem_cache_free(free_nid_slab, i);
- return true;
- }
+ if (radix_tree_preload(GFP_NOFS))
+ goto err;
spin_lock(&nm_i->nid_list_lock);
+
+ if (build) {
+ /*
+ * Thread A Thread B
+ * - f2fs_create
+ * - f2fs_new_inode
+ * - alloc_nid
+ * - __insert_nid_to_list(ALLOC_NID_LIST)
+ * - f2fs_balance_fs_bg
+ * - build_free_nids
+ * - __build_free_nids
+ * - scan_nat_page
+ * - add_free_nid
+ * - __lookup_nat_cache
+ * - f2fs_add_link
+ * - init_inode_metadata
+ * - new_inode_page
+ * - new_node_page
+ * - set_node_addr
+ * - alloc_nid_done
+ * - __remove_nid_from_list(ALLOC_NID_LIST)
+ * - __insert_nid_to_list(FREE_NID_LIST)
+ */
+ ne = __lookup_nat_cache(nm_i, nid);
+ if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) ||
+ nat_get_blkaddr(ne) != NULL_ADDR))
+ goto err_out;
+
+ e = __lookup_free_nid_list(nm_i, nid);
+ if (e) {
+ if (e->state == NID_NEW)
+ ret = true;
+ goto err_out;
+ }
+ }
+ ret = true;
err = __insert_nid_to_list(sbi, i, FREE_NID_LIST, true);
+err_out:
spin_unlock(&nm_i->nid_list_lock);
radix_tree_preload_end();
- if (err) {
+err:
+ if (err)
kmem_cache_free(free_nid_slab, i);
- return true;
- }
- return true;
+ return ret;
}
static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid)
@@ -1821,7 +1851,7 @@ static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid)
}
static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid,
- bool set, bool build, bool locked)
+ bool set, bool build)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
unsigned int nat_ofs = NAT_BLOCK_OFFSET(nid);
@@ -1835,14 +1865,10 @@ static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid,
else
__clear_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]);
- if (!locked)
- spin_lock(&nm_i->free_nid_lock);
if (set)
nm_i->free_nid_count[nat_ofs]++;
else if (!build)
nm_i->free_nid_count[nat_ofs]--;
- if (!locked)
- spin_unlock(&nm_i->free_nid_lock);
}
static void scan_nat_page(struct f2fs_sb_info *sbi,
@@ -1871,7 +1897,9 @@ static void scan_nat_page(struct f2fs_sb_info *sbi,
f2fs_bug_on(sbi, blk_addr == NEW_ADDR);
if (blk_addr == NULL_ADDR)
freed = add_free_nid(sbi, start_nid, true);
- update_free_nid_bitmap(sbi, start_nid, freed, true, false);
+ spin_lock(&NM_I(sbi)->nid_list_lock);
+ update_free_nid_bitmap(sbi, start_nid, freed, true);
+ spin_unlock(&NM_I(sbi)->nid_list_lock);
}
}
@@ -1927,6 +1955,9 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount)
int i = 0;
nid_t nid = nm_i->next_scan_nid;
+ if (unlikely(nid >= nm_i->max_nid))
+ nid = 0;
+
/* Enough entries */
if (nm_i->nid_cnt[FREE_NID_LIST] >= NAT_ENTRY_PER_BLOCK)
return;
@@ -2026,7 +2057,7 @@ retry:
__insert_nid_to_list(sbi, i, ALLOC_NID_LIST, false);
nm_i->available_nids--;
- update_free_nid_bitmap(sbi, *nid, false, false, false);
+ update_free_nid_bitmap(sbi, *nid, false, false);
spin_unlock(&nm_i->nid_list_lock);
return true;
@@ -2082,7 +2113,7 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid)
nm_i->available_nids++;
- update_free_nid_bitmap(sbi, nid, true, false, false);
+ update_free_nid_bitmap(sbi, nid, true, false);
spin_unlock(&nm_i->nid_list_lock);
@@ -2407,16 +2438,16 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
}
raw_nat_from_node_info(raw_ne, &ne->ni);
nat_reset_flag(ne);
- __clear_nat_cache_dirty(NM_I(sbi), ne);
+ __clear_nat_cache_dirty(NM_I(sbi), set, ne);
if (nat_get_blkaddr(ne) == NULL_ADDR) {
add_free_nid(sbi, nid, false);
spin_lock(&NM_I(sbi)->nid_list_lock);
NM_I(sbi)->available_nids++;
- update_free_nid_bitmap(sbi, nid, true, false, false);
+ update_free_nid_bitmap(sbi, nid, true, false);
spin_unlock(&NM_I(sbi)->nid_list_lock);
} else {
spin_lock(&NM_I(sbi)->nid_list_lock);
- update_free_nid_bitmap(sbi, nid, false, false, false);
+ update_free_nid_bitmap(sbi, nid, false, false);
spin_unlock(&NM_I(sbi)->nid_list_lock);
}
}
@@ -2428,10 +2459,11 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
f2fs_put_page(page, 1);
}
- f2fs_bug_on(sbi, set->entry_cnt);
-
- radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set);
- kmem_cache_free(nat_entry_set_slab, set);
+ /* Allow dirty nats by node block allocation in write_begin */
+ if (!set->entry_cnt) {
+ radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set);
+ kmem_cache_free(nat_entry_set_slab, set);
+ }
}
/*
@@ -2476,8 +2508,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
__flush_nat_entry_set(sbi, set, cpc);
up_write(&nm_i->nat_tree_lock);
-
- f2fs_bug_on(sbi, nm_i->dirty_nat_cnt);
+ /* Allow dirty nats by node block allocation in write_begin */
}
static int __get_nat_bitmaps(struct f2fs_sb_info *sbi)
@@ -2541,10 +2572,10 @@ inline void load_free_nid_bitmap(struct f2fs_sb_info *sbi)
nid = i * NAT_ENTRY_PER_BLOCK;
last_nid = (i + 1) * NAT_ENTRY_PER_BLOCK;
- spin_lock(&nm_i->free_nid_lock);
+ spin_lock(&NM_I(sbi)->nid_list_lock);
for (; nid < last_nid; nid++)
- update_free_nid_bitmap(sbi, nid, true, true, true);
- spin_unlock(&nm_i->free_nid_lock);
+ update_free_nid_bitmap(sbi, nid, true, true);
+ spin_unlock(&NM_I(sbi)->nid_list_lock);
}
for (i = 0; i < nm_i->nat_blocks; i++) {
@@ -2621,23 +2652,20 @@ static int init_free_nid_cache(struct f2fs_sb_info *sbi)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
- nm_i->free_nid_bitmap = f2fs_kvzalloc(nm_i->nat_blocks *
+ nm_i->free_nid_bitmap = kvzalloc(nm_i->nat_blocks *
NAT_ENTRY_BITMAP_SIZE, GFP_KERNEL);
if (!nm_i->free_nid_bitmap)
return -ENOMEM;
- nm_i->nat_block_bitmap = f2fs_kvzalloc(nm_i->nat_blocks / 8,
+ nm_i->nat_block_bitmap = kvzalloc(nm_i->nat_blocks / 8,
GFP_KERNEL);
if (!nm_i->nat_block_bitmap)
return -ENOMEM;
- nm_i->free_nid_count = f2fs_kvzalloc(nm_i->nat_blocks *
+ nm_i->free_nid_count = kvzalloc(nm_i->nat_blocks *
sizeof(unsigned short), GFP_KERNEL);
if (!nm_i->free_nid_count)
return -ENOMEM;
-
- spin_lock_init(&nm_i->free_nid_lock);
-
return 0;
}
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index 2f9603fa85a5..558048e33cf9 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -9,10 +9,10 @@
* published by the Free Software Foundation.
*/
/* start node id of a node block dedicated to the given node id */
-#define START_NID(nid) ((nid / NAT_ENTRY_PER_BLOCK) * NAT_ENTRY_PER_BLOCK)
+#define START_NID(nid) (((nid) / NAT_ENTRY_PER_BLOCK) * NAT_ENTRY_PER_BLOCK)
/* node block offset on the NAT area dedicated to the given start node id */
-#define NAT_BLOCK_OFFSET(start_nid) (start_nid / NAT_ENTRY_PER_BLOCK)
+#define NAT_BLOCK_OFFSET(start_nid) ((start_nid) / NAT_ENTRY_PER_BLOCK)
/* # of pages to perform synchronous readahead before building free nids */
#define FREE_NID_PAGES 8
@@ -62,16 +62,16 @@ struct nat_entry {
struct node_info ni; /* in-memory node information */
};
-#define nat_get_nid(nat) (nat->ni.nid)
-#define nat_set_nid(nat, n) (nat->ni.nid = n)
-#define nat_get_blkaddr(nat) (nat->ni.blk_addr)
-#define nat_set_blkaddr(nat, b) (nat->ni.blk_addr = b)
-#define nat_get_ino(nat) (nat->ni.ino)
-#define nat_set_ino(nat, i) (nat->ni.ino = i)
-#define nat_get_version(nat) (nat->ni.version)
-#define nat_set_version(nat, v) (nat->ni.version = v)
+#define nat_get_nid(nat) ((nat)->ni.nid)
+#define nat_set_nid(nat, n) ((nat)->ni.nid = (n))
+#define nat_get_blkaddr(nat) ((nat)->ni.blk_addr)
+#define nat_set_blkaddr(nat, b) ((nat)->ni.blk_addr = (b))
+#define nat_get_ino(nat) ((nat)->ni.ino)
+#define nat_set_ino(nat, i) ((nat)->ni.ino = (i))
+#define nat_get_version(nat) ((nat)->ni.version)
+#define nat_set_version(nat, v) ((nat)->ni.version = (v))
-#define inc_node_version(version) (++version)
+#define inc_node_version(version) (++(version))
static inline void copy_node_info(struct node_info *dst,
struct node_info *src)
@@ -200,13 +200,16 @@ static inline pgoff_t current_nat_addr(struct f2fs_sb_info *sbi, nid_t start)
struct f2fs_nm_info *nm_i = NM_I(sbi);
pgoff_t block_off;
pgoff_t block_addr;
- int seg_off;
+ /*
+ * block_off = segment_off * 512 + off_in_segment
+ * OLD = (segment_off * 512) * 2 + off_in_segment
+ * NEW = 2 * (segment_off * 512 + off_in_segment) - off_in_segment
+ */
block_off = NAT_BLOCK_OFFSET(start);
- seg_off = block_off >> sbi->log_blocks_per_seg;
block_addr = (pgoff_t)(nm_i->nat_blkaddr +
- (seg_off << sbi->log_blocks_per_seg << 1) +
+ (block_off << 1) -
(block_off & (sbi->blocks_per_seg - 1)));
if (f2fs_test_bit(block_off, nm_i->nat_bitmap))
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index d025aa83fb5b..907d6b7dde6a 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -198,7 +198,8 @@ static void recover_inode(struct inode *inode, struct page *page)
ino_of_node(page), name);
}
-static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
+static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head,
+ bool check_only)
{
struct curseg_info *curseg;
struct page *page = NULL;
@@ -225,7 +226,8 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
entry = get_fsync_inode(head, ino_of_node(page));
if (!entry) {
- if (IS_INODE(page) && is_dent_dnode(page)) {
+ if (!check_only &&
+ IS_INODE(page) && is_dent_dnode(page)) {
err = recover_inode_page(sbi, page);
if (err)
break;
@@ -569,7 +571,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only)
mutex_lock(&sbi->cp_mutex);
/* step #1: find fsynced inode numbers */
- err = find_fsync_dnodes(sbi, &inode_list);
+ err = find_fsync_dnodes(sbi, &inode_list, check_only);
if (err || list_empty(&inode_list))
goto out;
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 29ef7088c558..96845854e7ee 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -250,6 +250,36 @@ void drop_inmem_pages(struct inode *inode)
stat_dec_atomic_write(inode);
}
+void drop_inmem_page(struct inode *inode, struct page *page)
+{
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct list_head *head = &fi->inmem_pages;
+ struct inmem_pages *cur = NULL;
+
+ f2fs_bug_on(sbi, !IS_ATOMIC_WRITTEN_PAGE(page));
+
+ mutex_lock(&fi->inmem_lock);
+ list_for_each_entry(cur, head, list) {
+ if (cur->page == page)
+ break;
+ }
+
+ f2fs_bug_on(sbi, !cur || cur->page != page);
+ list_del(&cur->list);
+ mutex_unlock(&fi->inmem_lock);
+
+ dec_page_count(sbi, F2FS_INMEM_PAGES);
+ kmem_cache_free(inmem_entry_slab, cur);
+
+ ClearPageUptodate(page);
+ set_page_private(page, 0);
+ ClearPagePrivate(page);
+ f2fs_put_page(page, 0);
+
+ trace_f2fs_commit_inmem_page(page, INMEM_INVALIDATE);
+}
+
static int __commit_inmem_pages(struct inode *inode,
struct list_head *revoke_list)
{
@@ -261,7 +291,6 @@ static int __commit_inmem_pages(struct inode *inode,
.type = DATA,
.op = REQ_OP_WRITE,
.op_flags = REQ_SYNC | REQ_PRIO,
- .encrypted_page = NULL,
};
pgoff_t last_idx = ULONG_MAX;
int err = 0;
@@ -281,6 +310,9 @@ static int __commit_inmem_pages(struct inode *inode,
}
fio.page = page;
+ fio.old_blkaddr = NULL_ADDR;
+ fio.encrypted_page = NULL;
+ fio.need_lock = false,
err = do_write_data_page(&fio);
if (err) {
unlock_page(page);
@@ -358,11 +390,8 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need)
}
#endif
- if (!need)
- return;
-
/* balance_fs_bg is able to be pending */
- if (excess_cached_nats(sbi))
+ if (need && excess_cached_nats(sbi))
f2fs_balance_fs_bg(sbi);
/*
@@ -371,7 +400,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need)
*/
if (has_not_enough_free_secs(sbi, 0, 0)) {
mutex_lock(&sbi->gc_mutex);
- f2fs_gc(sbi, false, false);
+ f2fs_gc(sbi, false, false, NULL_SEGNO);
}
}
@@ -390,7 +419,7 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
else
build_free_nids(sbi, false, false);
- if (!is_idle(sbi))
+ if (!is_idle(sbi) && !excess_dirty_nats(sbi))
return;
/* checkpoint is the only way to shrink partial cached entries */
@@ -411,32 +440,34 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
}
}
-static int __submit_flush_wait(struct block_device *bdev)
+static int __submit_flush_wait(struct f2fs_sb_info *sbi,
+ struct block_device *bdev)
{
struct bio *bio = f2fs_bio_alloc(0);
int ret;
- bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
+ bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH;
bio->bi_bdev = bdev;
ret = submit_bio_wait(bio);
bio_put(bio);
+
+ trace_f2fs_issue_flush(bdev, test_opt(sbi, NOBARRIER),
+ test_opt(sbi, FLUSH_MERGE), ret);
return ret;
}
static int submit_flush_wait(struct f2fs_sb_info *sbi)
{
- int ret = __submit_flush_wait(sbi->sb->s_bdev);
+ int ret = __submit_flush_wait(sbi, sbi->sb->s_bdev);
int i;
- if (sbi->s_ndevs && !ret) {
- for (i = 1; i < sbi->s_ndevs; i++) {
- trace_f2fs_issue_flush(FDEV(i).bdev,
- test_opt(sbi, NOBARRIER),
- test_opt(sbi, FLUSH_MERGE));
- ret = __submit_flush_wait(FDEV(i).bdev);
- if (ret)
- break;
- }
+ if (!sbi->s_ndevs || ret)
+ return ret;
+
+ for (i = 1; i < sbi->s_ndevs; i++) {
+ ret = __submit_flush_wait(sbi, FDEV(i).bdev);
+ if (ret)
+ break;
}
return ret;
}
@@ -458,6 +489,8 @@ repeat:
fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list);
ret = submit_flush_wait(sbi);
+ atomic_inc(&fcc->issued_flush);
+
llist_for_each_entry_safe(cmd, next,
fcc->dispatch_list, llnode) {
cmd->ret = ret;
@@ -475,25 +508,29 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi)
{
struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info;
struct flush_cmd cmd;
+ int ret;
if (test_opt(sbi, NOBARRIER))
return 0;
- if (!test_opt(sbi, FLUSH_MERGE))
- return submit_flush_wait(sbi);
-
- if (!atomic_read(&fcc->submit_flush)) {
- int ret;
+ if (!test_opt(sbi, FLUSH_MERGE)) {
+ ret = submit_flush_wait(sbi);
+ atomic_inc(&fcc->issued_flush);
+ return ret;
+ }
- atomic_inc(&fcc->submit_flush);
+ if (!atomic_read(&fcc->issing_flush)) {
+ atomic_inc(&fcc->issing_flush);
ret = submit_flush_wait(sbi);
- atomic_dec(&fcc->submit_flush);
+ atomic_dec(&fcc->issing_flush);
+
+ atomic_inc(&fcc->issued_flush);
return ret;
}
init_completion(&cmd.wait);
- atomic_inc(&fcc->submit_flush);
+ atomic_inc(&fcc->issing_flush);
llist_add(&cmd.llnode, &fcc->issue_list);
if (!fcc->dispatch_list)
@@ -501,10 +538,10 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi)
if (fcc->f2fs_issue_flush) {
wait_for_completion(&cmd.wait);
- atomic_dec(&fcc->submit_flush);
+ atomic_dec(&fcc->issing_flush);
} else {
llist_del_all(&fcc->issue_list);
- atomic_set(&fcc->submit_flush, 0);
+ atomic_set(&fcc->issing_flush, 0);
}
return cmd.ret;
@@ -524,7 +561,8 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi)
fcc = kzalloc(sizeof(struct flush_cmd_control), GFP_KERNEL);
if (!fcc)
return -ENOMEM;
- atomic_set(&fcc->submit_flush, 0);
+ atomic_set(&fcc->issued_flush, 0);
+ atomic_set(&fcc->issing_flush, 0);
init_waitqueue_head(&fcc->flush_wait_queue);
init_llist_head(&fcc->issue_list);
SM_I(sbi)->fcc_info = fcc;
@@ -597,8 +635,8 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
if (test_and_clear_bit(segno, dirty_i->dirty_segmap[t]))
dirty_i->nr_dirty[t]--;
- if (get_valid_blocks(sbi, segno, sbi->segs_per_sec) == 0)
- clear_bit(GET_SECNO(sbi, segno),
+ if (get_valid_blocks(sbi, segno, true) == 0)
+ clear_bit(GET_SEC_FROM_SEG(sbi, segno),
dirty_i->victim_secmap);
}
}
@@ -618,7 +656,7 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno)
mutex_lock(&dirty_i->seglist_lock);
- valid_blocks = get_valid_blocks(sbi, segno, 0);
+ valid_blocks = get_valid_blocks(sbi, segno, false);
if (valid_blocks == 0) {
__locate_dirty_segment(sbi, segno, PRE);
@@ -633,162 +671,407 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno)
mutex_unlock(&dirty_i->seglist_lock);
}
-static void __add_discard_cmd(struct f2fs_sb_info *sbi,
- struct bio *bio, block_t lstart, block_t len)
+static struct discard_cmd *__create_discard_cmd(struct f2fs_sb_info *sbi,
+ struct block_device *bdev, block_t lstart,
+ block_t start, block_t len)
{
struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
- struct list_head *cmd_list = &(dcc->discard_cmd_list);
+ struct list_head *pend_list;
struct discard_cmd *dc;
+ f2fs_bug_on(sbi, !len);
+
+ pend_list = &dcc->pend_list[plist_idx(len)];
+
dc = f2fs_kmem_cache_alloc(discard_cmd_slab, GFP_NOFS);
INIT_LIST_HEAD(&dc->list);
- dc->bio = bio;
- bio->bi_private = dc;
+ dc->bdev = bdev;
dc->lstart = lstart;
+ dc->start = start;
dc->len = len;
+ dc->ref = 0;
dc->state = D_PREP;
+ dc->error = 0;
init_completion(&dc->wait);
+ list_add_tail(&dc->list, pend_list);
+ atomic_inc(&dcc->discard_cmd_cnt);
+ dcc->undiscard_blks += len;
- mutex_lock(&dcc->cmd_lock);
- list_add_tail(&dc->list, cmd_list);
- mutex_unlock(&dcc->cmd_lock);
+ return dc;
}
-static void __remove_discard_cmd(struct f2fs_sb_info *sbi, struct discard_cmd *dc)
+static struct discard_cmd *__attach_discard_cmd(struct f2fs_sb_info *sbi,
+ struct block_device *bdev, block_t lstart,
+ block_t start, block_t len,
+ struct rb_node *parent, struct rb_node **p)
{
- int err = dc->bio->bi_error;
+ struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+ struct discard_cmd *dc;
- if (dc->state == D_DONE)
- atomic_dec(&(SM_I(sbi)->dcc_info->submit_discard));
+ dc = __create_discard_cmd(sbi, bdev, lstart, start, len);
- if (err == -EOPNOTSUPP)
- err = 0;
+ rb_link_node(&dc->rb_node, parent, p);
+ rb_insert_color(&dc->rb_node, &dcc->root);
+
+ return dc;
+}
+
+static void __detach_discard_cmd(struct discard_cmd_control *dcc,
+ struct discard_cmd *dc)
+{
+ if (dc->state == D_DONE)
+ atomic_dec(&dcc->issing_discard);
- if (err)
- f2fs_msg(sbi->sb, KERN_INFO,
- "Issue discard failed, ret: %d", err);
- bio_put(dc->bio);
list_del(&dc->list);
+ rb_erase(&dc->rb_node, &dcc->root);
+ dcc->undiscard_blks -= dc->len;
+
kmem_cache_free(discard_cmd_slab, dc);
+
+ atomic_dec(&dcc->discard_cmd_cnt);
}
-/* This should be covered by global mutex, &sit_i->sentry_lock */
-void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr)
+static void __remove_discard_cmd(struct f2fs_sb_info *sbi,
+ struct discard_cmd *dc)
{
struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
- struct list_head *wait_list = &(dcc->discard_cmd_list);
- struct discard_cmd *dc, *tmp;
- struct blk_plug plug;
- mutex_lock(&dcc->cmd_lock);
+ if (dc->error == -EOPNOTSUPP)
+ dc->error = 0;
- blk_start_plug(&plug);
+ if (dc->error)
+ f2fs_msg(sbi->sb, KERN_INFO,
+ "Issue discard failed, ret: %d", dc->error);
+ __detach_discard_cmd(dcc, dc);
+}
- list_for_each_entry_safe(dc, tmp, wait_list, list) {
+static void f2fs_submit_discard_endio(struct bio *bio)
+{
+ struct discard_cmd *dc = (struct discard_cmd *)bio->bi_private;
- if (blkaddr == NULL_ADDR) {
- if (dc->state == D_PREP) {
- dc->state = D_SUBMIT;
- submit_bio(dc->bio);
- atomic_inc(&dcc->submit_discard);
- }
- continue;
+ dc->error = bio->bi_error;
+ dc->state = D_DONE;
+ complete(&dc->wait);
+ bio_put(bio);
+}
+
+/* this function is copied from blkdev_issue_discard from block/blk-lib.c */
+static void __submit_discard_cmd(struct f2fs_sb_info *sbi,
+ struct discard_cmd *dc)
+{
+ struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+ struct bio *bio = NULL;
+
+ if (dc->state != D_PREP)
+ return;
+
+ trace_f2fs_issue_discard(dc->bdev, dc->start, dc->len);
+
+ dc->error = __blkdev_issue_discard(dc->bdev,
+ SECTOR_FROM_BLOCK(dc->start),
+ SECTOR_FROM_BLOCK(dc->len),
+ GFP_NOFS, 0, &bio);
+ if (!dc->error) {
+ /* should keep before submission to avoid D_DONE right away */
+ dc->state = D_SUBMIT;
+ atomic_inc(&dcc->issued_discard);
+ atomic_inc(&dcc->issing_discard);
+ if (bio) {
+ bio->bi_private = dc;
+ bio->bi_end_io = f2fs_submit_discard_endio;
+ bio->bi_opf |= REQ_SYNC;
+ submit_bio(bio);
+ list_move_tail(&dc->list, &dcc->wait_list);
}
+ } else {
+ __remove_discard_cmd(sbi, dc);
+ }
+}
- if (dc->lstart <= blkaddr && blkaddr < dc->lstart + dc->len) {
- if (dc->state == D_SUBMIT)
- wait_for_completion_io(&dc->wait);
- else
- __remove_discard_cmd(sbi, dc);
+static struct discard_cmd *__insert_discard_tree(struct f2fs_sb_info *sbi,
+ struct block_device *bdev, block_t lstart,
+ block_t start, block_t len,
+ struct rb_node **insert_p,
+ struct rb_node *insert_parent)
+{
+ struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+ struct rb_node **p = &dcc->root.rb_node;
+ struct rb_node *parent = NULL;
+ struct discard_cmd *dc = NULL;
+
+ if (insert_p && insert_parent) {
+ parent = insert_parent;
+ p = insert_p;
+ goto do_insert;
+ }
+
+ p = __lookup_rb_tree_for_insert(sbi, &dcc->root, &parent, lstart);
+do_insert:
+ dc = __attach_discard_cmd(sbi, bdev, lstart, start, len, parent, p);
+ if (!dc)
+ return NULL;
+
+ return dc;
+}
+
+static void __relocate_discard_cmd(struct discard_cmd_control *dcc,
+ struct discard_cmd *dc)
+{
+ list_move_tail(&dc->list, &dcc->pend_list[plist_idx(dc->len)]);
+}
+
+static void __punch_discard_cmd(struct f2fs_sb_info *sbi,
+ struct discard_cmd *dc, block_t blkaddr)
+{
+ struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+ struct discard_info di = dc->di;
+ bool modified = false;
+
+ if (dc->state == D_DONE || dc->len == 1) {
+ __remove_discard_cmd(sbi, dc);
+ return;
+ }
+
+ dcc->undiscard_blks -= di.len;
+
+ if (blkaddr > di.lstart) {
+ dc->len = blkaddr - dc->lstart;
+ dcc->undiscard_blks += dc->len;
+ __relocate_discard_cmd(dcc, dc);
+ f2fs_bug_on(sbi, !__check_rb_tree_consistence(sbi, &dcc->root));
+ modified = true;
+ }
+
+ if (blkaddr < di.lstart + di.len - 1) {
+ if (modified) {
+ __insert_discard_tree(sbi, dc->bdev, blkaddr + 1,
+ di.start + blkaddr + 1 - di.lstart,
+ di.lstart + di.len - 1 - blkaddr,
+ NULL, NULL);
+ f2fs_bug_on(sbi,
+ !__check_rb_tree_consistence(sbi, &dcc->root));
+ } else {
+ dc->lstart++;
+ dc->len--;
+ dc->start++;
+ dcc->undiscard_blks += dc->len;
+ __relocate_discard_cmd(dcc, dc);
+ f2fs_bug_on(sbi,
+ !__check_rb_tree_consistence(sbi, &dcc->root));
}
}
- blk_finish_plug(&plug);
+}
- /* this comes from f2fs_put_super */
- if (blkaddr == NULL_ADDR) {
- list_for_each_entry_safe(dc, tmp, wait_list, list) {
- wait_for_completion_io(&dc->wait);
- __remove_discard_cmd(sbi, dc);
+static void __update_discard_tree_range(struct f2fs_sb_info *sbi,
+ struct block_device *bdev, block_t lstart,
+ block_t start, block_t len)
+{
+ struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+ struct discard_cmd *prev_dc = NULL, *next_dc = NULL;
+ struct discard_cmd *dc;
+ struct discard_info di = {0};
+ struct rb_node **insert_p = NULL, *insert_parent = NULL;
+ block_t end = lstart + len;
+
+ mutex_lock(&dcc->cmd_lock);
+
+ dc = (struct discard_cmd *)__lookup_rb_tree_ret(&dcc->root,
+ NULL, lstart,
+ (struct rb_entry **)&prev_dc,
+ (struct rb_entry **)&next_dc,
+ &insert_p, &insert_parent, true);
+ if (dc)
+ prev_dc = dc;
+
+ if (!prev_dc) {
+ di.lstart = lstart;
+ di.len = next_dc ? next_dc->lstart - lstart : len;
+ di.len = min(di.len, len);
+ di.start = start;
+ }
+
+ while (1) {
+ struct rb_node *node;
+ bool merged = false;
+ struct discard_cmd *tdc = NULL;
+
+ if (prev_dc) {
+ di.lstart = prev_dc->lstart + prev_dc->len;
+ if (di.lstart < lstart)
+ di.lstart = lstart;
+ if (di.lstart >= end)
+ break;
+
+ if (!next_dc || next_dc->lstart > end)
+ di.len = end - di.lstart;
+ else
+ di.len = next_dc->lstart - di.lstart;
+ di.start = start + di.lstart - lstart;
+ }
+
+ if (!di.len)
+ goto next;
+
+ if (prev_dc && prev_dc->state == D_PREP &&
+ prev_dc->bdev == bdev &&
+ __is_discard_back_mergeable(&di, &prev_dc->di)) {
+ prev_dc->di.len += di.len;
+ dcc->undiscard_blks += di.len;
+ __relocate_discard_cmd(dcc, prev_dc);
+ f2fs_bug_on(sbi,
+ !__check_rb_tree_consistence(sbi, &dcc->root));
+ di = prev_dc->di;
+ tdc = prev_dc;
+ merged = true;
+ }
+
+ if (next_dc && next_dc->state == D_PREP &&
+ next_dc->bdev == bdev &&
+ __is_discard_front_mergeable(&di, &next_dc->di)) {
+ next_dc->di.lstart = di.lstart;
+ next_dc->di.len += di.len;
+ next_dc->di.start = di.start;
+ dcc->undiscard_blks += di.len;
+ __relocate_discard_cmd(dcc, next_dc);
+ if (tdc)
+ __remove_discard_cmd(sbi, tdc);
+ f2fs_bug_on(sbi,
+ !__check_rb_tree_consistence(sbi, &dcc->root));
+ merged = true;
+ }
+
+ if (!merged) {
+ __insert_discard_tree(sbi, bdev, di.lstart, di.start,
+ di.len, NULL, NULL);
+ f2fs_bug_on(sbi,
+ !__check_rb_tree_consistence(sbi, &dcc->root));
}
+ next:
+ prev_dc = next_dc;
+ if (!prev_dc)
+ break;
+
+ node = rb_next(&prev_dc->rb_node);
+ next_dc = rb_entry_safe(node, struct discard_cmd, rb_node);
}
+
mutex_unlock(&dcc->cmd_lock);
}
-static void f2fs_submit_discard_endio(struct bio *bio)
+static int __queue_discard_cmd(struct f2fs_sb_info *sbi,
+ struct block_device *bdev, block_t blkstart, block_t blklen)
{
- struct discard_cmd *dc = (struct discard_cmd *)bio->bi_private;
+ block_t lblkstart = blkstart;
- complete(&dc->wait);
- dc->state = D_DONE;
+ trace_f2fs_queue_discard(bdev, blkstart, blklen);
+
+ if (sbi->s_ndevs) {
+ int devi = f2fs_target_device_index(sbi, blkstart);
+
+ blkstart -= FDEV(devi).start_blk;
+ }
+ __update_discard_tree_range(sbi, bdev, lblkstart, blkstart, blklen);
+ return 0;
}
-static int issue_discard_thread(void *data)
+static void __issue_discard_cmd(struct f2fs_sb_info *sbi, bool issue_cond)
{
- struct f2fs_sb_info *sbi = data;
struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
- wait_queue_head_t *q = &dcc->discard_wait_queue;
- struct list_head *cmd_list = &dcc->discard_cmd_list;
+ struct list_head *pend_list;
struct discard_cmd *dc, *tmp;
struct blk_plug plug;
- int iter = 0;
-repeat:
- if (kthread_should_stop())
- return 0;
+ int i, iter = 0;
+ mutex_lock(&dcc->cmd_lock);
blk_start_plug(&plug);
+ for (i = MAX_PLIST_NUM - 1; i >= 0; i--) {
+ pend_list = &dcc->pend_list[i];
+ list_for_each_entry_safe(dc, tmp, pend_list, list) {
+ f2fs_bug_on(sbi, dc->state != D_PREP);
+
+ if (!issue_cond || is_idle(sbi))
+ __submit_discard_cmd(sbi, dc);
+ if (issue_cond && iter++ > DISCARD_ISSUE_RATE)
+ goto out;
+ }
+ }
+out:
+ blk_finish_plug(&plug);
+ mutex_unlock(&dcc->cmd_lock);
+}
+
+static void __wait_discard_cmd(struct f2fs_sb_info *sbi, bool wait_cond)
+{
+ struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+ struct list_head *wait_list = &(dcc->wait_list);
+ struct discard_cmd *dc, *tmp;
mutex_lock(&dcc->cmd_lock);
- list_for_each_entry_safe(dc, tmp, cmd_list, list) {
- if (dc->state == D_PREP) {
- dc->state = D_SUBMIT;
- submit_bio(dc->bio);
- atomic_inc(&dcc->submit_discard);
- if (iter++ > DISCARD_ISSUE_RATE)
- break;
- } else if (dc->state == D_DONE) {
+ list_for_each_entry_safe(dc, tmp, wait_list, list) {
+ if (!wait_cond || dc->state == D_DONE) {
+ if (dc->ref)
+ continue;
+ wait_for_completion_io(&dc->wait);
__remove_discard_cmd(sbi, dc);
}
}
mutex_unlock(&dcc->cmd_lock);
+}
- blk_finish_plug(&plug);
+/* This should be covered by global mutex, &sit_i->sentry_lock */
+void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr)
+{
+ struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+ struct discard_cmd *dc;
+ bool need_wait = false;
- iter = 0;
- congestion_wait(BLK_RW_SYNC, HZ/50);
+ mutex_lock(&dcc->cmd_lock);
+ dc = (struct discard_cmd *)__lookup_rb_tree(&dcc->root, NULL, blkaddr);
+ if (dc) {
+ if (dc->state == D_PREP) {
+ __punch_discard_cmd(sbi, dc, blkaddr);
+ } else {
+ dc->ref++;
+ need_wait = true;
+ }
+ }
+ mutex_unlock(&dcc->cmd_lock);
- wait_event_interruptible(*q,
- kthread_should_stop() || !list_empty(&dcc->discard_cmd_list));
- goto repeat;
+ if (need_wait) {
+ wait_for_completion_io(&dc->wait);
+ mutex_lock(&dcc->cmd_lock);
+ f2fs_bug_on(sbi, dc->state != D_DONE);
+ dc->ref--;
+ if (!dc->ref)
+ __remove_discard_cmd(sbi, dc);
+ mutex_unlock(&dcc->cmd_lock);
+ }
}
-
-/* this function is copied from blkdev_issue_discard from block/blk-lib.c */
-static int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi,
- struct block_device *bdev, block_t blkstart, block_t blklen)
+/* This comes from f2fs_put_super */
+void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi)
{
- struct bio *bio = NULL;
- block_t lblkstart = blkstart;
- int err;
+ __issue_discard_cmd(sbi, false);
+ __wait_discard_cmd(sbi, false);
+}
- trace_f2fs_issue_discard(bdev, blkstart, blklen);
+static int issue_discard_thread(void *data)
+{
+ struct f2fs_sb_info *sbi = data;
+ struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+ wait_queue_head_t *q = &dcc->discard_wait_queue;
+repeat:
+ if (kthread_should_stop())
+ return 0;
- if (sbi->s_ndevs) {
- int devi = f2fs_target_device_index(sbi, blkstart);
+ __issue_discard_cmd(sbi, true);
+ __wait_discard_cmd(sbi, true);
- blkstart -= FDEV(devi).start_blk;
- }
- err = __blkdev_issue_discard(bdev,
- SECTOR_FROM_BLOCK(blkstart),
- SECTOR_FROM_BLOCK(blklen),
- GFP_NOFS, 0, &bio);
- if (!err && bio) {
- bio->bi_end_io = f2fs_submit_discard_endio;
- bio->bi_opf |= REQ_SYNC;
+ congestion_wait(BLK_RW_SYNC, HZ/50);
- __add_discard_cmd(sbi, bio, lblkstart, blklen);
- wake_up(&SM_I(sbi)->dcc_info->discard_wait_queue);
- }
- return err;
+ wait_event_interruptible(*q, kthread_should_stop() ||
+ atomic_read(&dcc->discard_cmd_cnt));
+ goto repeat;
}
#ifdef CONFIG_BLK_DEV_ZONED
@@ -796,6 +1079,7 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi,
struct block_device *bdev, block_t blkstart, block_t blklen)
{
sector_t sector, nr_sects;
+ block_t lblkstart = blkstart;
int devi = 0;
if (sbi->s_ndevs) {
@@ -813,7 +1097,7 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi,
case BLK_ZONE_TYPE_CONVENTIONAL:
if (!blk_queue_discard(bdev_get_queue(bdev)))
return 0;
- return __f2fs_issue_discard_async(sbi, bdev, blkstart, blklen);
+ return __queue_discard_cmd(sbi, bdev, lblkstart, blklen);
case BLK_ZONE_TYPE_SEQWRITE_REQ:
case BLK_ZONE_TYPE_SEQWRITE_PREF:
sector = SECTOR_FROM_BLOCK(blkstart);
@@ -845,7 +1129,7 @@ static int __issue_discard_async(struct f2fs_sb_info *sbi,
bdev_zoned_model(bdev) != BLK_ZONED_NONE)
return __f2fs_issue_discard_zone(sbi, bdev, blkstart, blklen);
#endif
- return __f2fs_issue_discard_async(sbi, bdev, blkstart, blklen);
+ return __queue_discard_cmd(sbi, bdev, blkstart, blklen);
}
static int f2fs_issue_discard(struct f2fs_sb_info *sbi,
@@ -888,32 +1172,6 @@ static int f2fs_issue_discard(struct f2fs_sb_info *sbi,
return err;
}
-static void __add_discard_entry(struct f2fs_sb_info *sbi,
- struct cp_control *cpc, struct seg_entry *se,
- unsigned int start, unsigned int end)
-{
- struct list_head *head = &SM_I(sbi)->dcc_info->discard_entry_list;
- struct discard_entry *new, *last;
-
- if (!list_empty(head)) {
- last = list_last_entry(head, struct discard_entry, list);
- if (START_BLOCK(sbi, cpc->trim_start) + start ==
- last->blkaddr + last->len &&
- last->len < MAX_DISCARD_BLOCKS(sbi)) {
- last->len += end - start;
- goto done;
- }
- }
-
- new = f2fs_kmem_cache_alloc(discard_entry_slab, GFP_NOFS);
- INIT_LIST_HEAD(&new->list);
- new->blkaddr = START_BLOCK(sbi, cpc->trim_start) + start;
- new->len = end - start;
- list_add_tail(&new->list, head);
-done:
- SM_I(sbi)->dcc_info->nr_discards += end - start;
-}
-
static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc,
bool check_only)
{
@@ -925,7 +1183,9 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc,
unsigned long *discard_map = (unsigned long *)se->discard_map;
unsigned long *dmap = SIT_I(sbi)->tmp_map;
unsigned int start = 0, end = -1;
- bool force = (cpc->reason == CP_DISCARD);
+ bool force = (cpc->reason & CP_DISCARD);
+ struct discard_entry *de = NULL;
+ struct list_head *head = &SM_I(sbi)->dcc_info->entry_list;
int i;
if (se->valid_blocks == max_blocks || !f2fs_discard_en(sbi))
@@ -957,14 +1217,24 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc,
if (check_only)
return true;
- __add_discard_entry(sbi, cpc, se, start, end);
+ if (!de) {
+ de = f2fs_kmem_cache_alloc(discard_entry_slab,
+ GFP_F2FS_ZERO);
+ de->start_blkaddr = START_BLOCK(sbi, cpc->trim_start);
+ list_add_tail(&de->list, head);
+ }
+
+ for (i = start; i < end; i++)
+ __set_bit_le(i, (void *)de->discard_map);
+
+ SM_I(sbi)->dcc_info->nr_discards += end - start;
}
return false;
}
void release_discard_addrs(struct f2fs_sb_info *sbi)
{
- struct list_head *head = &(SM_I(sbi)->dcc_info->discard_entry_list);
+ struct list_head *head = &(SM_I(sbi)->dcc_info->entry_list);
struct discard_entry *entry, *this;
/* drop caches */
@@ -990,13 +1260,13 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
{
- struct list_head *head = &(SM_I(sbi)->dcc_info->discard_entry_list);
+ struct list_head *head = &(SM_I(sbi)->dcc_info->entry_list);
struct discard_entry *entry, *this;
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
unsigned long *prefree_map = dirty_i->dirty_segmap[PRE];
unsigned int start = 0, end = -1;
unsigned int secno, start_segno;
- bool force = (cpc->reason == CP_DISCARD);
+ bool force = (cpc->reason & CP_DISCARD);
mutex_lock(&dirty_i->seglist_lock);
@@ -1026,10 +1296,10 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
continue;
}
next:
- secno = GET_SECNO(sbi, start);
- start_segno = secno * sbi->segs_per_sec;
+ secno = GET_SEC_FROM_SEG(sbi, start);
+ start_segno = GET_SEG_FROM_SEC(sbi, secno);
if (!IS_CURSEC(sbi, secno) &&
- !get_valid_blocks(sbi, start, sbi->segs_per_sec))
+ !get_valid_blocks(sbi, start, true))
f2fs_issue_discard(sbi, START_BLOCK(sbi, start_segno),
sbi->segs_per_sec << sbi->log_blocks_per_seg);
@@ -1043,22 +1313,46 @@ next:
/* send small discards */
list_for_each_entry_safe(entry, this, head, list) {
- if (force && entry->len < cpc->trim_minlen)
- goto skip;
- f2fs_issue_discard(sbi, entry->blkaddr, entry->len);
- cpc->trimmed += entry->len;
+ unsigned int cur_pos = 0, next_pos, len, total_len = 0;
+ bool is_valid = test_bit_le(0, entry->discard_map);
+
+find_next:
+ if (is_valid) {
+ next_pos = find_next_zero_bit_le(entry->discard_map,
+ sbi->blocks_per_seg, cur_pos);
+ len = next_pos - cur_pos;
+
+ if (force && len < cpc->trim_minlen)
+ goto skip;
+
+ f2fs_issue_discard(sbi, entry->start_blkaddr + cur_pos,
+ len);
+ cpc->trimmed += len;
+ total_len += len;
+ } else {
+ next_pos = find_next_bit_le(entry->discard_map,
+ sbi->blocks_per_seg, cur_pos);
+ }
skip:
+ cur_pos = next_pos;
+ is_valid = !is_valid;
+
+ if (cur_pos < sbi->blocks_per_seg)
+ goto find_next;
+
list_del(&entry->list);
- SM_I(sbi)->dcc_info->nr_discards -= entry->len;
+ SM_I(sbi)->dcc_info->nr_discards -= total_len;
kmem_cache_free(discard_entry_slab, entry);
}
+
+ wake_up(&SM_I(sbi)->dcc_info->discard_wait_queue);
}
static int create_discard_cmd_control(struct f2fs_sb_info *sbi)
{
dev_t dev = sbi->sb->s_bdev->bd_dev;
struct discard_cmd_control *dcc;
- int err = 0;
+ int err = 0, i;
if (SM_I(sbi)->dcc_info) {
dcc = SM_I(sbi)->dcc_info;
@@ -1069,12 +1363,18 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi)
if (!dcc)
return -ENOMEM;
- INIT_LIST_HEAD(&dcc->discard_entry_list);
- INIT_LIST_HEAD(&dcc->discard_cmd_list);
+ INIT_LIST_HEAD(&dcc->entry_list);
+ for (i = 0; i < MAX_PLIST_NUM; i++)
+ INIT_LIST_HEAD(&dcc->pend_list[i]);
+ INIT_LIST_HEAD(&dcc->wait_list);
mutex_init(&dcc->cmd_lock);
- atomic_set(&dcc->submit_discard, 0);
+ atomic_set(&dcc->issued_discard, 0);
+ atomic_set(&dcc->issing_discard, 0);
+ atomic_set(&dcc->discard_cmd_cnt, 0);
dcc->nr_discards = 0;
- dcc->max_discards = 0;
+ dcc->max_discards = MAIN_SEGS(sbi) << sbi->log_blocks_per_seg;
+ dcc->undiscard_blks = 0;
+ dcc->root = RB_ROOT;
init_waitqueue_head(&dcc->discard_wait_queue);
SM_I(sbi)->dcc_info = dcc;
@@ -1091,20 +1391,22 @@ init_thread:
return err;
}
-static void destroy_discard_cmd_control(struct f2fs_sb_info *sbi, bool free)
+static void destroy_discard_cmd_control(struct f2fs_sb_info *sbi)
{
struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
- if (dcc && dcc->f2fs_issue_discard) {
+ if (!dcc)
+ return;
+
+ if (dcc->f2fs_issue_discard) {
struct task_struct *discard_thread = dcc->f2fs_issue_discard;
dcc->f2fs_issue_discard = NULL;
kthread_stop(discard_thread);
}
- if (free) {
- kfree(dcc);
- SM_I(sbi)->dcc_info = NULL;
- }
+
+ kfree(dcc);
+ SM_I(sbi)->dcc_info = NULL;
}
static bool __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno)
@@ -1345,6 +1647,17 @@ static void write_current_sum_page(struct f2fs_sb_info *sbi,
f2fs_put_page(page, 1);
}
+static int is_next_segment_free(struct f2fs_sb_info *sbi, int type)
+{
+ struct curseg_info *curseg = CURSEG_I(sbi, type);
+ unsigned int segno = curseg->segno + 1;
+ struct free_segmap_info *free_i = FREE_I(sbi);
+
+ if (segno < MAIN_SEGS(sbi) && segno % sbi->segs_per_sec)
+ return !test_bit(segno, free_i->free_segmap);
+ return 0;
+}
+
/*
* Find a new segment from the free segments bitmap to right order
* This function should be returned with success, otherwise BUG
@@ -1355,8 +1668,8 @@ static void get_new_segment(struct f2fs_sb_info *sbi,
struct free_segmap_info *free_i = FREE_I(sbi);
unsigned int segno, secno, zoneno;
unsigned int total_zones = MAIN_SECS(sbi) / sbi->secs_per_zone;
- unsigned int hint = *newseg / sbi->segs_per_sec;
- unsigned int old_zoneno = GET_ZONENO_FROM_SEGNO(sbi, *newseg);
+ unsigned int hint = GET_SEC_FROM_SEG(sbi, *newseg);
+ unsigned int old_zoneno = GET_ZONE_FROM_SEG(sbi, *newseg);
unsigned int left_start = hint;
bool init = true;
int go_left = 0;
@@ -1366,8 +1679,8 @@ static void get_new_segment(struct f2fs_sb_info *sbi,
if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) {
segno = find_next_zero_bit(free_i->free_segmap,
- (hint + 1) * sbi->segs_per_sec, *newseg + 1);
- if (segno < (hint + 1) * sbi->segs_per_sec)
+ GET_SEG_FROM_SEC(sbi, hint + 1), *newseg + 1);
+ if (segno < GET_SEG_FROM_SEC(sbi, hint + 1))
goto got_it;
}
find_other_zone:
@@ -1398,8 +1711,8 @@ find_other_zone:
secno = left_start;
skip_left:
hint = secno;
- segno = secno * sbi->segs_per_sec;
- zoneno = secno / sbi->secs_per_zone;
+ segno = GET_SEG_FROM_SEC(sbi, secno);
+ zoneno = GET_ZONE_FROM_SEC(sbi, secno);
/* give up on finding another zone */
if (!init)
@@ -1443,7 +1756,7 @@ static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified)
struct summary_footer *sum_footer;
curseg->segno = curseg->next_segno;
- curseg->zone = GET_ZONENO_FROM_SEGNO(sbi, curseg->segno);
+ curseg->zone = GET_ZONE_FROM_SEG(sbi, curseg->segno);
curseg->next_blkoff = 0;
curseg->next_segno = NULL_SEGNO;
@@ -1456,6 +1769,20 @@ static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified)
__set_sit_entry_type(sbi, type, curseg->segno, modified);
}
+static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type)
+{
+ /* if segs_per_sec is large than 1, we need to keep original policy. */
+ if (sbi->segs_per_sec != 1)
+ return CURSEG_I(sbi, type)->segno;
+
+ if (type == CURSEG_HOT_DATA || IS_NODESEG(type))
+ return 0;
+
+ if (SIT_I(sbi)->last_victim[ALLOC_NEXT])
+ return SIT_I(sbi)->last_victim[ALLOC_NEXT];
+ return CURSEG_I(sbi, type)->segno;
+}
+
/*
* Allocate a current working segment.
* This function always allocates a free segment in LFS manner.
@@ -1474,6 +1801,7 @@ static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec)
if (test_opt(sbi, NOHEAP))
dir = ALLOC_RIGHT;
+ segno = __get_next_segno(sbi, type);
get_new_segment(sbi, &segno, new_sec, dir);
curseg->next_segno = segno;
reset_curseg(sbi, type, 1);
@@ -1549,12 +1877,15 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type)
{
struct curseg_info *curseg = CURSEG_I(sbi, type);
const struct victim_selection *v_ops = DIRTY_I(sbi)->v_ops;
+ unsigned segno = NULL_SEGNO;
int i, cnt;
bool reversed = false;
/* need_SSR() already forces to do this */
- if (v_ops->get_victim(sbi, &(curseg)->next_segno, BG_GC, type, SSR))
+ if (v_ops->get_victim(sbi, &segno, BG_GC, type, SSR)) {
+ curseg->next_segno = segno;
return 1;
+ }
/* For node segments, let's do SSR more intensively */
if (IS_NODESEG(type)) {
@@ -1578,9 +1909,10 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type)
for (; cnt-- > 0; reversed ? i-- : i++) {
if (i == type)
continue;
- if (v_ops->get_victim(sbi, &(curseg)->next_segno,
- BG_GC, i, SSR))
+ if (v_ops->get_victim(sbi, &segno, BG_GC, i, SSR)) {
+ curseg->next_segno = segno;
return 1;
+ }
}
return 0;
}
@@ -1592,17 +1924,21 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type)
static void allocate_segment_by_default(struct f2fs_sb_info *sbi,
int type, bool force)
{
+ struct curseg_info *curseg = CURSEG_I(sbi, type);
+
if (force)
new_curseg(sbi, type, true);
else if (!is_set_ckpt_flags(sbi, CP_CRC_RECOVERY_FLAG) &&
type == CURSEG_WARM_NODE)
new_curseg(sbi, type, false);
+ else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type))
+ new_curseg(sbi, type, false);
else if (need_SSR(sbi) && get_ssr_segment(sbi, type))
change_curseg(sbi, type, true);
else
new_curseg(sbi, type, false);
- stat_inc_seg_type(sbi, CURSEG_I(sbi, type));
+ stat_inc_seg_type(sbi, curseg);
}
void allocate_new_segments(struct f2fs_sb_info *sbi)
@@ -1734,18 +2070,16 @@ static int __get_segment_type_6(struct page *page, enum page_type p_type)
if (p_type == DATA) {
struct inode *inode = page->mapping->host;
- if (S_ISDIR(inode->i_mode))
- return CURSEG_HOT_DATA;
- else if (is_cold_data(page) || file_is_cold(inode))
+ if (is_cold_data(page) || file_is_cold(inode))
return CURSEG_COLD_DATA;
- else
- return CURSEG_WARM_DATA;
+ if (is_inode_flag_set(inode, FI_HOT_DATA))
+ return CURSEG_HOT_DATA;
+ return CURSEG_WARM_DATA;
} else {
if (IS_DNODE(page))
return is_cold_node(page) ? CURSEG_WARM_NODE :
CURSEG_HOT_NODE;
- else
- return CURSEG_COLD_NODE;
+ return CURSEG_COLD_NODE;
}
}
@@ -1788,15 +2122,14 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
stat_inc_block_count(sbi, curseg);
+ if (!__has_curseg_space(sbi, type))
+ sit_i->s_ops->allocate_segment(sbi, type, false);
/*
- * SIT information should be updated before segment allocation,
- * since SSR needs latest valid block information.
+ * SIT information should be updated after segment allocation,
+ * since we need to keep dirty segments precisely under SSR.
*/
refresh_sit_entry(sbi, old_blkaddr, *new_blkaddr);
- if (!__has_curseg_space(sbi, type))
- sit_i->s_ops->allocate_segment(sbi, type, false);
-
mutex_unlock(&sit_i->sentry_lock);
if (page && IS_NODESEG(type))
@@ -1868,11 +2201,11 @@ void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio)
f2fs_update_data_blkaddr(dn, fio->new_blkaddr);
}
-void rewrite_data_page(struct f2fs_io_info *fio)
+int rewrite_data_page(struct f2fs_io_info *fio)
{
fio->new_blkaddr = fio->old_blkaddr;
stat_inc_inplace_blocks(fio->sbi);
- f2fs_submit_page_mbio(fio);
+ return f2fs_submit_page_bio(fio);
}
void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
@@ -2437,7 +2770,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
se = get_seg_entry(sbi, segno);
/* add discard candidates */
- if (cpc->reason != CP_DISCARD) {
+ if (!(cpc->reason & CP_DISCARD)) {
cpc->trim_start = segno;
add_discard_addrs(sbi, cpc, false);
}
@@ -2473,7 +2806,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
f2fs_bug_on(sbi, !list_empty(head));
f2fs_bug_on(sbi, sit_i->dirty_sentries);
out:
- if (cpc->reason == CP_DISCARD) {
+ if (cpc->reason & CP_DISCARD) {
__u64 trim_start = cpc->trim_start;
for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++)
@@ -2501,13 +2834,13 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
SM_I(sbi)->sit_info = sit_i;
- sit_i->sentries = f2fs_kvzalloc(MAIN_SEGS(sbi) *
+ sit_i->sentries = kvzalloc(MAIN_SEGS(sbi) *
sizeof(struct seg_entry), GFP_KERNEL);
if (!sit_i->sentries)
return -ENOMEM;
bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi));
- sit_i->dirty_sentries_bitmap = f2fs_kvzalloc(bitmap_size, GFP_KERNEL);
+ sit_i->dirty_sentries_bitmap = kvzalloc(bitmap_size, GFP_KERNEL);
if (!sit_i->dirty_sentries_bitmap)
return -ENOMEM;
@@ -2540,7 +2873,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
return -ENOMEM;
if (sbi->segs_per_sec > 1) {
- sit_i->sec_entries = f2fs_kvzalloc(MAIN_SECS(sbi) *
+ sit_i->sec_entries = kvzalloc(MAIN_SECS(sbi) *
sizeof(struct sec_entry), GFP_KERNEL);
if (!sit_i->sec_entries)
return -ENOMEM;
@@ -2573,7 +2906,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
sit_i->dirty_sentries = 0;
sit_i->sents_per_block = SIT_ENTRY_PER_BLOCK;
sit_i->elapsed_time = le64_to_cpu(sbi->ckpt->elapsed_time);
- sit_i->mounted_time = CURRENT_TIME_SEC.tv_sec;
+ sit_i->mounted_time = ktime_get_real_seconds();
mutex_init(&sit_i->sentry_lock);
return 0;
}
@@ -2591,12 +2924,12 @@ static int build_free_segmap(struct f2fs_sb_info *sbi)
SM_I(sbi)->free_info = free_i;
bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi));
- free_i->free_segmap = f2fs_kvmalloc(bitmap_size, GFP_KERNEL);
+ free_i->free_segmap = kvmalloc(bitmap_size, GFP_KERNEL);
if (!free_i->free_segmap)
return -ENOMEM;
sec_bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi));
- free_i->free_secmap = f2fs_kvmalloc(sec_bitmap_size, GFP_KERNEL);
+ free_i->free_secmap = kvmalloc(sec_bitmap_size, GFP_KERNEL);
if (!free_i->free_secmap)
return -ENOMEM;
@@ -2672,10 +3005,17 @@ static void build_sit_entries(struct f2fs_sb_info *sbi)
/* build discard map only one time */
if (f2fs_discard_en(sbi)) {
- memcpy(se->discard_map, se->cur_valid_map,
- SIT_VBLOCK_MAP_SIZE);
- sbi->discard_blks += sbi->blocks_per_seg -
- se->valid_blocks;
+ if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) {
+ memset(se->discard_map, 0xff,
+ SIT_VBLOCK_MAP_SIZE);
+ } else {
+ memcpy(se->discard_map,
+ se->cur_valid_map,
+ SIT_VBLOCK_MAP_SIZE);
+ sbi->discard_blks +=
+ sbi->blocks_per_seg -
+ se->valid_blocks;
+ }
}
if (sbi->segs_per_sec > 1)
@@ -2699,10 +3039,15 @@ static void build_sit_entries(struct f2fs_sb_info *sbi)
seg_info_from_raw_sit(se, &sit);
if (f2fs_discard_en(sbi)) {
- memcpy(se->discard_map, se->cur_valid_map,
- SIT_VBLOCK_MAP_SIZE);
- sbi->discard_blks += old_valid_blocks -
- se->valid_blocks;
+ if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) {
+ memset(se->discard_map, 0xff,
+ SIT_VBLOCK_MAP_SIZE);
+ } else {
+ memcpy(se->discard_map, se->cur_valid_map,
+ SIT_VBLOCK_MAP_SIZE);
+ sbi->discard_blks += old_valid_blocks -
+ se->valid_blocks;
+ }
}
if (sbi->segs_per_sec > 1)
@@ -2746,7 +3091,7 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi)
if (segno >= MAIN_SEGS(sbi))
break;
offset = segno + 1;
- valid_blocks = get_valid_blocks(sbi, segno, 0);
+ valid_blocks = get_valid_blocks(sbi, segno, false);
if (valid_blocks == sbi->blocks_per_seg || !valid_blocks)
continue;
if (valid_blocks > sbi->blocks_per_seg) {
@@ -2764,7 +3109,7 @@ static int init_victim_secmap(struct f2fs_sb_info *sbi)
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
unsigned int bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi));
- dirty_i->victim_secmap = f2fs_kvzalloc(bitmap_size, GFP_KERNEL);
+ dirty_i->victim_secmap = kvzalloc(bitmap_size, GFP_KERNEL);
if (!dirty_i->victim_secmap)
return -ENOMEM;
return 0;
@@ -2786,7 +3131,7 @@ static int build_dirty_segmap(struct f2fs_sb_info *sbi)
bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi));
for (i = 0; i < NR_DIRTY_TYPE; i++) {
- dirty_i->dirty_segmap[i] = f2fs_kvzalloc(bitmap_size, GFP_KERNEL);
+ dirty_i->dirty_segmap[i] = kvzalloc(bitmap_size, GFP_KERNEL);
if (!dirty_i->dirty_segmap[i])
return -ENOMEM;
}
@@ -2852,6 +3197,7 @@ int build_segment_manager(struct f2fs_sb_info *sbi)
sm_info->ipu_policy = 1 << F2FS_IPU_FSYNC;
sm_info->min_ipu_util = DEF_MIN_IPU_UTIL;
sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS;
+ sm_info->min_hot_blocks = DEF_MIN_HOT_BLOCKS;
sm_info->trim_sections = DEF_BATCHED_TRIM_SECTIONS;
@@ -2988,7 +3334,7 @@ void destroy_segment_manager(struct f2fs_sb_info *sbi)
if (!sm_info)
return;
destroy_flush_cmd_control(sbi, true);
- destroy_discard_cmd_control(sbi, true);
+ destroy_discard_cmd_control(sbi);
destroy_dirty_segmap(sbi);
destroy_curseg(sbi);
destroy_free_segmap(sbi);
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 5e8ad4280a50..010f336a7573 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -21,78 +21,84 @@
#define F2FS_MIN_SEGMENTS 9 /* SB + 2 (CP + SIT + NAT) + SSA + MAIN */
/* L: Logical segment # in volume, R: Relative segment # in main area */
-#define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno)
-#define GET_R2L_SEGNO(free_i, segno) (segno + free_i->start_segno)
+#define GET_L2R_SEGNO(free_i, segno) ((segno) - (free_i)->start_segno)
+#define GET_R2L_SEGNO(free_i, segno) ((segno) + (free_i)->start_segno)
-#define IS_DATASEG(t) (t <= CURSEG_COLD_DATA)
-#define IS_NODESEG(t) (t >= CURSEG_HOT_NODE)
+#define IS_DATASEG(t) ((t) <= CURSEG_COLD_DATA)
+#define IS_NODESEG(t) ((t) >= CURSEG_HOT_NODE)
#define IS_CURSEG(sbi, seg) \
- ((seg == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \
- (seg == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) || \
- (seg == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) || \
- (seg == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) || \
- (seg == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) || \
- (seg == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno))
+ (((seg) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \
+ ((seg) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) || \
+ ((seg) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) || \
+ ((seg) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) || \
+ ((seg) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) || \
+ ((seg) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno))
#define IS_CURSEC(sbi, secno) \
- ((secno == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno / \
- sbi->segs_per_sec) || \
- (secno == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno / \
- sbi->segs_per_sec) || \
- (secno == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno / \
- sbi->segs_per_sec) || \
- (secno == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno / \
- sbi->segs_per_sec) || \
- (secno == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno / \
- sbi->segs_per_sec) || \
- (secno == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \
- sbi->segs_per_sec)) \
+ (((secno) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno / \
+ (sbi)->segs_per_sec) || \
+ ((secno) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno / \
+ (sbi)->segs_per_sec) || \
+ ((secno) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno / \
+ (sbi)->segs_per_sec) || \
+ ((secno) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno / \
+ (sbi)->segs_per_sec) || \
+ ((secno) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno / \
+ (sbi)->segs_per_sec) || \
+ ((secno) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \
+ (sbi)->segs_per_sec)) \
#define MAIN_BLKADDR(sbi) (SM_I(sbi)->main_blkaddr)
#define SEG0_BLKADDR(sbi) (SM_I(sbi)->seg0_blkaddr)
#define MAIN_SEGS(sbi) (SM_I(sbi)->main_segments)
-#define MAIN_SECS(sbi) (sbi->total_sections)
+#define MAIN_SECS(sbi) ((sbi)->total_sections)
#define TOTAL_SEGS(sbi) (SM_I(sbi)->segment_count)
-#define TOTAL_BLKS(sbi) (TOTAL_SEGS(sbi) << sbi->log_blocks_per_seg)
+#define TOTAL_BLKS(sbi) (TOTAL_SEGS(sbi) << (sbi)->log_blocks_per_seg)
#define MAX_BLKADDR(sbi) (SEG0_BLKADDR(sbi) + TOTAL_BLKS(sbi))
-#define SEGMENT_SIZE(sbi) (1ULL << (sbi->log_blocksize + \
- sbi->log_blocks_per_seg))
+#define SEGMENT_SIZE(sbi) (1ULL << ((sbi)->log_blocksize + \
+ (sbi)->log_blocks_per_seg))
#define START_BLOCK(sbi, segno) (SEG0_BLKADDR(sbi) + \
- (GET_R2L_SEGNO(FREE_I(sbi), segno) << sbi->log_blocks_per_seg))
+ (GET_R2L_SEGNO(FREE_I(sbi), segno) << (sbi)->log_blocks_per_seg))
#define NEXT_FREE_BLKADDR(sbi, curseg) \
- (START_BLOCK(sbi, curseg->segno) + curseg->next_blkoff)
+ (START_BLOCK(sbi, (curseg)->segno) + (curseg)->next_blkoff)
#define GET_SEGOFF_FROM_SEG0(sbi, blk_addr) ((blk_addr) - SEG0_BLKADDR(sbi))
#define GET_SEGNO_FROM_SEG0(sbi, blk_addr) \
- (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> sbi->log_blocks_per_seg)
+ (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> (sbi)->log_blocks_per_seg)
#define GET_BLKOFF_FROM_SEG0(sbi, blk_addr) \
- (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & (sbi->blocks_per_seg - 1))
+ (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & ((sbi)->blocks_per_seg - 1))
#define GET_SEGNO(sbi, blk_addr) \
- (((blk_addr == NULL_ADDR) || (blk_addr == NEW_ADDR)) ? \
+ ((((blk_addr) == NULL_ADDR) || ((blk_addr) == NEW_ADDR)) ? \
NULL_SEGNO : GET_L2R_SEGNO(FREE_I(sbi), \
GET_SEGNO_FROM_SEG0(sbi, blk_addr)))
-#define GET_SECNO(sbi, segno) \
- ((segno) / sbi->segs_per_sec)
-#define GET_ZONENO_FROM_SEGNO(sbi, segno) \
- ((segno / sbi->segs_per_sec) / sbi->secs_per_zone)
+#define BLKS_PER_SEC(sbi) \
+ ((sbi)->segs_per_sec * (sbi)->blocks_per_seg)
+#define GET_SEC_FROM_SEG(sbi, segno) \
+ ((segno) / (sbi)->segs_per_sec)
+#define GET_SEG_FROM_SEC(sbi, secno) \
+ ((secno) * (sbi)->segs_per_sec)
+#define GET_ZONE_FROM_SEC(sbi, secno) \
+ ((secno) / (sbi)->secs_per_zone)
+#define GET_ZONE_FROM_SEG(sbi, segno) \
+ GET_ZONE_FROM_SEC(sbi, GET_SEC_FROM_SEG(sbi, segno))
#define GET_SUM_BLOCK(sbi, segno) \
- ((sbi->sm_info->ssa_blkaddr) + segno)
+ ((sbi)->sm_info->ssa_blkaddr + (segno))
#define GET_SUM_TYPE(footer) ((footer)->entry_type)
-#define SET_SUM_TYPE(footer, type) ((footer)->entry_type = type)
+#define SET_SUM_TYPE(footer, type) ((footer)->entry_type = (type))
#define SIT_ENTRY_OFFSET(sit_i, segno) \
- (segno % sit_i->sents_per_block)
+ ((segno) % (sit_i)->sents_per_block)
#define SIT_BLOCK_OFFSET(segno) \
- (segno / SIT_ENTRY_PER_BLOCK)
+ ((segno) / SIT_ENTRY_PER_BLOCK)
#define START_SEGNO(segno) \
(SIT_BLOCK_OFFSET(segno) * SIT_ENTRY_PER_BLOCK)
#define SIT_BLK_CNT(sbi) \
@@ -103,7 +109,7 @@
#define SECTOR_FROM_BLOCK(blk_addr) \
(((sector_t)blk_addr) << F2FS_LOG_SECTORS_PER_BLOCK)
#define SECTOR_TO_BLOCK(sectors) \
- (sectors >> F2FS_LOG_SECTORS_PER_BLOCK)
+ ((sectors) >> F2FS_LOG_SECTORS_PER_BLOCK)
/*
* indicate a block allocation direction: RIGHT and LEFT.
@@ -132,7 +138,10 @@ enum {
*/
enum {
GC_CB = 0,
- GC_GREEDY
+ GC_GREEDY,
+ ALLOC_NEXT,
+ FLUSH_DEVICE,
+ MAX_GC_POLICY,
};
/*
@@ -227,6 +236,8 @@ struct sit_info {
unsigned long long mounted_time; /* mount time */
unsigned long long min_mtime; /* min. modification time */
unsigned long long max_mtime; /* max. modification time */
+
+ unsigned int last_victim[MAX_GC_POLICY]; /* last victim segment # */
};
struct free_segmap_info {
@@ -303,17 +314,17 @@ static inline struct sec_entry *get_sec_entry(struct f2fs_sb_info *sbi,
unsigned int segno)
{
struct sit_info *sit_i = SIT_I(sbi);
- return &sit_i->sec_entries[GET_SECNO(sbi, segno)];
+ return &sit_i->sec_entries[GET_SEC_FROM_SEG(sbi, segno)];
}
static inline unsigned int get_valid_blocks(struct f2fs_sb_info *sbi,
- unsigned int segno, int section)
+ unsigned int segno, bool use_section)
{
/*
* In order to get # of valid blocks in a section instantly from many
* segments, f2fs manages two counting structures separately.
*/
- if (section > 1)
+ if (use_section && sbi->segs_per_sec > 1)
return get_sec_entry(sbi, segno)->valid_blocks;
else
return get_seg_entry(sbi, segno)->valid_blocks;
@@ -358,8 +369,8 @@ static inline unsigned int find_next_inuse(struct free_segmap_info *free_i,
static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno)
{
struct free_segmap_info *free_i = FREE_I(sbi);
- unsigned int secno = segno / sbi->segs_per_sec;
- unsigned int start_segno = secno * sbi->segs_per_sec;
+ unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);
+ unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno);
unsigned int next;
spin_lock(&free_i->segmap_lock);
@@ -379,7 +390,8 @@ static inline void __set_inuse(struct f2fs_sb_info *sbi,
unsigned int segno)
{
struct free_segmap_info *free_i = FREE_I(sbi);
- unsigned int secno = segno / sbi->segs_per_sec;
+ unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);
+
set_bit(segno, free_i->free_segmap);
free_i->free_segments--;
if (!test_and_set_bit(secno, free_i->free_secmap))
@@ -390,8 +402,8 @@ static inline void __set_test_and_free(struct f2fs_sb_info *sbi,
unsigned int segno)
{
struct free_segmap_info *free_i = FREE_I(sbi);
- unsigned int secno = segno / sbi->segs_per_sec;
- unsigned int start_segno = secno * sbi->segs_per_sec;
+ unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);
+ unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno);
unsigned int next;
spin_lock(&free_i->segmap_lock);
@@ -412,7 +424,8 @@ static inline void __set_test_and_inuse(struct f2fs_sb_info *sbi,
unsigned int segno)
{
struct free_segmap_info *free_i = FREE_I(sbi);
- unsigned int secno = segno / sbi->segs_per_sec;
+ unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);
+
spin_lock(&free_i->segmap_lock);
if (!test_and_set_bit(segno, free_i->free_segmap)) {
free_i->free_segments--;
@@ -477,12 +490,12 @@ static inline int overprovision_segments(struct f2fs_sb_info *sbi)
static inline int overprovision_sections(struct f2fs_sb_info *sbi)
{
- return ((unsigned int) overprovision_segments(sbi)) / sbi->segs_per_sec;
+ return GET_SEC_FROM_SEG(sbi, (unsigned int)overprovision_segments(sbi));
}
static inline int reserved_sections(struct f2fs_sb_info *sbi)
{
- return ((unsigned int) reserved_segments(sbi)) / sbi->segs_per_sec;
+ return GET_SEC_FROM_SEG(sbi, (unsigned int)reserved_segments(sbi));
}
static inline bool need_SSR(struct f2fs_sb_info *sbi)
@@ -495,7 +508,7 @@ static inline bool need_SSR(struct f2fs_sb_info *sbi)
return false;
return free_sections(sbi) <= (node_secs + 2 * dent_secs + imeta_secs +
- reserved_sections(sbi) + 1);
+ 2 * reserved_sections(sbi));
}
static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi,
@@ -540,6 +553,7 @@ static inline int utilization(struct f2fs_sb_info *sbi)
*/
#define DEF_MIN_IPU_UTIL 70
#define DEF_MIN_FSYNC_BLOCKS 8
+#define DEF_MIN_HOT_BLOCKS 16
enum {
F2FS_IPU_FORCE,
@@ -547,17 +561,15 @@ enum {
F2FS_IPU_UTIL,
F2FS_IPU_SSR_UTIL,
F2FS_IPU_FSYNC,
+ F2FS_IPU_ASYNC,
};
-static inline bool need_inplace_update(struct inode *inode)
+static inline bool need_inplace_update_policy(struct inode *inode,
+ struct f2fs_io_info *fio)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
unsigned int policy = SM_I(sbi)->ipu_policy;
- /* IPU can be done only for the user data */
- if (S_ISDIR(inode->i_mode) || f2fs_is_atomic_file(inode))
- return false;
-
if (test_opt(sbi, LFS))
return false;
@@ -572,6 +584,15 @@ static inline bool need_inplace_update(struct inode *inode)
utilization(sbi) > SM_I(sbi)->min_ipu_util)
return true;
+ /*
+ * IPU for rewrite async pages
+ */
+ if (policy & (0x1 << F2FS_IPU_ASYNC) &&
+ fio && fio->op == REQ_OP_WRITE &&
+ !(fio->op_flags & REQ_SYNC) &&
+ !f2fs_encrypted_inode(inode))
+ return true;
+
/* this is only set during fdatasync */
if (policy & (0x1 << F2FS_IPU_FSYNC) &&
is_inode_flag_set(inode, FI_NEED_IPU))
@@ -691,8 +712,9 @@ static inline void set_to_next_sit(struct sit_info *sit_i, unsigned int start)
static inline unsigned long long get_mtime(struct f2fs_sb_info *sbi)
{
struct sit_info *sit_i = SIT_I(sbi);
- return sit_i->elapsed_time + CURRENT_TIME_SEC.tv_sec -
- sit_i->mounted_time;
+ time64_t now = ktime_get_real_seconds();
+
+ return sit_i->elapsed_time + now - sit_i->mounted_time;
}
static inline void set_summary(struct f2fs_summary *sum, nid_t nid,
@@ -719,7 +741,7 @@ static inline block_t sum_blk_addr(struct f2fs_sb_info *sbi, int base, int type)
static inline bool no_fggc_candidate(struct f2fs_sb_info *sbi,
unsigned int secno)
{
- if (get_valid_blocks(sbi, secno, sbi->segs_per_sec) >=
+ if (get_valid_blocks(sbi, GET_SEG_FROM_SEC(sbi, secno), true) >=
sbi->fggc_threshold)
return true;
return false;
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 96fe8ed73100..83355ec4a92c 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -49,6 +49,7 @@ char *fault_name[FAULT_MAX] = {
[FAULT_BLOCK] = "no more block",
[FAULT_DIR_DEPTH] = "too big dir depth",
[FAULT_EVICT_INODE] = "evict_inode fail",
+ [FAULT_TRUNCATE] = "truncate fail",
[FAULT_IO] = "IO error",
[FAULT_CHECKPOINT] = "checkpoint error",
};
@@ -82,6 +83,7 @@ enum {
Opt_discard,
Opt_nodiscard,
Opt_noheap,
+ Opt_heap,
Opt_user_xattr,
Opt_nouser_xattr,
Opt_acl,
@@ -116,6 +118,7 @@ static match_table_t f2fs_tokens = {
{Opt_discard, "discard"},
{Opt_nodiscard, "nodiscard"},
{Opt_noheap, "no_heap"},
+ {Opt_heap, "heap"},
{Opt_user_xattr, "user_xattr"},
{Opt_nouser_xattr, "nouser_xattr"},
{Opt_acl, "acl"},
@@ -293,6 +296,7 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections);
F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy);
F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util);
F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks);
+F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_hot_blocks, min_hot_blocks);
F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh);
F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages);
F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, dirty_nats_ratio, dirty_nats_ratio);
@@ -318,6 +322,7 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(ipu_policy),
ATTR_LIST(min_ipu_util),
ATTR_LIST(min_fsync_blocks),
+ ATTR_LIST(min_hot_blocks),
ATTR_LIST(max_victim_search),
ATTR_LIST(dir_level),
ATTR_LIST(ram_thresh),
@@ -436,6 +441,9 @@ static int parse_options(struct super_block *sb, char *options)
case Opt_noheap:
set_opt(sbi, NOHEAP);
break;
+ case Opt_heap:
+ clear_opt(sbi, NOHEAP);
+ break;
#ifdef CONFIG_F2FS_FS_XATTR
case Opt_user_xattr:
set_opt(sbi, XATTR_USER);
@@ -787,7 +795,14 @@ static void f2fs_put_super(struct super_block *sb)
}
/* be sure to wait for any on-going discard commands */
- f2fs_wait_discard_bio(sbi, NULL_ADDR);
+ f2fs_wait_discard_bios(sbi);
+
+ if (!sbi->discard_blks) {
+ struct cp_control cpc = {
+ .reason = CP_UMOUNT | CP_TRIMMED,
+ };
+ write_checkpoint(sbi, &cpc);
+ }
/* write_checkpoint can update stat informaion */
f2fs_destroy_stats(sbi);
@@ -913,7 +928,9 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
if (test_opt(sbi, DISCARD))
seq_puts(seq, ",discard");
if (test_opt(sbi, NOHEAP))
- seq_puts(seq, ",no_heap_alloc");
+ seq_puts(seq, ",no_heap");
+ else
+ seq_puts(seq, ",heap");
#ifdef CONFIG_F2FS_FS_XATTR
if (test_opt(sbi, XATTR_USER))
seq_puts(seq, ",user_xattr");
@@ -986,7 +1003,7 @@ static int segment_info_seq_show(struct seq_file *seq, void *offset)
if ((i % 10) == 0)
seq_printf(seq, "%-10d", i);
seq_printf(seq, "%d|%-3u", se->type,
- get_valid_blocks(sbi, i, 1));
+ get_valid_blocks(sbi, i, false));
if ((i % 10) == 9 || i == (total_segs - 1))
seq_putc(seq, '\n');
else
@@ -1012,7 +1029,7 @@ static int segment_bits_seq_show(struct seq_file *seq, void *offset)
seq_printf(seq, "%-10d", i);
seq_printf(seq, "%d|%-3u|", se->type,
- get_valid_blocks(sbi, i, 1));
+ get_valid_blocks(sbi, i, false));
for (j = 0; j < SIT_VBLOCK_MAP_SIZE; j++)
seq_printf(seq, " %.2x", se->cur_valid_map[j]);
seq_putc(seq, '\n');
@@ -1046,6 +1063,7 @@ static void default_options(struct f2fs_sb_info *sbi)
set_opt(sbi, INLINE_DATA);
set_opt(sbi, INLINE_DENTRY);
set_opt(sbi, EXTENT_CACHE);
+ set_opt(sbi, NOHEAP);
sbi->sb->s_flags |= MS_LAZYTIME;
set_opt(sbi, FLUSH_MERGE);
if (f2fs_sb_mounted_blkzoned(sbi->sb)) {
@@ -1307,7 +1325,7 @@ static int __f2fs_commit_super(struct buffer_head *bh,
unlock_buffer(bh);
/* it's rare case, we can do fua all the time */
- return __sync_dirty_buffer(bh, REQ_PREFLUSH | REQ_FUA);
+ return __sync_dirty_buffer(bh, REQ_SYNC | REQ_PREFLUSH | REQ_FUA);
}
static inline bool sanity_check_area_boundary(struct f2fs_sb_info *sbi,
@@ -1483,6 +1501,13 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
return 1;
}
+ if (le32_to_cpu(raw_super->segment_count) > F2FS_MAX_SEGMENT) {
+ f2fs_msg(sb, KERN_INFO,
+ "Invalid segment count (%u)",
+ le32_to_cpu(raw_super->segment_count));
+ return 1;
+ }
+
/* check CP/SIT/NAT/SSA/MAIN_AREA area boundary */
if (sanity_check_area_boundary(sbi, bh))
return 1;
@@ -1555,6 +1580,8 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
for (i = 0; i < NR_COUNT_TYPE; i++)
atomic_set(&sbi->nr_pages[i], 0);
+ atomic_set(&sbi->wb_sync_req, 0);
+
INIT_LIST_HEAD(&sbi->s_list);
mutex_init(&sbi->umount_mutex);
mutex_init(&sbi->wio_mutex[NODE]);
@@ -1917,6 +1944,7 @@ try_onemore:
mutex_init(&sbi->gc_mutex);
mutex_init(&sbi->cp_mutex);
init_rwsem(&sbi->node_write);
+ init_rwsem(&sbi->node_change);
/* disallow all the data/node/meta page writes */
set_sbi_flag(sbi, SBI_POR_DOING);
@@ -2022,6 +2050,10 @@ try_onemore:
f2fs_join_shrinker(sbi);
+ err = f2fs_build_stats(sbi);
+ if (err)
+ goto free_nm;
+
/* if there are nt orphan nodes free them */
err = recover_orphan_inodes(sbi);
if (err)
@@ -2046,10 +2078,6 @@ try_onemore:
goto free_root_inode;
}
- err = f2fs_build_stats(sbi);
- if (err)
- goto free_root_inode;
-
if (f2fs_proc_root)
sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root);
@@ -2143,7 +2171,6 @@ free_proc:
remove_proc_entry("segment_bits", sbi->s_proc);
remove_proc_entry(sb->s_id, f2fs_proc_root);
}
- f2fs_destroy_stats(sbi);
free_root_inode:
dput(sb->s_root);
sb->s_root = NULL;
@@ -2161,6 +2188,7 @@ free_node_inode:
truncate_inode_pages_final(META_MAPPING(sbi));
iput(sbi->node_inode);
mutex_unlock(&sbi->umount_mutex);
+ f2fs_destroy_stats(sbi);
free_nm:
destroy_node_manager(sbi);
free_sm:
diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c
index 73b4e1d1912a..bccbbf2616d2 100644
--- a/fs/f2fs/trace.c
+++ b/fs/f2fs/trace.c
@@ -59,7 +59,7 @@ void f2fs_trace_pid(struct page *page)
pid_t pid = task_pid_nr(current);
void *p;
- page->private = pid;
+ set_page_private(page, (unsigned long)pid);
if (radix_tree_preload(GFP_NOFS))
return;
@@ -138,7 +138,7 @@ static unsigned int gang_lookup_pids(pid_t *results, unsigned long first_index,
radix_tree_for_each_slot(slot, &pids, &iter, first_index) {
results[ret] = iter.index;
- if (++ret == PIDVEC_SIZE)
+ if (++ret == max_items)
break;
}
return ret;
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 7298a4488f7f..832c5110abab 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -250,15 +250,13 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage,
void *cur_addr, *txattr_addr, *last_addr = NULL;
nid_t xnid = F2FS_I(inode)->i_xattr_nid;
unsigned int size = xnid ? VALID_XATTR_BLOCK_SIZE : 0;
- unsigned int inline_size = 0;
+ unsigned int inline_size = inline_xattr_size(inode);
int err = 0;
- inline_size = inline_xattr_size(inode);
-
if (!size && !inline_size)
return -ENODATA;
- txattr_addr = kzalloc(inline_size + size + sizeof(__u32),
+ txattr_addr = kzalloc(inline_size + size + XATTR_PADDING_SIZE,
GFP_F2FS_ZERO);
if (!txattr_addr)
return -ENOMEM;
@@ -328,13 +326,14 @@ static int read_all_xattrs(struct inode *inode, struct page *ipage,
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct f2fs_xattr_header *header;
- size_t size = PAGE_SIZE, inline_size = 0;
+ nid_t xnid = F2FS_I(inode)->i_xattr_nid;
+ unsigned int size = VALID_XATTR_BLOCK_SIZE;
+ unsigned int inline_size = inline_xattr_size(inode);
void *txattr_addr;
int err;
- inline_size = inline_xattr_size(inode);
-
- txattr_addr = kzalloc(inline_size + size, GFP_F2FS_ZERO);
+ txattr_addr = kzalloc(inline_size + size + XATTR_PADDING_SIZE,
+ GFP_F2FS_ZERO);
if (!txattr_addr)
return -ENOMEM;
@@ -358,19 +357,19 @@ static int read_all_xattrs(struct inode *inode, struct page *ipage,
}
/* read from xattr node block */
- if (F2FS_I(inode)->i_xattr_nid) {
+ if (xnid) {
struct page *xpage;
void *xattr_addr;
/* The inode already has an extended attribute block. */
- xpage = get_node_page(sbi, F2FS_I(inode)->i_xattr_nid);
+ xpage = get_node_page(sbi, xnid);
if (IS_ERR(xpage)) {
err = PTR_ERR(xpage);
goto fail;
}
xattr_addr = page_address(xpage);
- memcpy(txattr_addr + inline_size, xattr_addr, PAGE_SIZE);
+ memcpy(txattr_addr + inline_size, xattr_addr, size);
f2fs_put_page(xpage, 1);
}
@@ -392,14 +391,12 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
void *txattr_addr, struct page *ipage)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- size_t inline_size = 0;
+ size_t inline_size = inline_xattr_size(inode);
void *xattr_addr;
struct page *xpage;
nid_t new_nid = 0;
int err;
- inline_size = inline_xattr_size(inode);
-
if (hsize > inline_size && !F2FS_I(inode)->i_xattr_nid)
if (!alloc_nid(sbi, &new_nid))
return -ENOSPC;
@@ -454,7 +451,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
}
xattr_addr = page_address(xpage);
- memcpy(xattr_addr, txattr_addr + inline_size, MAX_XATTR_BLOCK_SIZE);
+ memcpy(xattr_addr, txattr_addr + inline_size, VALID_XATTR_BLOCK_SIZE);
set_page_dirty(xpage);
f2fs_put_page(xpage, 1);
@@ -546,7 +543,9 @@ static bool f2fs_xattr_value_same(struct f2fs_xattr_entry *entry,
const void *value, size_t size)
{
void *pval = entry->e_name + entry->e_name_len;
- return (entry->e_value_size == size) && !memcmp(pval, value, size);
+
+ return (le16_to_cpu(entry->e_value_size) == size) &&
+ !memcmp(pval, value, size);
}
static int __f2fs_setxattr(struct inode *inode, int index,
diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h
index d5a94928c116..dbcd1d16e669 100644
--- a/fs/f2fs/xattr.h
+++ b/fs/f2fs/xattr.h
@@ -58,10 +58,10 @@ struct f2fs_xattr_entry {
#define XATTR_FIRST_ENTRY(ptr) (XATTR_ENTRY(XATTR_HDR(ptr) + 1))
#define XATTR_ROUND (3)
-#define XATTR_ALIGN(size) ((size + XATTR_ROUND) & ~XATTR_ROUND)
+#define XATTR_ALIGN(size) (((size) + XATTR_ROUND) & ~XATTR_ROUND)
#define ENTRY_SIZE(entry) (XATTR_ALIGN(sizeof(struct f2fs_xattr_entry) + \
- entry->e_name_len + le16_to_cpu(entry->e_value_size)))
+ (entry)->e_name_len + le16_to_cpu((entry)->e_value_size)))
#define XATTR_NEXT_ENTRY(entry) ((struct f2fs_xattr_entry *)((char *)(entry) +\
ENTRY_SIZE(entry)))
@@ -72,8 +72,8 @@ struct f2fs_xattr_entry {
for (entry = XATTR_FIRST_ENTRY(addr);\
!IS_XATTR_LAST_ENTRY(entry);\
entry = XATTR_NEXT_ENTRY(entry))
-#define MAX_XATTR_BLOCK_SIZE (PAGE_SIZE - sizeof(struct node_footer))
-#define VALID_XATTR_BLOCK_SIZE (MAX_XATTR_BLOCK_SIZE - sizeof(__u32))
+#define VALID_XATTR_BLOCK_SIZE (PAGE_SIZE - sizeof(struct node_footer))
+#define XATTR_PADDING_SIZE (sizeof(__u32))
#define MIN_OFFSET(i) XATTR_ALIGN(inline_xattr_size(i) + \
VALID_XATTR_BLOCK_SIZE)
diff --git a/fs/fcntl.c b/fs/fcntl.c
index be8fbe289087..f4e7267d117f 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -23,6 +23,7 @@
#include <linux/pid_namespace.h>
#include <linux/user_namespace.h>
#include <linux/shmem_fs.h>
+#include <linux/compat.h>
#include <asm/poll.h>
#include <asm/siginfo.h>
@@ -420,6 +421,162 @@ out:
}
#endif
+#ifdef CONFIG_COMPAT
+static int get_compat_flock(struct flock *kfl, struct compat_flock __user *ufl)
+{
+ if (!access_ok(VERIFY_READ, ufl, sizeof(*ufl)) ||
+ __get_user(kfl->l_type, &ufl->l_type) ||
+ __get_user(kfl->l_whence, &ufl->l_whence) ||
+ __get_user(kfl->l_start, &ufl->l_start) ||
+ __get_user(kfl->l_len, &ufl->l_len) ||
+ __get_user(kfl->l_pid, &ufl->l_pid))
+ return -EFAULT;
+ return 0;
+}
+
+static int put_compat_flock(struct flock *kfl, struct compat_flock __user *ufl)
+{
+ if (!access_ok(VERIFY_WRITE, ufl, sizeof(*ufl)) ||
+ __put_user(kfl->l_type, &ufl->l_type) ||
+ __put_user(kfl->l_whence, &ufl->l_whence) ||
+ __put_user(kfl->l_start, &ufl->l_start) ||
+ __put_user(kfl->l_len, &ufl->l_len) ||
+ __put_user(kfl->l_pid, &ufl->l_pid))
+ return -EFAULT;
+ return 0;
+}
+
+#ifndef HAVE_ARCH_GET_COMPAT_FLOCK64
+static int get_compat_flock64(struct flock *kfl, struct compat_flock64 __user *ufl)
+{
+ if (!access_ok(VERIFY_READ, ufl, sizeof(*ufl)) ||
+ __get_user(kfl->l_type, &ufl->l_type) ||
+ __get_user(kfl->l_whence, &ufl->l_whence) ||
+ __get_user(kfl->l_start, &ufl->l_start) ||
+ __get_user(kfl->l_len, &ufl->l_len) ||
+ __get_user(kfl->l_pid, &ufl->l_pid))
+ return -EFAULT;
+ return 0;
+}
+#endif
+
+#ifndef HAVE_ARCH_PUT_COMPAT_FLOCK64
+static int put_compat_flock64(struct flock *kfl, struct compat_flock64 __user *ufl)
+{
+ if (!access_ok(VERIFY_WRITE, ufl, sizeof(*ufl)) ||
+ __put_user(kfl->l_type, &ufl->l_type) ||
+ __put_user(kfl->l_whence, &ufl->l_whence) ||
+ __put_user(kfl->l_start, &ufl->l_start) ||
+ __put_user(kfl->l_len, &ufl->l_len) ||
+ __put_user(kfl->l_pid, &ufl->l_pid))
+ return -EFAULT;
+ return 0;
+}
+#endif
+
+static unsigned int
+convert_fcntl_cmd(unsigned int cmd)
+{
+ switch (cmd) {
+ case F_GETLK64:
+ return F_GETLK;
+ case F_SETLK64:
+ return F_SETLK;
+ case F_SETLKW64:
+ return F_SETLKW;
+ }
+
+ return cmd;
+}
+
+COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
+ compat_ulong_t, arg)
+{
+ mm_segment_t old_fs;
+ struct flock f;
+ long ret;
+ unsigned int conv_cmd;
+
+ switch (cmd) {
+ case F_GETLK:
+ case F_SETLK:
+ case F_SETLKW:
+ ret = get_compat_flock(&f, compat_ptr(arg));
+ if (ret != 0)
+ break;
+ old_fs = get_fs();
+ set_fs(KERNEL_DS);
+ ret = sys_fcntl(fd, cmd, (unsigned long)&f);
+ set_fs(old_fs);
+ if (cmd == F_GETLK && ret == 0) {
+ /* GETLK was successful and we need to return the data...
+ * but it needs to fit in the compat structure.
+ * l_start shouldn't be too big, unless the original
+ * start + end is greater than COMPAT_OFF_T_MAX, in which
+ * case the app was asking for trouble, so we return
+ * -EOVERFLOW in that case.
+ * l_len could be too big, in which case we just truncate it,
+ * and only allow the app to see that part of the conflicting
+ * lock that might make sense to it anyway
+ */
+
+ if (f.l_start > COMPAT_OFF_T_MAX)
+ ret = -EOVERFLOW;
+ if (f.l_len > COMPAT_OFF_T_MAX)
+ f.l_len = COMPAT_OFF_T_MAX;
+ if (ret == 0)
+ ret = put_compat_flock(&f, compat_ptr(arg));
+ }
+ break;
+
+ case F_GETLK64:
+ case F_SETLK64:
+ case F_SETLKW64:
+ case F_OFD_GETLK:
+ case F_OFD_SETLK:
+ case F_OFD_SETLKW:
+ ret = get_compat_flock64(&f, compat_ptr(arg));
+ if (ret != 0)
+ break;
+ old_fs = get_fs();
+ set_fs(KERNEL_DS);
+ conv_cmd = convert_fcntl_cmd(cmd);
+ ret = sys_fcntl(fd, conv_cmd, (unsigned long)&f);
+ set_fs(old_fs);
+ if ((conv_cmd == F_GETLK || conv_cmd == F_OFD_GETLK) && ret == 0) {
+ /* need to return lock information - see above for commentary */
+ if (f.l_start > COMPAT_LOFF_T_MAX)
+ ret = -EOVERFLOW;
+ if (f.l_len > COMPAT_LOFF_T_MAX)
+ f.l_len = COMPAT_LOFF_T_MAX;
+ if (ret == 0)
+ ret = put_compat_flock64(&f, compat_ptr(arg));
+ }
+ break;
+
+ default:
+ ret = sys_fcntl(fd, cmd, arg);
+ break;
+ }
+ return ret;
+}
+
+COMPAT_SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd,
+ compat_ulong_t, arg)
+{
+ switch (cmd) {
+ case F_GETLK64:
+ case F_SETLK64:
+ case F_SETLKW64:
+ case F_OFD_GETLK:
+ case F_OFD_SETLK:
+ case F_OFD_SETLKW:
+ return -EINVAL;
+ }
+ return compat_sys_fcntl64(fd, cmd, arg);
+}
+#endif
+
/* Table to convert sigio signal codes into poll band bitmaps */
static const long band_table[NSIGPOLL] = {
@@ -742,16 +899,10 @@ static int __init fcntl_init(void)
* Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
* is defined as O_NONBLOCK on some platforms and not on others.
*/
- BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32(
- O_RDONLY | O_WRONLY | O_RDWR |
- O_CREAT | O_EXCL | O_NOCTTY |
- O_TRUNC | O_APPEND | /* O_NONBLOCK | */
- __O_SYNC | O_DSYNC | FASYNC |
- O_DIRECT | O_LARGEFILE | O_DIRECTORY |
- O_NOFOLLOW | O_NOATIME | O_CLOEXEC |
- __FMODE_EXEC | O_PATH | __O_TMPFILE |
- __FMODE_NONOTIFY
- ));
+ BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ !=
+ HWEIGHT32(
+ (VALID_OPEN_FLAGS & ~(O_NONBLOCK | O_NDELAY)) |
+ __FMODE_EXEC | __FMODE_NONOTIFY));
fasync_cache = kmem_cache_create("fasync_cache",
sizeof(struct fasync_struct), 0, SLAB_PANIC, NULL);
diff --git a/fs/fhandle.c b/fs/fhandle.c
index 5559168d5637..58a61f55e0d0 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -9,6 +9,7 @@
#include <linux/fsnotify.h>
#include <linux/personality.h>
#include <linux/uaccess.h>
+#include <linux/compat.h>
#include "internal.h"
#include "mount.h"
@@ -264,3 +265,15 @@ SYSCALL_DEFINE3(open_by_handle_at, int, mountdirfd,
ret = do_handle_open(mountdirfd, handle, flags);
return ret;
}
+
+#ifdef CONFIG_COMPAT
+/*
+ * Exactly like fs/open.c:sys_open_by_handle_at(), except that it
+ * doesn't set the O_LARGEFILE flag.
+ */
+COMPAT_SYSCALL_DEFINE3(open_by_handle_at, int, mountdirfd,
+ struct file_handle __user *, handle, int, flags)
+{
+ return do_handle_open(mountdirfd, handle, flags);
+}
+#endif
diff --git a/fs/file.c b/fs/file.c
index ad6f094f2eff..1c2972e3a405 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -42,7 +42,7 @@ static void *alloc_fdmem(size_t size)
if (data != NULL)
return data;
}
- return __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_HIGHMEM, PAGE_KERNEL);
+ return __vmalloc(size, GFP_KERNEL_ACCOUNT, PAGE_KERNEL);
}
static void __free_fdtable(struct fdtable *fdt)
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 6e22748b0704..b9ea99c5b5b3 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -292,7 +292,7 @@ void fuse_ctl_remove_conn(struct fuse_conn *fc)
static int fuse_ctl_fill_super(struct super_block *sb, void *data, int silent)
{
- struct tree_descr empty_descr = {""};
+ static const struct tree_descr empty_descr = {""};
struct fuse_conn *fc;
int err;
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index b681b43c766e..c16d00e53264 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -20,6 +20,7 @@
#include <linux/pipe_fs_i.h>
#include <linux/swap.h>
#include <linux/splice.h>
+#include <linux/sched.h>
MODULE_ALIAS_MISCDEV(FUSE_MINOR);
MODULE_ALIAS("devname:fuse");
@@ -45,7 +46,7 @@ static void fuse_request_init(struct fuse_req *req, struct page **pages,
INIT_LIST_HEAD(&req->list);
INIT_LIST_HEAD(&req->intr_entry);
init_waitqueue_head(&req->waitq);
- atomic_set(&req->count, 1);
+ refcount_set(&req->count, 1);
req->pages = pages;
req->page_descs = page_descs;
req->max_pages = npages;
@@ -102,21 +103,20 @@ void fuse_request_free(struct fuse_req *req)
void __fuse_get_request(struct fuse_req *req)
{
- atomic_inc(&req->count);
+ refcount_inc(&req->count);
}
/* Must be called with > 1 refcount */
static void __fuse_put_request(struct fuse_req *req)
{
- BUG_ON(atomic_read(&req->count) < 2);
- atomic_dec(&req->count);
+ refcount_dec(&req->count);
}
-static void fuse_req_init_context(struct fuse_req *req)
+static void fuse_req_init_context(struct fuse_conn *fc, struct fuse_req *req)
{
req->in.h.uid = from_kuid_munged(&init_user_ns, current_fsuid());
req->in.h.gid = from_kgid_munged(&init_user_ns, current_fsgid());
- req->in.h.pid = current->pid;
+ req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns);
}
void fuse_set_initialized(struct fuse_conn *fc)
@@ -163,7 +163,7 @@ static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages,
goto out;
}
- fuse_req_init_context(req);
+ fuse_req_init_context(fc, req);
__set_bit(FR_WAITING, &req->flags);
if (for_background)
__set_bit(FR_BACKGROUND, &req->flags);
@@ -256,7 +256,7 @@ struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc,
if (!req)
req = get_reserved_req(fc, file);
- fuse_req_init_context(req);
+ fuse_req_init_context(fc, req);
__set_bit(FR_WAITING, &req->flags);
__clear_bit(FR_BACKGROUND, &req->flags);
return req;
@@ -264,7 +264,7 @@ struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc,
void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
{
- if (atomic_dec_and_test(&req->count)) {
+ if (refcount_dec_and_test(&req->count)) {
if (test_bit(FR_BACKGROUND, &req->flags)) {
/*
* We get here in the unlikely case that a background
@@ -382,9 +382,9 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req)
wake_up(&fc->blocked_waitq);
if (fc->num_background == fc->congestion_threshold &&
- fc->connected && fc->bdi_initialized) {
- clear_bdi_congested(&fc->bdi, BLK_RW_SYNC);
- clear_bdi_congested(&fc->bdi, BLK_RW_ASYNC);
+ fc->connected && fc->sb) {
+ clear_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC);
+ clear_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC);
}
fc->num_background--;
fc->active_background--;
@@ -573,10 +573,9 @@ void fuse_request_send_background_locked(struct fuse_conn *fc,
fc->num_background++;
if (fc->num_background == fc->max_background)
fc->blocked = 1;
- if (fc->num_background == fc->congestion_threshold &&
- fc->bdi_initialized) {
- set_bdi_congested(&fc->bdi, BLK_RW_SYNC);
- set_bdi_congested(&fc->bdi, BLK_RW_ASYNC);
+ if (fc->num_background == fc->congestion_threshold && fc->sb) {
+ set_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC);
+ set_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC);
}
list_add_tail(&req->list, &fc->bg_queue);
flush_bg_queue(fc);
@@ -1223,6 +1222,9 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file,
struct fuse_in *in;
unsigned reqsize;
+ if (task_active_pid_ns(current) != fc->pid_ns)
+ return -EIO;
+
restart:
spin_lock(&fiq->waitq.lock);
err = -EAGAIN;
@@ -1821,6 +1823,9 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
struct fuse_req *req;
struct fuse_out_header oh;
+ if (task_active_pid_ns(current) != fc->pid_ns)
+ return -EIO;
+
if (nbytes < sizeof(struct fuse_out_header))
return -EINVAL;
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 995da8957f6f..3ee4fdc3da9e 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -58,7 +58,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
}
INIT_LIST_HEAD(&ff->write_entry);
- atomic_set(&ff->count, 1);
+ refcount_set(&ff->count, 1);
RB_CLEAR_NODE(&ff->polled_node);
init_waitqueue_head(&ff->poll_wait);
@@ -77,7 +77,7 @@ void fuse_file_free(struct fuse_file *ff)
static struct fuse_file *fuse_file_get(struct fuse_file *ff)
{
- atomic_inc(&ff->count);
+ refcount_inc(&ff->count);
return ff;
}
@@ -88,7 +88,7 @@ static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req)
static void fuse_file_put(struct fuse_file *ff, bool sync)
{
- if (atomic_dec_and_test(&ff->count)) {
+ if (refcount_dec_and_test(&ff->count)) {
struct fuse_req *req = ff->reserved_req;
if (ff->fc->no_open) {
@@ -293,7 +293,7 @@ static int fuse_release(struct inode *inode, struct file *file)
void fuse_sync_release(struct fuse_file *ff, int flags)
{
- WARN_ON(atomic_read(&ff->count) != 1);
+ WARN_ON(refcount_read(&ff->count) > 1);
fuse_prepare_release(ff, flags, FUSE_RELEASE);
/*
* iput(NULL) is a no-op and since the refcount is 1 and everything's
@@ -2083,7 +2083,8 @@ static int fuse_direct_mmap(struct file *file, struct vm_area_struct *vma)
return generic_file_mmap(file, vma);
}
-static int convert_fuse_file_lock(const struct fuse_file_lock *ffl,
+static int convert_fuse_file_lock(struct fuse_conn *fc,
+ const struct fuse_file_lock *ffl,
struct file_lock *fl)
{
switch (ffl->type) {
@@ -2098,7 +2099,14 @@ static int convert_fuse_file_lock(const struct fuse_file_lock *ffl,
fl->fl_start = ffl->start;
fl->fl_end = ffl->end;
- fl->fl_pid = ffl->pid;
+
+ /*
+ * Convert pid into the caller's pid namespace. If the pid
+ * does not map into the namespace fl_pid will get set to 0.
+ */
+ rcu_read_lock();
+ fl->fl_pid = pid_vnr(find_pid_ns(ffl->pid, fc->pid_ns));
+ rcu_read_unlock();
break;
default:
@@ -2147,7 +2155,7 @@ static int fuse_getlk(struct file *file, struct file_lock *fl)
args.out.args[0].value = &outarg;
err = fuse_simple_request(fc, &args);
if (!err)
- err = convert_fuse_file_lock(&outarg.lk, fl);
+ err = convert_fuse_file_lock(fc, &outarg.lk, fl);
return err;
}
@@ -2159,7 +2167,8 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
FUSE_ARGS(args);
struct fuse_lk_in inarg;
int opcode = (fl->fl_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK;
- pid_t pid = fl->fl_type != F_UNLCK ? current->tgid : 0;
+ struct pid *pid = fl->fl_type != F_UNLCK ? task_tgid(current) : NULL;
+ pid_t pid_nr = pid_nr_ns(pid, fc->pid_ns);
int err;
if (fl->fl_lmops && fl->fl_lmops->lm_grant) {
@@ -2171,7 +2180,10 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
if ((fl->fl_flags & FL_CLOSE_POSIX) == FL_CLOSE_POSIX)
return 0;
- fuse_lk_fill(&args, file, fl, opcode, pid, flock, &inarg);
+ if (pid && pid_nr == 0)
+ return -EOVERFLOW;
+
+ fuse_lk_fill(&args, file, fl, opcode, pid_nr, flock, &inarg);
err = fuse_simple_request(fc, &args);
/* locking is restartable */
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 32ac2c9b09c0..1bd7ffdad593 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -24,6 +24,8 @@
#include <linux/workqueue.h>
#include <linux/kref.h>
#include <linux/xattr.h>
+#include <linux/pid_namespace.h>
+#include <linux/refcount.h>
/** Max number of pages that can be used in a single read request */
#define FUSE_MAX_PAGES_PER_REQ 32
@@ -137,7 +139,7 @@ struct fuse_file {
u64 nodeid;
/** Refcount */
- atomic_t count;
+ refcount_t count;
/** FOPEN_* flags returned by open */
u32 open_flags;
@@ -306,7 +308,7 @@ struct fuse_req {
struct list_head intr_entry;
/** refcount */
- atomic_t count;
+ refcount_t count;
/** Unique ID for the interrupt request */
u64 intr_unique;
@@ -448,7 +450,7 @@ struct fuse_conn {
spinlock_t lock;
/** Refcount */
- atomic_t count;
+ refcount_t count;
/** Number of fuse_dev's */
atomic_t dev_count;
@@ -461,6 +463,9 @@ struct fuse_conn {
/** The group id for this mount */
kgid_t group_id;
+ /** The pid namespace for this mount */
+ struct pid_namespace *pid_ns;
+
/** Maximum read size */
unsigned max_read;
@@ -527,9 +532,6 @@ struct fuse_conn {
/** Filesystem supports NFS exporting. Only set in INIT */
unsigned export_support:1;
- /** Set if bdi is valid */
- unsigned bdi_initialized:1;
-
/** write-back cache policy (default is write-through) */
unsigned writeback_cache:1;
@@ -631,9 +633,6 @@ struct fuse_conn {
/** Negotiated minor version */
unsigned minor;
- /** Backing dev info */
- struct backing_dev_info bdi;
-
/** Entry on the fuse_conn_list */
struct list_head entry;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 6fe6a88ecb4a..5a1b58f8fef4 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -21,6 +21,7 @@
#include <linux/sched.h>
#include <linux/exportfs.h>
#include <linux/posix_acl.h>
+#include <linux/pid_namespace.h>
MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
MODULE_DESCRIPTION("Filesystem in Userspace");
@@ -386,12 +387,6 @@ static void fuse_send_destroy(struct fuse_conn *fc)
}
}
-static void fuse_bdi_destroy(struct fuse_conn *fc)
-{
- if (fc->bdi_initialized)
- bdi_destroy(&fc->bdi);
-}
-
static void fuse_put_super(struct super_block *sb)
{
struct fuse_conn *fc = get_fuse_conn_super(sb);
@@ -403,7 +398,6 @@ static void fuse_put_super(struct super_block *sb)
list_del(&fc->entry);
fuse_ctl_remove_conn(fc);
mutex_unlock(&fuse_mutex);
- fuse_bdi_destroy(fc);
fuse_conn_put(fc);
}
@@ -608,7 +602,7 @@ void fuse_conn_init(struct fuse_conn *fc)
memset(fc, 0, sizeof(*fc));
spin_lock_init(&fc->lock);
init_rwsem(&fc->killsb);
- atomic_set(&fc->count, 1);
+ refcount_set(&fc->count, 1);
atomic_set(&fc->dev_count, 1);
init_waitqueue_head(&fc->blocked_waitq);
init_waitqueue_head(&fc->reserved_req_waitq);
@@ -626,14 +620,16 @@ void fuse_conn_init(struct fuse_conn *fc)
fc->connected = 1;
fc->attr_version = 1;
get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));
+ fc->pid_ns = get_pid_ns(task_active_pid_ns(current));
}
EXPORT_SYMBOL_GPL(fuse_conn_init);
void fuse_conn_put(struct fuse_conn *fc)
{
- if (atomic_dec_and_test(&fc->count)) {
+ if (refcount_dec_and_test(&fc->count)) {
if (fc->destroy_req)
fuse_request_free(fc->destroy_req);
+ put_pid_ns(fc->pid_ns);
fc->release(fc);
}
}
@@ -641,7 +637,7 @@ EXPORT_SYMBOL_GPL(fuse_conn_put);
struct fuse_conn *fuse_conn_get(struct fuse_conn *fc)
{
- atomic_inc(&fc->count);
+ refcount_inc(&fc->count);
return fc;
}
EXPORT_SYMBOL_GPL(fuse_conn_get);
@@ -928,7 +924,8 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
fc->no_flock = 1;
}
- fc->bdi.ra_pages = min(fc->bdi.ra_pages, ra_pages);
+ fc->sb->s_bdi->ra_pages =
+ min(fc->sb->s_bdi->ra_pages, ra_pages);
fc->minor = arg->minor;
fc->max_write = arg->minor < 5 ? 4096 : arg->max_write;
fc->max_write = max_t(unsigned, 4096, fc->max_write);
@@ -944,7 +941,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
arg->major = FUSE_KERNEL_VERSION;
arg->minor = FUSE_KERNEL_MINOR_VERSION;
- arg->max_readahead = fc->bdi.ra_pages * PAGE_SIZE;
+ arg->max_readahead = fc->sb->s_bdi->ra_pages * PAGE_SIZE;
arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC |
FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK |
FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ |
@@ -976,27 +973,18 @@ static void fuse_free_conn(struct fuse_conn *fc)
static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
{
int err;
+ char *suffix = "";
- fc->bdi.name = "fuse";
- fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_SIZE;
- /* fuse does it's own writeback accounting */
- fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB | BDI_CAP_STRICTLIMIT;
-
- err = bdi_init(&fc->bdi);
+ if (sb->s_bdev)
+ suffix = "-fuseblk";
+ err = super_setup_bdi_name(sb, "%u:%u%s", MAJOR(fc->dev),
+ MINOR(fc->dev), suffix);
if (err)
return err;
- fc->bdi_initialized = 1;
-
- if (sb->s_bdev) {
- err = bdi_register(&fc->bdi, NULL, "%u:%u-fuseblk",
- MAJOR(fc->dev), MINOR(fc->dev));
- } else {
- err = bdi_register_dev(&fc->bdi, fc->dev);
- }
-
- if (err)
- return err;
+ sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_SIZE;
+ /* fuse does it's own writeback accounting */
+ sb->s_bdi->capabilities = BDI_CAP_NO_ACCT_WB | BDI_CAP_STRICTLIMIT;
/*
* For a single fuse filesystem use max 1% of dirty +
@@ -1010,7 +998,7 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
*
* /sys/class/bdi/<bdi>/max_ratio
*/
- bdi_set_max_ratio(&fc->bdi, 1);
+ bdi_set_max_ratio(sb->s_bdi, 1);
return 0;
}
@@ -1113,8 +1101,6 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
if (err)
goto err_dev_free;
- sb->s_bdi = &fc->bdi;
-
/* Handle umasking inside the fuse code */
if (sb->s_flags & MS_POSIXACL)
fc->dont_mask = 1;
@@ -1182,7 +1168,6 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
err_dev_free:
fuse_dev_free(fud);
err_put_conn:
- fuse_bdi_destroy(fc);
fuse_conn_put(fc);
err_fput:
fput(file);
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 01b97c012c6e..4d810be532dd 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -38,11 +38,6 @@ struct metapath {
__u16 mp_list[GFS2_MAX_META_HEIGHT];
};
-struct strip_mine {
- int sm_first;
- unsigned int sm_height;
-};
-
/**
* gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page
* @ip: the inode
@@ -253,6 +248,19 @@ static inline unsigned int metapath_branch_start(const struct metapath *mp)
}
/**
+ * metaptr1 - Return the first possible metadata pointer in a metaath buffer
+ * @height: The metadata height (0 = dinode)
+ * @mp: The metapath
+ */
+static inline __be64 *metaptr1(unsigned int height, const struct metapath *mp)
+{
+ struct buffer_head *bh = mp->mp_bh[height];
+ if (height == 0)
+ return ((__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)));
+ return ((__be64 *)(bh->b_data + sizeof(struct gfs2_meta_header)));
+}
+
+/**
* metapointer - Return pointer to start of metadata in a buffer
* @height: The metadata height (0 = dinode)
* @mp: The metapath
@@ -264,10 +272,8 @@ static inline unsigned int metapath_branch_start(const struct metapath *mp)
static inline __be64 *metapointer(unsigned int height, const struct metapath *mp)
{
- struct buffer_head *bh = mp->mp_bh[height];
- unsigned int head_size = (height > 0) ?
- sizeof(struct gfs2_meta_header) : sizeof(struct gfs2_dinode);
- return ((__be64 *)(bh->b_data + head_size)) + mp->mp_list[height];
+ __be64 *p = metaptr1(height, mp);
+ return p + mp->mp_list[height];
}
static void gfs2_metapath_ra(struct gfs2_glock *gl,
@@ -296,6 +302,23 @@ static void gfs2_metapath_ra(struct gfs2_glock *gl,
}
/**
+ * lookup_mp_height - helper function for lookup_metapath
+ * @ip: the inode
+ * @mp: the metapath
+ * @h: the height which needs looking up
+ */
+static int lookup_mp_height(struct gfs2_inode *ip, struct metapath *mp, int h)
+{
+ __be64 *ptr = metapointer(h, mp);
+ u64 dblock = be64_to_cpu(*ptr);
+
+ if (!dblock)
+ return h + 1;
+
+ return gfs2_meta_indirect_buffer(ip, h + 1, dblock, &mp->mp_bh[h + 1]);
+}
+
+/**
* lookup_metapath - Walk the metadata tree to a specific point
* @ip: The inode
* @mp: The metapath
@@ -316,17 +339,10 @@ static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp)
{
unsigned int end_of_metadata = ip->i_height - 1;
unsigned int x;
- __be64 *ptr;
- u64 dblock;
int ret;
for (x = 0; x < end_of_metadata; x++) {
- ptr = metapointer(x, mp);
- dblock = be64_to_cpu(*ptr);
- if (!dblock)
- return x + 1;
-
- ret = gfs2_meta_indirect_buffer(ip, x+1, dblock, &mp->mp_bh[x+1]);
+ ret = lookup_mp_height(ip, mp, x);
if (ret)
return ret;
}
@@ -334,6 +350,35 @@ static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp)
return ip->i_height;
}
+/**
+ * fillup_metapath - fill up buffers for the metadata path to a specific height
+ * @ip: The inode
+ * @mp: The metapath
+ * @h: The height to which it should be mapped
+ *
+ * Similar to lookup_metapath, but does lookups for a range of heights
+ *
+ * Returns: error or height of metadata tree
+ */
+
+static int fillup_metapath(struct gfs2_inode *ip, struct metapath *mp, int h)
+{
+ unsigned int start_h = h - 1;
+ int ret;
+
+ if (h) {
+ /* find the first buffer we need to look up. */
+ while (start_h > 0 && mp->mp_bh[start_h] == NULL)
+ start_h--;
+ for (; start_h < h; start_h++) {
+ ret = lookup_mp_height(ip, mp, start_h);
+ if (ret)
+ return ret;
+ }
+ }
+ return ip->i_height;
+}
+
static inline void release_metapath(struct metapath *mp)
{
int i;
@@ -422,6 +467,13 @@ enum alloc_state {
/* ALLOC_UNSTUFF = 3, TBD and rather complicated */
};
+static inline unsigned int hptrs(struct gfs2_sbd *sdp, const unsigned int hgt)
+{
+ if (hgt)
+ return sdp->sd_inptrs;
+ return sdp->sd_diptrs;
+}
+
/**
* gfs2_bmap_alloc - Build a metadata tree of the requested height
* @inode: The GFS2 inode
@@ -620,7 +672,7 @@ int gfs2_block_map(struct inode *inode, sector_t lblock,
BUG_ON(maxlen == 0);
- memset(mp.mp_bh, 0, sizeof(mp.mp_bh));
+ memset(&mp, 0, sizeof(mp));
bmap_lock(ip, create);
clear_buffer_mapped(bh_map);
clear_buffer_new(bh_map);
@@ -702,252 +754,6 @@ int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsi
}
/**
- * do_strip - Look for a layer a particular layer of the file and strip it off
- * @ip: the inode
- * @dibh: the dinode buffer
- * @bh: A buffer of pointers
- * @top: The first pointer in the buffer
- * @bottom: One more than the last pointer
- * @height: the height this buffer is at
- * @sm: a pointer to a struct strip_mine
- *
- * Returns: errno
- */
-
-static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
- struct buffer_head *bh, __be64 *top, __be64 *bottom,
- unsigned int height, struct strip_mine *sm)
-{
- struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- struct gfs2_rgrp_list rlist;
- struct gfs2_trans *tr;
- u64 bn, bstart;
- u32 blen, btotal;
- __be64 *p;
- unsigned int rg_blocks = 0;
- int metadata;
- unsigned int revokes = 0;
- int x;
- int error;
- int jblocks_rqsted;
-
- error = gfs2_rindex_update(sdp);
- if (error)
- return error;
-
- if (!*top)
- sm->sm_first = 0;
-
- if (height != sm->sm_height)
- return 0;
-
- if (sm->sm_first) {
- top++;
- sm->sm_first = 0;
- }
-
- metadata = (height != ip->i_height - 1);
- if (metadata)
- revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs;
- else if (ip->i_depth)
- revokes = sdp->sd_inptrs;
-
- memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
- bstart = 0;
- blen = 0;
-
- for (p = top; p < bottom; p++) {
- if (!*p)
- continue;
-
- bn = be64_to_cpu(*p);
-
- if (bstart + blen == bn)
- blen++;
- else {
- if (bstart)
- gfs2_rlist_add(ip, &rlist, bstart);
-
- bstart = bn;
- blen = 1;
- }
- }
-
- if (bstart)
- gfs2_rlist_add(ip, &rlist, bstart);
- else
- goto out; /* Nothing to do */
-
- gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE);
-
- for (x = 0; x < rlist.rl_rgrps; x++) {
- struct gfs2_rgrpd *rgd;
- rgd = rlist.rl_ghs[x].gh_gl->gl_object;
- rg_blocks += rgd->rd_length;
- }
-
- error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
- if (error)
- goto out_rlist;
-
- if (gfs2_rs_active(&ip->i_res)) /* needs to be done with the rgrp glock held */
- gfs2_rs_deltree(&ip->i_res);
-
-restart:
- jblocks_rqsted = rg_blocks + RES_DINODE +
- RES_INDIRECT + RES_STATFS + RES_QUOTA +
- gfs2_struct2blk(sdp, revokes, sizeof(u64));
- if (jblocks_rqsted > atomic_read(&sdp->sd_log_thresh2))
- jblocks_rqsted = atomic_read(&sdp->sd_log_thresh2);
- error = gfs2_trans_begin(sdp, jblocks_rqsted, revokes);
- if (error)
- goto out_rg_gunlock;
-
- tr = current->journal_info;
- down_write(&ip->i_rw_mutex);
-
- gfs2_trans_add_meta(ip->i_gl, dibh);
- gfs2_trans_add_meta(ip->i_gl, bh);
-
- bstart = 0;
- blen = 0;
- btotal = 0;
-
- for (p = top; p < bottom; p++) {
- if (!*p)
- continue;
-
- /* check for max reasonable journal transaction blocks */
- if (tr->tr_num_buf_new + RES_STATFS +
- RES_QUOTA >= atomic_read(&sdp->sd_log_thresh2)) {
- if (rg_blocks >= tr->tr_num_buf_new)
- rg_blocks -= tr->tr_num_buf_new;
- else
- rg_blocks = 0;
- break;
- }
-
- bn = be64_to_cpu(*p);
-
- if (bstart + blen == bn)
- blen++;
- else {
- if (bstart) {
- __gfs2_free_blocks(ip, bstart, blen, metadata);
- btotal += blen;
- }
-
- bstart = bn;
- blen = 1;
- }
-
- *p = 0;
- gfs2_add_inode_blocks(&ip->i_inode, -1);
- }
- if (p == bottom)
- rg_blocks = 0;
-
- if (bstart) {
- __gfs2_free_blocks(ip, bstart, blen, metadata);
- btotal += blen;
- }
-
- gfs2_statfs_change(sdp, 0, +btotal, 0);
- gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid,
- ip->i_inode.i_gid);
-
- ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
-
- gfs2_dinode_out(ip, dibh->b_data);
-
- up_write(&ip->i_rw_mutex);
-
- gfs2_trans_end(sdp);
-
- if (rg_blocks)
- goto restart;
-
-out_rg_gunlock:
- gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
-out_rlist:
- gfs2_rlist_free(&rlist);
-out:
- return error;
-}
-
-/**
- * recursive_scan - recursively scan through the end of a file
- * @ip: the inode
- * @dibh: the dinode buffer
- * @mp: the path through the metadata to the point to start
- * @height: the height the recursion is at
- * @block: the indirect block to look at
- * @first: 1 if this is the first block
- * @sm: data opaque to this function to pass to @bc
- *
- * When this is first called @height and @block should be zero and
- * @first should be 1.
- *
- * Returns: errno
- */
-
-static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh,
- struct metapath *mp, unsigned int height,
- u64 block, int first, struct strip_mine *sm)
-{
- struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- struct buffer_head *bh = NULL;
- __be64 *top, *bottom;
- u64 bn;
- int error;
- int mh_size = sizeof(struct gfs2_meta_header);
-
- if (!height) {
- error = gfs2_meta_inode_buffer(ip, &bh);
- if (error)
- return error;
- dibh = bh;
-
- top = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + mp->mp_list[0];
- bottom = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + sdp->sd_diptrs;
- } else {
- error = gfs2_meta_indirect_buffer(ip, height, block, &bh);
- if (error)
- return error;
-
- top = (__be64 *)(bh->b_data + mh_size) +
- (first ? mp->mp_list[height] : 0);
-
- bottom = (__be64 *)(bh->b_data + mh_size) + sdp->sd_inptrs;
- }
-
- error = do_strip(ip, dibh, bh, top, bottom, height, sm);
- if (error)
- goto out;
-
- if (height < ip->i_height - 1) {
-
- gfs2_metapath_ra(ip->i_gl, bh, top);
-
- for (; top < bottom; top++, first = 0) {
- if (!*top)
- continue;
-
- bn = be64_to_cpu(*top);
-
- error = recursive_scan(ip, dibh, mp, height + 1, bn,
- first, sm);
- if (error)
- break;
- }
- }
-out:
- brelse(bh);
- return error;
-}
-
-
-/**
* gfs2_block_truncate_page - Deal with zeroing out data for truncate
*
* This is partly borrowed from ext3.
@@ -1106,41 +912,406 @@ out:
return error;
}
-static int trunc_dealloc(struct gfs2_inode *ip, u64 size)
+/**
+ * sweep_bh_for_rgrps - find an rgrp in a meta buffer and free blocks therein
+ * @ip: inode
+ * @rg_gh: holder of resource group glock
+ * @mp: current metapath fully populated with buffers
+ * @btotal: place to keep count of total blocks freed
+ * @hgt: height we're processing
+ * @first: true if this is the first call to this function for this height
+ *
+ * We sweep a metadata buffer (provided by the metapath) for blocks we need to
+ * free, and free them all. However, we do it one rgrp at a time. If this
+ * block has references to multiple rgrps, we break it into individual
+ * transactions. This allows other processes to use the rgrps while we're
+ * focused on a single one, for better concurrency / performance.
+ * At every transaction boundary, we rewrite the inode into the journal.
+ * That way the bitmaps are kept consistent with the inode and we can recover
+ * if we're interrupted by power-outages.
+ *
+ * Returns: 0, or return code if an error occurred.
+ * *btotal has the total number of blocks freed
+ */
+static int sweep_bh_for_rgrps(struct gfs2_inode *ip, struct gfs2_holder *rd_gh,
+ const struct metapath *mp, u32 *btotal, int hgt,
+ bool preserve1)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- unsigned int height = ip->i_height;
- u64 lblock;
- struct metapath mp;
- int error;
+ struct gfs2_rgrpd *rgd;
+ struct gfs2_trans *tr;
+ struct buffer_head *bh = mp->mp_bh[hgt];
+ __be64 *top, *bottom, *p;
+ int blks_outside_rgrp;
+ u64 bn, bstart, isize_blks;
+ s64 blen; /* needs to be s64 or gfs2_add_inode_blocks breaks */
+ int meta = ((hgt != ip->i_height - 1) ? 1 : 0);
+ int ret = 0;
+ bool buf_in_tr = false; /* buffer was added to transaction */
+
+ if (gfs2_metatype_check(sdp, bh,
+ (hgt ? GFS2_METATYPE_IN : GFS2_METATYPE_DI)))
+ return -EIO;
+
+more_rgrps:
+ blks_outside_rgrp = 0;
+ bstart = 0;
+ blen = 0;
+ top = metapointer(hgt, mp); /* first ptr from metapath */
+ /* If we're keeping some data at the truncation point, we've got to
+ preserve the metadata tree by adding 1 to the starting metapath. */
+ if (preserve1)
+ top++;
+
+ bottom = (__be64 *)(bh->b_data + bh->b_size);
+
+ for (p = top; p < bottom; p++) {
+ if (!*p)
+ continue;
+ bn = be64_to_cpu(*p);
+ if (gfs2_holder_initialized(rd_gh)) {
+ rgd = (struct gfs2_rgrpd *)rd_gh->gh_gl->gl_object;
+ gfs2_assert_withdraw(sdp,
+ gfs2_glock_is_locked_by_me(rd_gh->gh_gl));
+ } else {
+ rgd = gfs2_blk2rgrpd(sdp, bn, false);
+ ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
+ 0, rd_gh);
+ if (ret)
+ goto out;
+
+ /* Must be done with the rgrp glock held: */
+ if (gfs2_rs_active(&ip->i_res) &&
+ rgd == ip->i_res.rs_rbm.rgd)
+ gfs2_rs_deltree(&ip->i_res);
+ }
+
+ if (!rgrp_contains_block(rgd, bn)) {
+ blks_outside_rgrp++;
+ continue;
+ }
+
+ /* The size of our transactions will be unknown until we
+ actually process all the metadata blocks that relate to
+ the rgrp. So we estimate. We know it can't be more than
+ the dinode's i_blocks and we don't want to exceed the
+ journal flush threshold, sd_log_thresh2. */
+ if (current->journal_info == NULL) {
+ unsigned int jblocks_rqsted, revokes;
+
+ jblocks_rqsted = rgd->rd_length + RES_DINODE +
+ RES_INDIRECT;
+ isize_blks = gfs2_get_inode_blocks(&ip->i_inode);
+ if (isize_blks > atomic_read(&sdp->sd_log_thresh2))
+ jblocks_rqsted +=
+ atomic_read(&sdp->sd_log_thresh2);
+ else
+ jblocks_rqsted += isize_blks;
+ revokes = jblocks_rqsted;
+ if (meta)
+ revokes += hptrs(sdp, hgt);
+ else if (ip->i_depth)
+ revokes += sdp->sd_inptrs;
+ ret = gfs2_trans_begin(sdp, jblocks_rqsted, revokes);
+ if (ret)
+ goto out_unlock;
+ down_write(&ip->i_rw_mutex);
+ }
+ /* check if we will exceed the transaction blocks requested */
+ tr = current->journal_info;
+ if (tr->tr_num_buf_new + RES_STATFS +
+ RES_QUOTA >= atomic_read(&sdp->sd_log_thresh2)) {
+ /* We set blks_outside_rgrp to ensure the loop will
+ be repeated for the same rgrp, but with a new
+ transaction. */
+ blks_outside_rgrp++;
+ /* This next part is tricky. If the buffer was added
+ to the transaction, we've already set some block
+ pointers to 0, so we better follow through and free
+ them, or we will introduce corruption (so break).
+ This may be impossible, or at least rare, but I
+ decided to cover the case regardless.
+
+ If the buffer was not added to the transaction
+ (this call), doing so would exceed our transaction
+ size, so we need to end the transaction and start a
+ new one (so goto). */
+
+ if (buf_in_tr)
+ break;
+ goto out_unlock;
+ }
+
+ gfs2_trans_add_meta(ip->i_gl, bh);
+ buf_in_tr = true;
+ *p = 0;
+ if (bstart + blen == bn) {
+ blen++;
+ continue;
+ }
+ if (bstart) {
+ __gfs2_free_blocks(ip, bstart, (u32)blen, meta);
+ (*btotal) += blen;
+ gfs2_add_inode_blocks(&ip->i_inode, -blen);
+ }
+ bstart = bn;
+ blen = 1;
+ }
+ if (bstart) {
+ __gfs2_free_blocks(ip, bstart, (u32)blen, meta);
+ (*btotal) += blen;
+ gfs2_add_inode_blocks(&ip->i_inode, -blen);
+ }
+out_unlock:
+ if (!ret && blks_outside_rgrp) { /* If buffer still has non-zero blocks
+ outside the rgrp we just processed,
+ do it all over again. */
+ if (current->journal_info) {
+ struct buffer_head *dibh = mp->mp_bh[0];
+
+ /* Every transaction boundary, we rewrite the dinode
+ to keep its di_blocks current in case of failure. */
+ ip->i_inode.i_mtime = ip->i_inode.i_ctime =
+ current_time(&ip->i_inode);
+ gfs2_trans_add_meta(ip->i_gl, dibh);
+ gfs2_dinode_out(ip, dibh->b_data);
+ up_write(&ip->i_rw_mutex);
+ gfs2_trans_end(sdp);
+ }
+ gfs2_glock_dq_uninit(rd_gh);
+ cond_resched();
+ goto more_rgrps;
+ }
+out:
+ return ret;
+}
+
+/**
+ * find_nonnull_ptr - find a non-null pointer given a metapath and height
+ * assumes the metapath is valid (with buffers) out to height h
+ * @mp: starting metapath
+ * @h: desired height to search
+ *
+ * Returns: true if a non-null pointer was found in the metapath buffer
+ * false if all remaining pointers are NULL in the buffer
+ */
+static bool find_nonnull_ptr(struct gfs2_sbd *sdp, struct metapath *mp,
+ unsigned int h)
+{
+ __be64 *ptr;
+ unsigned int ptrs = hptrs(sdp, h) - 1;
+
+ while (true) {
+ ptr = metapointer(h, mp);
+ if (*ptr) /* if we have a non-null pointer */
+ return true;
+
+ if (mp->mp_list[h] < ptrs)
+ mp->mp_list[h]++;
+ else
+ return false; /* no more pointers in this buffer */
+ }
+}
+
+enum dealloc_states {
+ DEALLOC_MP_FULL = 0, /* Strip a metapath with all buffers read in */
+ DEALLOC_MP_LOWER = 1, /* lower the metapath strip height */
+ DEALLOC_FILL_MP = 2, /* Fill in the metapath to the given height. */
+ DEALLOC_DONE = 3, /* process complete */
+};
- if (!size)
+/**
+ * trunc_dealloc - truncate a file down to a desired size
+ * @ip: inode to truncate
+ * @newsize: The desired size of the file
+ *
+ * This function truncates a file to newsize. It works from the
+ * bottom up, and from the right to the left. In other words, it strips off
+ * the highest layer (data) before stripping any of the metadata. Doing it
+ * this way is best in case the operation is interrupted by power failure, etc.
+ * The dinode is rewritten in every transaction to guarantee integrity.
+ */
+static int trunc_dealloc(struct gfs2_inode *ip, u64 newsize)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ struct metapath mp;
+ struct buffer_head *dibh, *bh;
+ struct gfs2_holder rd_gh;
+ u64 lblock;
+ __u16 nbof[GFS2_MAX_META_HEIGHT]; /* new beginning of truncation */
+ unsigned int strip_h = ip->i_height - 1;
+ u32 btotal = 0;
+ int ret, state;
+ int mp_h; /* metapath buffers are read in to this height */
+ sector_t last_ra = 0;
+ u64 prev_bnr = 0;
+ bool preserve1; /* need to preserve the first meta pointer? */
+
+ if (!newsize)
lblock = 0;
else
- lblock = (size - 1) >> sdp->sd_sb.sb_bsize_shift;
+ lblock = (newsize - 1) >> sdp->sd_sb.sb_bsize_shift;
+ memset(&mp, 0, sizeof(mp));
find_metapath(sdp, lblock, &mp, ip->i_height);
- error = gfs2_rindex_update(sdp);
- if (error)
- return error;
- error = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
- if (error)
- return error;
+ memcpy(&nbof, &mp.mp_list, sizeof(nbof));
+
+ ret = gfs2_meta_inode_buffer(ip, &dibh);
+ if (ret)
+ return ret;
- while (height--) {
- struct strip_mine sm;
- sm.sm_first = !!size;
- sm.sm_height = height;
+ mp.mp_bh[0] = dibh;
+ ret = lookup_metapath(ip, &mp);
+ if (ret == ip->i_height)
+ state = DEALLOC_MP_FULL; /* We have a complete metapath */
+ else
+ state = DEALLOC_FILL_MP; /* deal with partial metapath */
- error = recursive_scan(ip, NULL, &mp, 0, 0, 1, &sm);
- if (error)
+ ret = gfs2_rindex_update(sdp);
+ if (ret)
+ goto out_metapath;
+
+ ret = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
+ if (ret)
+ goto out_metapath;
+ gfs2_holder_mark_uninitialized(&rd_gh);
+
+ mp_h = strip_h;
+
+ while (state != DEALLOC_DONE) {
+ switch (state) {
+ /* Truncate a full metapath at the given strip height.
+ * Note that strip_h == mp_h in order to be in this state. */
+ case DEALLOC_MP_FULL:
+ if (mp_h > 0) { /* issue read-ahead on metadata */
+ __be64 *top;
+
+ bh = mp.mp_bh[mp_h - 1];
+ if (bh->b_blocknr != last_ra) {
+ last_ra = bh->b_blocknr;
+ top = metaptr1(mp_h - 1, &mp);
+ gfs2_metapath_ra(ip->i_gl, bh, top);
+ }
+ }
+ /* If we're truncating to a non-zero size and the mp is
+ at the beginning of file for the strip height, we
+ need to preserve the first metadata pointer. */
+ preserve1 = (newsize &&
+ (mp.mp_list[mp_h] == nbof[mp_h]));
+ bh = mp.mp_bh[mp_h];
+ gfs2_assert_withdraw(sdp, bh);
+ if (gfs2_assert_withdraw(sdp,
+ prev_bnr != bh->b_blocknr)) {
+ printk(KERN_EMERG "GFS2: fsid=%s:inode %llu, "
+ "block:%llu, i_h:%u, s_h:%u, mp_h:%u\n",
+ sdp->sd_fsname,
+ (unsigned long long)ip->i_no_addr,
+ prev_bnr, ip->i_height, strip_h, mp_h);
+ }
+ prev_bnr = bh->b_blocknr;
+ ret = sweep_bh_for_rgrps(ip, &rd_gh, &mp, &btotal,
+ mp_h, preserve1);
+ /* If we hit an error or just swept dinode buffer,
+ just exit. */
+ if (ret || !mp_h) {
+ state = DEALLOC_DONE;
+ break;
+ }
+ state = DEALLOC_MP_LOWER;
+ break;
+
+ /* lower the metapath strip height */
+ case DEALLOC_MP_LOWER:
+ /* We're done with the current buffer, so release it,
+ unless it's the dinode buffer. Then back up to the
+ previous pointer. */
+ if (mp_h) {
+ brelse(mp.mp_bh[mp_h]);
+ mp.mp_bh[mp_h] = NULL;
+ }
+ /* If we can't get any lower in height, we've stripped
+ off all we can. Next step is to back up and start
+ stripping the previous level of metadata. */
+ if (mp_h == 0) {
+ strip_h--;
+ memcpy(&mp.mp_list, &nbof, sizeof(nbof));
+ mp_h = strip_h;
+ state = DEALLOC_FILL_MP;
+ break;
+ }
+ mp.mp_list[mp_h] = 0;
+ mp_h--; /* search one metadata height down */
+ if (mp.mp_list[mp_h] >= hptrs(sdp, mp_h) - 1)
+ break; /* loop around in the same state */
+ mp.mp_list[mp_h]++;
+ /* Here we've found a part of the metapath that is not
+ * allocated. We need to search at that height for the
+ * next non-null pointer. */
+ if (find_nonnull_ptr(sdp, &mp, mp_h)) {
+ state = DEALLOC_FILL_MP;
+ mp_h++;
+ }
+ /* No more non-null pointers at this height. Back up
+ to the previous height and try again. */
+ break; /* loop around in the same state */
+
+ /* Fill the metapath with buffers to the given height. */
+ case DEALLOC_FILL_MP:
+ /* Fill the buffers out to the current height. */
+ ret = fillup_metapath(ip, &mp, mp_h);
+ if (ret < 0)
+ goto out;
+
+ /* If buffers found for the entire strip height */
+ if ((ret == ip->i_height) && (mp_h == strip_h)) {
+ state = DEALLOC_MP_FULL;
+ break;
+ }
+ if (ret < ip->i_height) /* We have a partial height */
+ mp_h = ret - 1;
+
+ /* If we find a non-null block pointer, crawl a bit
+ higher up in the metapath and try again, otherwise
+ we need to look lower for a new starting point. */
+ if (find_nonnull_ptr(sdp, &mp, mp_h))
+ mp_h++;
+ else
+ state = DEALLOC_MP_LOWER;
break;
+ }
}
- gfs2_quota_unhold(ip);
+ if (btotal) {
+ if (current->journal_info == NULL) {
+ ret = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS +
+ RES_QUOTA, 0);
+ if (ret)
+ goto out;
+ down_write(&ip->i_rw_mutex);
+ }
+ gfs2_statfs_change(sdp, 0, +btotal, 0);
+ gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid,
+ ip->i_inode.i_gid);
+ ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
+ gfs2_trans_add_meta(ip->i_gl, dibh);
+ gfs2_dinode_out(ip, dibh->b_data);
+ up_write(&ip->i_rw_mutex);
+ gfs2_trans_end(sdp);
+ }
- return error;
+out:
+ if (gfs2_holder_initialized(&rd_gh))
+ gfs2_glock_dq_uninit(&rd_gh);
+ if (current->journal_info) {
+ up_write(&ip->i_rw_mutex);
+ gfs2_trans_end(sdp);
+ cond_resched();
+ }
+ gfs2_quota_unhold(ip);
+out_metapath:
+ release_metapath(&mp);
+ return ret;
}
static int trunc_end(struct gfs2_inode *ip)
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 6fe2a59c6a9a..c2062a108d19 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -911,11 +911,15 @@ out_qunlock:
static long gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
{
struct inode *inode = file_inode(file);
+ struct gfs2_sbd *sdp = GFS2_SB(inode);
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_holder gh;
int ret;
- if ((mode & ~FALLOC_FL_KEEP_SIZE) || gfs2_is_jdata(ip))
+ if (mode & ~FALLOC_FL_KEEP_SIZE)
+ return -EOPNOTSUPP;
+ /* fallocate is needed by gfs2_grow to reserve space in the rindex */
+ if (gfs2_is_jdata(ip) && inode != sdp->sd_rindex)
return -EOPNOTSUPP;
inode_lock(inode);
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index ec0848fcca02..959a19ced4d5 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -73,7 +73,7 @@ static DEFINE_SPINLOCK(lru_lock);
static struct rhashtable_params ht_parms = {
.nelem_hint = GFS2_GL_HASH_SIZE * 3 / 4,
- .key_len = sizeof(struct lm_lockname),
+ .key_len = offsetofend(struct lm_lockname, ln_type),
.key_offset = offsetof(struct gfs2_glock, gl_name),
.head_offset = offsetof(struct gfs2_glock, gl_node),
};
@@ -449,6 +449,9 @@ __acquires(&gl->gl_lockref.lock)
unsigned int lck_flags = (unsigned int)(gh ? gh->gh_flags : 0);
int ret;
+ if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) &&
+ target != LM_ST_UNLOCKED)
+ return;
lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP |
LM_FLAG_PRIORITY);
GLOCK_BUG_ON(gl, gl->gl_state == target);
@@ -484,7 +487,8 @@ __acquires(&gl->gl_lockref.lock)
}
else if (ret) {
pr_err("lm_lock ret %d\n", ret);
- GLOCK_BUG_ON(gl, 1);
+ GLOCK_BUG_ON(gl, !test_bit(SDF_SHUTDOWN,
+ &sdp->sd_flags));
}
} else { /* lock_nolock */
finish_xmote(gl, target);
@@ -653,10 +657,10 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
struct lm_lockname name = { .ln_number = number,
.ln_type = glops->go_type,
.ln_sbd = sdp };
- struct gfs2_glock *gl, *tmp = NULL;
+ struct gfs2_glock *gl, *tmp;
struct address_space *mapping;
struct kmem_cache *cachep;
- int ret, tries = 0;
+ int ret = 0;
rcu_read_lock();
gl = rhashtable_lookup_fast(&gl_hash_table, &name, ht_parms);
@@ -721,35 +725,32 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
}
again:
- ret = rhashtable_lookup_insert_fast(&gl_hash_table, &gl->gl_node,
- ht_parms);
- if (ret == 0) {
+ rcu_read_lock();
+ tmp = rhashtable_lookup_get_insert_fast(&gl_hash_table, &gl->gl_node,
+ ht_parms);
+ if (!tmp) {
*glp = gl;
- return 0;
+ goto out;
}
-
- if (ret == -EEXIST) {
- ret = 0;
- rcu_read_lock();
- tmp = rhashtable_lookup_fast(&gl_hash_table, &name, ht_parms);
- if (tmp == NULL || !lockref_get_not_dead(&tmp->gl_lockref)) {
- if (++tries < 100) {
- rcu_read_unlock();
- cond_resched();
- goto again;
- }
- tmp = NULL;
- ret = -ENOMEM;
- }
- rcu_read_unlock();
- } else {
- WARN_ON_ONCE(ret);
+ if (IS_ERR(tmp)) {
+ ret = PTR_ERR(tmp);
+ goto out_free;
}
+ if (lockref_get_not_dead(&tmp->gl_lockref)) {
+ *glp = tmp;
+ goto out_free;
+ }
+ rcu_read_unlock();
+ cond_resched();
+ goto again;
+
+out_free:
kfree(gl->gl_lksb.sb_lvbptr);
kmem_cache_free(cachep, gl);
atomic_dec(&sdp->sd_glock_disposal);
- *glp = tmp;
+out:
+ rcu_read_unlock();
return ret;
}
@@ -1918,10 +1919,10 @@ static const struct seq_operations gfs2_sbstats_seq_ops = {
#define GFS2_SEQ_GOODSIZE min(PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER, 65536UL)
-static int gfs2_glocks_open(struct inode *inode, struct file *file)
+static int __gfs2_glocks_open(struct inode *inode, struct file *file,
+ const struct seq_operations *ops)
{
- int ret = seq_open_private(file, &gfs2_glock_seq_ops,
- sizeof(struct gfs2_glock_iter));
+ int ret = seq_open_private(file, ops, sizeof(struct gfs2_glock_iter));
if (ret == 0) {
struct seq_file *seq = file->private_data;
struct gfs2_glock_iter *gi = seq->private;
@@ -1932,11 +1933,16 @@ static int gfs2_glocks_open(struct inode *inode, struct file *file)
if (seq->buf)
seq->size = GFS2_SEQ_GOODSIZE;
gi->gl = NULL;
- ret = rhashtable_walk_init(&gl_hash_table, &gi->hti, GFP_KERNEL);
+ rhashtable_walk_enter(&gl_hash_table, &gi->hti);
}
return ret;
}
+static int gfs2_glocks_open(struct inode *inode, struct file *file)
+{
+ return __gfs2_glocks_open(inode, file, &gfs2_glock_seq_ops);
+}
+
static int gfs2_glocks_release(struct inode *inode, struct file *file)
{
struct seq_file *seq = file->private_data;
@@ -1949,20 +1955,7 @@ static int gfs2_glocks_release(struct inode *inode, struct file *file)
static int gfs2_glstats_open(struct inode *inode, struct file *file)
{
- int ret = seq_open_private(file, &gfs2_glstats_seq_ops,
- sizeof(struct gfs2_glock_iter));
- if (ret == 0) {
- struct seq_file *seq = file->private_data;
- struct gfs2_glock_iter *gi = seq->private;
- gi->sdp = inode->i_private;
- gi->last_pos = 0;
- seq->buf = kmalloc(GFS2_SEQ_GOODSIZE, GFP_KERNEL | __GFP_NOWARN);
- if (seq->buf)
- seq->size = GFS2_SEQ_GOODSIZE;
- gi->gl = NULL;
- ret = rhashtable_walk_init(&gl_hash_table, &gi->hti, GFP_KERNEL);
- }
- return ret;
+ return __gfs2_glocks_open(inode, file, &gfs2_glstats_seq_ops);
}
static int gfs2_sbstats_open(struct inode *inode, struct file *file)
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 511e1ed7e2de..b7cf65d13561 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -203,11 +203,15 @@ enum {
DFL_DLM_RECOVERY = 6,
};
+/*
+ * We are using struct lm_lockname as an rhashtable key. Avoid holes within
+ * the struct; padding at the end is fine.
+ */
struct lm_lockname {
- struct gfs2_sbd *ln_sbd;
u64 ln_number;
+ struct gfs2_sbd *ln_sbd;
unsigned int ln_type;
-} __packed __aligned(sizeof(int));
+};
#define lm_name_equal(name1, name2) \
(((name1)->ln_number == (name2)->ln_number) && \
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index e279c3ce27be..9f605ea4810c 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -202,8 +202,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
fail_refresh:
ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
ip->i_iopen_gh.gh_gl->gl_object = NULL;
- gfs2_glock_dq_wait(&ip->i_iopen_gh);
- gfs2_holder_uninit(&ip->i_iopen_gh);
+ gfs2_glock_dq_uninit(&ip->i_iopen_gh);
fail_put:
if (io_gl)
gfs2_glock_put(io_gl);
@@ -667,6 +666,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
ip->i_height = 0;
ip->i_depth = 0;
ip->i_entries = 0;
+ ip->i_no_addr = 0; /* Temporarily zero until real addr is assigned */
switch(mode & S_IFMT) {
case S_IFREG:
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index b108e7ba81af..ed67548b286c 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -23,6 +23,7 @@
#include <linux/quotaops.h>
#include <linux/lockdep.h>
#include <linux/module.h>
+#include <linux/backing-dev.h>
#include "gfs2.h"
#include "incore.h"
@@ -1222,12 +1223,7 @@ static int set_gfs2_super(struct super_block *s, void *data)
{
s->s_bdev = data;
s->s_dev = s->s_bdev->bd_dev;
-
- /*
- * We set the bdi here to the queue backing, file systems can
- * overwrite this in ->fill_super()
- */
- s->s_bdi = bdev_get_queue(s->s_bdev)->backing_dev_info;
+ s->s_bdi = bdi_get(s->s_bdev->bd_bdi);
return 0;
}
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 86ccc0159393..83c9909ff14a 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -483,13 +483,6 @@ void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
}
}
-static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block)
-{
- u64 first = rgd->rd_data0;
- u64 last = first + rgd->rd_data;
- return first <= block && block < last;
-}
-
/**
* gfs2_blk2rgrpd - Find resource group for a given data/meta block number
* @sdp: The GFS2 superblock
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 66b51cf66dfa..e90478e2f545 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -83,5 +83,12 @@ static inline bool gfs2_rs_active(const struct gfs2_blkreserv *rs)
return rs && !RB_EMPTY_NODE(&rs->rs_node);
}
+static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block)
+{
+ u64 first = rgd->rd_data0;
+ u64 last = first + rgd->rd_data;
+ return first <= block && block < last;
+}
+
extern void check_and_update_goal(struct gfs2_inode *ip);
#endif /* __RGRP_DOT_H__ */
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 361796a84fce..29b0473f6e74 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -793,7 +793,8 @@ static void gfs2_dirty_inode(struct inode *inode, int flags)
if (!(flags & (I_DIRTY_DATASYNC|I_DIRTY_SYNC)))
return;
-
+ if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+ return;
if (!gfs2_glock_is_locked_by_me(ip->i_gl)) {
ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
if (ret) {
@@ -1538,8 +1539,7 @@ static void gfs2_evict_inode(struct inode *inode)
error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, &gh);
if (unlikely(error)) {
ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
- gfs2_glock_dq_wait(&ip->i_iopen_gh);
- gfs2_holder_uninit(&ip->i_iopen_gh);
+ gfs2_glock_dq_uninit(&ip->i_iopen_gh);
goto out;
}
@@ -1617,7 +1617,7 @@ out_unlock:
if (gfs2_holder_initialized(&ip->i_iopen_gh)) {
if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) {
ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
- gfs2_glock_dq_wait(&ip->i_iopen_gh);
+ gfs2_glock_dq(&ip->i_iopen_gh);
}
gfs2_holder_uninit(&ip->i_iopen_gh);
}
@@ -1639,8 +1639,7 @@ out:
if (gfs2_holder_initialized(&ip->i_iopen_gh)) {
ip->i_iopen_gh.gh_gl->gl_object = NULL;
ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
- gfs2_glock_dq_wait(&ip->i_iopen_gh);
- gfs2_holder_uninit(&ip->i_iopen_gh);
+ gfs2_glock_dq_uninit(&ip->i_iopen_gh);
}
}
diff --git a/fs/hfs/extent.c b/fs/hfs/extent.c
index e33a0d36a93e..5d0182654580 100644
--- a/fs/hfs/extent.c
+++ b/fs/hfs/extent.c
@@ -485,8 +485,8 @@ void hfs_file_truncate(struct inode *inode)
/* XXX: Can use generic_cont_expand? */
size = inode->i_size - 1;
- res = pagecache_write_begin(NULL, mapping, size+1, 0,
- AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
+ res = pagecache_write_begin(NULL, mapping, size+1, 0, 0,
+ &page, &fsdata);
if (!res) {
res = pagecache_write_end(NULL, mapping, size+1, 0, 0,
page, fsdata);
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index feca524ce2a5..a3eb640b4f8f 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -545,9 +545,8 @@ void hfsplus_file_truncate(struct inode *inode)
void *fsdata;
loff_t size = inode->i_size;
- res = pagecache_write_begin(NULL, mapping, size, 0,
- AOP_FLAG_UNINTERRUPTIBLE,
- &page, &fsdata);
+ res = pagecache_write_begin(NULL, mapping, size, 0, 0,
+ &page, &fsdata);
if (res)
return;
res = pagecache_write_end(NULL, mapping, size,
diff --git a/fs/inode.c b/fs/inode.c
index 88110fd0b282..db5914783a71 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -119,7 +119,7 @@ static int no_open(struct inode *inode, struct file *file)
}
/**
- * inode_init_always - perform inode structure intialisation
+ * inode_init_always - perform inode structure initialisation
* @sb: superblock inode belongs to
* @inode: inode to initialise
*
@@ -371,9 +371,6 @@ void inode_init_once(struct inode *inode)
INIT_LIST_HEAD(&inode->i_lru);
address_space_init_once(&inode->i_data);
i_size_ordered_init(inode);
-#ifdef CONFIG_FSNOTIFY
- INIT_HLIST_HEAD(&inode->i_fsnotify_marks);
-#endif
}
EXPORT_SYMBOL(inode_init_once);
@@ -405,6 +402,8 @@ static void inode_lru_list_add(struct inode *inode)
{
if (list_lru_add(&inode->i_sb->s_inode_lru, &inode->i_lru))
this_cpu_inc(nr_unused);
+ else
+ inode->i_state |= I_REFERENCED;
}
/*
@@ -1492,7 +1491,6 @@ static void iput_final(struct inode *inode)
drop = generic_drop_inode(inode);
if (!drop && (sb->s_flags & MS_ACTIVE)) {
- inode->i_state |= I_REFERENCED;
inode_add_lru(inode);
spin_unlock(&inode->i_lock);
return;
diff --git a/fs/internal.h b/fs/internal.h
index 11c6d89dce9c..9676fe11c093 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -108,8 +108,6 @@ extern struct file *do_filp_open(int dfd, struct filename *pathname,
extern struct file *do_file_open_root(struct dentry *, struct vfsmount *,
const char *, const struct open_flags *);
-extern long do_handle_open(int mountdirfd,
- struct file_handle __user *ufh, int open_flag);
extern int open_check_o_direct(struct file *f);
extern int vfs_open(const struct path *, struct file *, const struct cred *);
extern struct file *filp_clone_open(struct file *);
@@ -128,8 +126,6 @@ static inline bool atime_needs_update_rcu(const struct path *path,
return __atime_needs_update(path, inode, true);
}
-extern bool atime_needs_update_rcu(const struct path *, struct inode *);
-
/*
* fs-writeback.c
*/
diff --git a/fs/iomap.c b/fs/iomap.c
index 141c3cd55a8b..4b10892967a5 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -158,12 +158,6 @@ iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
ssize_t written = 0;
unsigned int flags = AOP_FLAG_NOFS;
- /*
- * Copies from kernel address space cannot fail (NFSD is a big user).
- */
- if (!iter_is_iovec(i))
- flags |= AOP_FLAG_UNINTERRUPTIBLE;
-
do {
struct page *page;
unsigned long offset; /* Offset into pagecache page */
@@ -291,8 +285,7 @@ iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
return PTR_ERR(rpage);
status = iomap_write_begin(inode, pos, bytes,
- AOP_FLAG_NOFS | AOP_FLAG_UNINTERRUPTIBLE,
- &page, iomap);
+ AOP_FLAG_NOFS, &page, iomap);
put_page(rpage);
if (unlikely(status))
return status;
@@ -343,8 +336,8 @@ static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
struct page *page;
int status;
- status = iomap_write_begin(inode, pos, bytes,
- AOP_FLAG_UNINTERRUPTIBLE | AOP_FLAG_NOFS, &page, iomap);
+ status = iomap_write_begin(inode, pos, bytes, AOP_FLAG_NOFS, &page,
+ iomap);
if (status)
return status;
@@ -360,7 +353,8 @@ static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
sector_t sector = iomap->blkno +
(((pos & ~(PAGE_SIZE - 1)) - iomap->offset) >> 9);
- return __dax_zero_page_range(iomap->bdev, sector, offset, bytes);
+ return __dax_zero_page_range(iomap->bdev, iomap->dax_dev, sector,
+ offset, bytes);
}
static loff_t
@@ -887,16 +881,14 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
flags |= IOMAP_WRITE;
}
- if (mapping->nrpages) {
- ret = filemap_write_and_wait_range(mapping, start, end);
- if (ret)
- goto out_free_dio;
+ ret = filemap_write_and_wait_range(mapping, start, end);
+ if (ret)
+ goto out_free_dio;
- ret = invalidate_inode_pages2_range(mapping,
- start >> PAGE_SHIFT, end >> PAGE_SHIFT);
- WARN_ON_ONCE(ret);
- ret = 0;
- }
+ ret = invalidate_inode_pages2_range(mapping,
+ start >> PAGE_SHIFT, end >> PAGE_SHIFT);
+ WARN_ON_ONCE(ret);
+ ret = 0;
inode_dio_begin(inode);
@@ -911,6 +903,9 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
break;
}
pos += ret;
+
+ if (iov_iter_rw(iter) == READ && pos >= dio->i_size)
+ break;
} while ((count = iov_iter_count(iter)) > 0);
blk_finish_plug(&plug);
@@ -951,7 +946,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
* one is a pretty crazy thing to do, so we don't support it 100%. If
* this invalidation fails, tough, the write still worked...
*/
- if (iov_iter_rw(iter) == WRITE && mapping->nrpages) {
+ if (iov_iter_rw(iter) == WRITE) {
int err = invalidate_inode_pages2_range(mapping,
start >> PAGE_SHIFT, end >> PAGE_SHIFT);
WARN_ON_ONCE(err);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 5adc2fb62b0f..ebad34266bcf 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -43,6 +43,7 @@
#include <linux/backing-dev.h>
#include <linux/bitops.h>
#include <linux/ratelimit.h>
+#include <linux/sched/mm.h>
#define CREATE_TRACE_POINTS
#include <trace/events/jbd2.h>
@@ -206,6 +207,14 @@ static int kjournald2(void *arg)
wake_up(&journal->j_wait_done_commit);
/*
+ * Make sure that no allocations from this kernel thread will ever
+ * recurse to the fs layer because we are responsible for the
+ * transaction commit and any fs involvement might get stuck waiting for
+ * the trasn. commit.
+ */
+ memalloc_nofs_save();
+
+ /*
* And now, wait forever for commit wakeup events.
*/
write_lock(&journal->j_state_lock);
@@ -691,8 +700,21 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
{
int err = 0;
- jbd2_might_wait_for_commit(journal);
read_lock(&journal->j_state_lock);
+#ifdef CONFIG_PROVE_LOCKING
+ /*
+ * Some callers make sure transaction is already committing and in that
+ * case we cannot block on open handles anymore. So don't warn in that
+ * case.
+ */
+ if (tid_gt(tid, journal->j_commit_sequence) &&
+ (!journal->j_committing_transaction ||
+ journal->j_committing_transaction->t_tid != tid)) {
+ read_unlock(&journal->j_state_lock);
+ jbd2_might_wait_for_commit(journal);
+ read_lock(&journal->j_state_lock);
+ }
+#endif
#ifdef CONFIG_JBD2_DEBUG
if (!tid_geq(journal->j_commit_request, tid)) {
printk(KERN_ERR
@@ -913,7 +935,8 @@ int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
* space and if we lose sb update during power failure we'd replay
* old transaction with possibly newly overwritten data.
*/
- ret = jbd2_journal_update_sb_log_tail(journal, tid, block, REQ_FUA);
+ ret = jbd2_journal_update_sb_log_tail(journal, tid, block,
+ REQ_SYNC | REQ_FUA);
if (ret)
goto out;
@@ -1314,7 +1337,7 @@ static int journal_reset(journal_t *journal)
jbd2_journal_update_sb_log_tail(journal,
journal->j_tail_sequence,
journal->j_tail,
- REQ_FUA);
+ REQ_SYNC | REQ_FUA);
mutex_unlock(&journal->j_checkpoint_mutex);
}
return jbd2_journal_start_thread(journal);
@@ -1454,7 +1477,7 @@ void jbd2_journal_update_sb_errno(journal_t *journal)
sb->s_errno = cpu_to_be32(journal->j_errno);
read_unlock(&journal->j_state_lock);
- jbd2_write_superblock(journal, REQ_FUA);
+ jbd2_write_superblock(journal, REQ_SYNC | REQ_FUA);
}
EXPORT_SYMBOL(jbd2_journal_update_sb_errno);
@@ -1721,7 +1744,7 @@ int jbd2_journal_destroy(journal_t *journal)
write_unlock(&journal->j_state_lock);
jbd2_mark_journal_empty(journal,
- REQ_PREFLUSH | REQ_FUA);
+ REQ_SYNC | REQ_PREFLUSH | REQ_FUA);
mutex_unlock(&journal->j_checkpoint_mutex);
} else
err = -EIO;
@@ -1980,7 +2003,7 @@ int jbd2_journal_flush(journal_t *journal)
* the magic code for a fully-recovered superblock. Any future
* commits of data to the journal will restore the current
* s_start value. */
- jbd2_mark_journal_empty(journal, REQ_FUA);
+ jbd2_mark_journal_empty(journal, REQ_SYNC | REQ_FUA);
mutex_unlock(&journal->j_checkpoint_mutex);
write_lock(&journal->j_state_lock);
J_ASSERT(!journal->j_running_transaction);
@@ -2026,7 +2049,7 @@ int jbd2_journal_wipe(journal_t *journal, int write)
if (write) {
/* Lock to make assertions happy... */
mutex_lock(&journal->j_checkpoint_mutex);
- jbd2_mark_journal_empty(journal, REQ_FUA);
+ jbd2_mark_journal_empty(journal, REQ_SYNC | REQ_FUA);
mutex_unlock(&journal->j_checkpoint_mutex);
}
@@ -2340,7 +2363,7 @@ static int jbd2_journal_init_journal_head_cache(void)
jbd2_journal_head_cache = kmem_cache_create("jbd2_journal_head",
sizeof(struct journal_head),
0, /* offset */
- SLAB_TEMPORARY | SLAB_DESTROY_BY_RCU,
+ SLAB_TEMPORARY | SLAB_TYPESAFE_BY_RCU,
NULL); /* ctor */
retval = 0;
if (!jbd2_journal_head_cache) {
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 5e659ee08d6a..9ee4832b6f8b 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -29,6 +29,7 @@
#include <linux/backing-dev.h>
#include <linux/bug.h>
#include <linux/module.h>
+#include <linux/sched/mm.h>
#include <trace/events/jbd2.h>
@@ -388,6 +389,11 @@ repeat:
rwsem_acquire_read(&journal->j_trans_commit_map, 0, 0, _THIS_IP_);
jbd2_journal_free_transaction(new_transaction);
+ /*
+ * Ensure that no allocations done while the transaction is open are
+ * going to recurse back to the fs layer.
+ */
+ handle->saved_alloc_context = memalloc_nofs_save();
return 0;
}
@@ -466,6 +472,7 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks,
trace_jbd2_handle_start(journal->j_fs_dev->bd_dev,
handle->h_transaction->t_tid, type,
line_no, nblocks);
+
return handle;
}
EXPORT_SYMBOL(jbd2__journal_start);
@@ -1760,6 +1767,11 @@ int jbd2_journal_stop(handle_t *handle)
if (handle->h_rsv_handle)
jbd2_journal_free_reserved(handle->h_rsv_handle);
free_and_exit:
+ /*
+ * Scope of the GFP_NOFS context is over here and so we can restore the
+ * original alloc context.
+ */
+ memalloc_nofs_restore(handle->saved_alloc_context);
jbd2_free_handle(handle);
return err;
}
diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c
index fc89f9436784..5c5ac5b3aec3 100644
--- a/fs/jfs/ioctl.c
+++ b/fs/jfs/ioctl.c
@@ -64,7 +64,6 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
switch (cmd) {
case JFS_IOC_GETFLAGS:
- jfs_get_inode_flags(jfs_inode);
flags = jfs_inode->mode2 & JFS_FL_USER_VISIBLE;
flags = jfs_map_ext2(flags, 0);
return put_user(flags, (int __user *) arg);
@@ -98,7 +97,6 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
/* Lock against other parallel changes of flags */
inode_lock(inode);
- jfs_get_inode_flags(jfs_inode);
oldflags = jfs_inode->mode2;
/*
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 6aca224a5d68..f36ef68905a7 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -3148,7 +3148,6 @@ static void copy_to_dinode(struct dinode * dip, struct inode *ip)
else
dip->di_gid = cpu_to_le32(from_kgid(&init_user_ns,
jfs_ip->saved_gid));
- jfs_get_inode_flags(jfs_ip);
/*
* mode2 is only needed for storing the higher order bits.
* Trust i_mode for the lower order ones
diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c
index 375dd257a34f..5e9b7bb3aabf 100644
--- a/fs/jfs/jfs_inode.c
+++ b/fs/jfs/jfs_inode.c
@@ -45,24 +45,6 @@ void jfs_set_inode_flags(struct inode *inode)
S_DIRSYNC | S_SYNC);
}
-void jfs_get_inode_flags(struct jfs_inode_info *jfs_ip)
-{
- unsigned int flags = jfs_ip->vfs_inode.i_flags;
-
- jfs_ip->mode2 &= ~(JFS_IMMUTABLE_FL | JFS_APPEND_FL | JFS_NOATIME_FL |
- JFS_DIRSYNC_FL | JFS_SYNC_FL);
- if (flags & S_IMMUTABLE)
- jfs_ip->mode2 |= JFS_IMMUTABLE_FL;
- if (flags & S_APPEND)
- jfs_ip->mode2 |= JFS_APPEND_FL;
- if (flags & S_NOATIME)
- jfs_ip->mode2 |= JFS_NOATIME_FL;
- if (flags & S_DIRSYNC)
- jfs_ip->mode2 |= JFS_DIRSYNC_FL;
- if (flags & S_SYNC)
- jfs_ip->mode2 |= JFS_SYNC_FL;
-}
-
/*
* NAME: ialloc()
*
diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h
index 9271cfe4a149..7b0b3a40788f 100644
--- a/fs/jfs/jfs_inode.h
+++ b/fs/jfs/jfs_inode.h
@@ -33,7 +33,6 @@ extern void jfs_truncate(struct inode *);
extern void jfs_truncate_nolock(struct inode *, loff_t);
extern void jfs_free_zero_link(struct inode *);
extern struct dentry *jfs_get_parent(struct dentry *dentry);
-extern void jfs_get_inode_flags(struct jfs_inode_info *);
extern struct dentry *jfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
int fh_len, int fh_type);
extern struct dentry *jfs_fh_to_parent(struct super_block *sb, struct fid *fid,
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index c64c2574a0aa..e8aad7d87b8c 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -45,6 +45,7 @@
#include "jfs_acl.h"
#include "jfs_debug.h"
#include "jfs_xattr.h"
+#include "jfs_dinode.h"
MODULE_DESCRIPTION("The Journaled Filesystem (JFS)");
MODULE_AUTHOR("Steve Best/Dave Kleikamp/Barry Arndt, IBM");
@@ -181,6 +182,35 @@ static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf)
return 0;
}
+#ifdef CONFIG_QUOTA
+static int jfs_quota_off(struct super_block *sb, int type);
+static int jfs_quota_on(struct super_block *sb, int type, int format_id,
+ const struct path *path);
+
+static void jfs_quota_off_umount(struct super_block *sb)
+{
+ int type;
+
+ for (type = 0; type < MAXQUOTAS; type++)
+ jfs_quota_off(sb, type);
+}
+
+static const struct quotactl_ops jfs_quotactl_ops = {
+ .quota_on = jfs_quota_on,
+ .quota_off = jfs_quota_off,
+ .quota_sync = dquot_quota_sync,
+ .get_state = dquot_get_state,
+ .set_info = dquot_set_dqinfo,
+ .get_dqblk = dquot_get_dqblk,
+ .set_dqblk = dquot_set_dqblk,
+ .get_nextdqblk = dquot_get_next_dqblk,
+};
+#else
+static inline void jfs_quota_off_umount(struct super_block *sb)
+{
+}
+#endif
+
static void jfs_put_super(struct super_block *sb)
{
struct jfs_sb_info *sbi = JFS_SBI(sb);
@@ -188,7 +218,7 @@ static void jfs_put_super(struct super_block *sb)
jfs_info("In jfs_put_super");
- dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
+ jfs_quota_off_umount(sb);
rc = jfs_umount(sb);
if (rc)
@@ -536,7 +566,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_xattr = jfs_xattr_handlers;
#ifdef CONFIG_QUOTA
sb->dq_op = &dquot_operations;
- sb->s_qcop = &dquot_quotactl_ops;
+ sb->s_qcop = &jfs_quotactl_ops;
sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
#endif
@@ -840,6 +870,51 @@ static struct dquot **jfs_get_dquots(struct inode *inode)
{
return JFS_IP(inode)->i_dquot;
}
+
+static int jfs_quota_on(struct super_block *sb, int type, int format_id,
+ const struct path *path)
+{
+ int err;
+ struct inode *inode;
+
+ err = dquot_quota_on(sb, type, format_id, path);
+ if (err)
+ return err;
+
+ inode = d_inode(path->dentry);
+ inode_lock(inode);
+ JFS_IP(inode)->mode2 |= JFS_NOATIME_FL | JFS_IMMUTABLE_FL;
+ inode_set_flags(inode, S_NOATIME | S_IMMUTABLE,
+ S_NOATIME | S_IMMUTABLE);
+ inode_unlock(inode);
+ mark_inode_dirty(inode);
+
+ return 0;
+}
+
+static int jfs_quota_off(struct super_block *sb, int type)
+{
+ struct inode *inode = sb_dqopt(sb)->files[type];
+ int err;
+
+ if (!inode || !igrab(inode))
+ goto out;
+
+ err = dquot_quota_off(sb, type);
+ if (err)
+ goto out_put;
+
+ inode_lock(inode);
+ JFS_IP(inode)->mode2 &= ~(JFS_NOATIME_FL | JFS_IMMUTABLE_FL);
+ inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE);
+ inode_unlock(inode);
+ mark_inode_dirty(inode);
+out_put:
+ iput(inode);
+ return err;
+out:
+ return dquot_quota_off(sb, type);
+}
#endif
static const struct super_operations jfs_super_operations = {
diff --git a/fs/libfs.c b/fs/libfs.c
index a8b62e5d43a9..a04395334bb1 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -507,7 +507,7 @@ EXPORT_SYMBOL(simple_write_end);
* to pass it an appropriate max_reserved value to avoid collisions.
*/
int simple_fill_super(struct super_block *s, unsigned long magic,
- struct tree_descr *files)
+ const struct tree_descr *files)
{
struct inode *inode;
struct dentry *root;
diff --git a/fs/mount.h b/fs/mount.h
index 2826543a131d..bf1fda6eed8f 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -59,7 +59,7 @@ struct mount {
struct mountpoint *mnt_mp; /* where is it mounted */
struct hlist_node mnt_mp_list; /* list mounts with the same mountpoint */
#ifdef CONFIG_FSNOTIFY
- struct hlist_head mnt_fsnotify_marks;
+ struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks;
__u32 mnt_fsnotify_mask;
#endif
int mnt_id; /* mount identifier */
diff --git a/fs/namei.c b/fs/namei.c
index 19dcf62133cc..7286f87ce863 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -340,22 +340,14 @@ int generic_permission(struct inode *inode, int mask)
if (S_ISDIR(inode->i_mode)) {
/* DACs are overridable for directories */
- if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE))
- return 0;
if (!(mask & MAY_WRITE))
if (capable_wrt_inode_uidgid(inode,
CAP_DAC_READ_SEARCH))
return 0;
- return -EACCES;
- }
- /*
- * Read/write DACs are always overridable.
- * Executable DACs are overridable when there is
- * at least one exec bit set.
- */
- if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO))
if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE))
return 0;
+ return -EACCES;
+ }
/*
* Searching includes executable on directories, else just read.
@@ -364,6 +356,14 @@ int generic_permission(struct inode *inode, int mask)
if (mask == MAY_READ)
if (capable_wrt_inode_uidgid(inode, CAP_DAC_READ_SEARCH))
return 0;
+ /*
+ * Read/write DACs are always overridable.
+ * Executable DACs are overridable when there is
+ * at least one exec bit set.
+ */
+ if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO))
+ if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE))
+ return 0;
return -EACCES;
}
@@ -4766,7 +4766,7 @@ int __page_symlink(struct inode *inode, const char *symname, int len, int nofs)
struct page *page;
void *fsdata;
int err;
- unsigned int flags = AOP_FLAG_UNINTERRUPTIBLE;
+ unsigned int flags = 0;
if (nofs)
flags |= AOP_FLAG_NOFS;
diff --git a/fs/namespace.c b/fs/namespace.c
index cc1375eff88c..b3b115bd4e1e 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -236,9 +236,6 @@ static struct mount *alloc_vfsmnt(const char *name)
INIT_LIST_HEAD(&mnt->mnt_slave_list);
INIT_LIST_HEAD(&mnt->mnt_slave);
INIT_HLIST_NODE(&mnt->mnt_mp_list);
-#ifdef CONFIG_FSNOTIFY
- INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
-#endif
init_fs_pin(&mnt->mnt_umount, drop_mountpoint);
}
return mnt;
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index d5606099712a..6d0f14c86099 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -554,12 +554,11 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
sb->s_magic = NCP_SUPER_MAGIC;
sb->s_op = &ncp_sops;
sb->s_d_op = &ncp_dentry_operations;
- sb->s_bdi = &server->bdi;
server = NCP_SBP(sb);
memset(server, 0, sizeof(*server));
- error = bdi_setup_and_register(&server->bdi, "ncpfs");
+ error = super_setup_bdi(sb);
if (error)
goto out_fput;
@@ -568,7 +567,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
if (data.info_fd != -1) {
struct socket *info_sock = sockfd_lookup(data.info_fd, &error);
if (!info_sock)
- goto out_bdi;
+ goto out_fput;
server->info_sock = info_sock;
error = -EBADFD;
if (info_sock->type != SOCK_STREAM)
@@ -746,8 +745,6 @@ out_nls:
out_fput2:
if (server->info_sock)
sockfd_put(server->info_sock);
-out_bdi:
- bdi_destroy(&server->bdi);
out_fput:
sockfd_put(sock);
out:
@@ -788,7 +785,6 @@ static void ncp_put_super(struct super_block *sb)
kill_pid(server->m.wdog_pid, SIGTERM, 1);
put_pid(server->m.wdog_pid);
- bdi_destroy(&server->bdi);
kfree(server->priv.data);
kfree(server->auth.object_name);
vfree(server->rxbuf);
diff --git a/fs/ncpfs/ncp_fs_sb.h b/fs/ncpfs/ncp_fs_sb.h
index 55e26fd80886..366fd63cc506 100644
--- a/fs/ncpfs/ncp_fs_sb.h
+++ b/fs/ncpfs/ncp_fs_sb.h
@@ -143,7 +143,6 @@ struct ncp_server {
size_t len;
__u8 data[128];
} unexpected_packet;
- struct backing_dev_info bdi;
};
extern void ncp_tcp_rcv_proc(struct work_struct *work);
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index e0302101e18a..ee5ddbd36088 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -738,9 +738,6 @@ static void nfs_server_set_fsinfo(struct nfs_server *server,
server->rsize = NFS_MAX_FILE_IO_SIZE;
server->rpages = (server->rsize + PAGE_SIZE - 1) >> PAGE_SHIFT;
- server->backing_dev_info.name = "nfs";
- server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD;
-
if (server->wsize > max_rpc_payload)
server->wsize = max_rpc_payload;
if (server->wsize > NFS_MAX_FILE_IO_SIZE)
@@ -887,12 +884,6 @@ struct nfs_server *nfs_alloc_server(void)
return NULL;
}
- if (bdi_init(&server->backing_dev_info)) {
- nfs_free_iostats(server->io_stats);
- kfree(server);
- return NULL;
- }
-
ida_init(&server->openowner_id);
ida_init(&server->lockowner_id);
pnfs_init_server(server);
@@ -921,7 +912,6 @@ void nfs_free_server(struct nfs_server *server)
ida_destroy(&server->lockowner_id);
ida_destroy(&server->openowner_id);
nfs_free_iostats(server->io_stats);
- bdi_destroy(&server->backing_dev_info);
kfree(server);
nfs_release_automount_timer();
}
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 18d0868ee274..6fb9fad2d1e6 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -527,7 +527,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
if (put_dreq(dreq))
nfs_direct_complete(dreq);
- return 0;
+ return requested_bytes;
}
/**
@@ -556,7 +556,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter)
struct inode *inode = mapping->host;
struct nfs_direct_req *dreq;
struct nfs_lock_context *l_ctx;
- ssize_t result = -EINVAL;
+ ssize_t result = -EINVAL, requested;
size_t count = iov_iter_count(iter);
nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count);
@@ -590,14 +590,19 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter)
nfs_start_io_direct(inode);
NFS_I(inode)->read_io += count;
- result = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos);
+ requested = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos);
nfs_end_io_direct(inode);
- if (!result) {
+ if (requested > 0) {
result = nfs_direct_wait(dreq);
- if (result > 0)
+ if (result > 0) {
+ requested -= result;
iocb->ki_pos += result;
+ }
+ iov_iter_revert(iter, requested);
+ } else {
+ result = requested;
}
out_release:
@@ -937,7 +942,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
if (put_dreq(dreq))
nfs_direct_write_complete(dreq);
- return 0;
+ return requested_bytes;
}
/**
@@ -962,7 +967,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
*/
ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
{
- ssize_t result = -EINVAL;
+ ssize_t result = -EINVAL, requested;
size_t count;
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
@@ -1005,7 +1010,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
nfs_start_io_direct(inode);
- result = nfs_direct_write_schedule_iovec(dreq, iter, pos);
+ requested = nfs_direct_write_schedule_iovec(dreq, iter, pos);
if (mapping->nrpages) {
invalidate_inode_pages2_range(mapping,
@@ -1014,13 +1019,17 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
nfs_end_io_direct(inode);
- if (!result) {
+ if (requested > 0) {
result = nfs_direct_wait(dreq);
if (result > 0) {
+ requested -= result;
iocb->ki_pos = pos + result;
/* XXX: should check the generic_write_sync retval */
generic_write_sync(iocb, result);
}
+ iov_iter_revert(iter, requested);
+ } else {
+ result = requested;
}
out_release:
nfs_direct_req_release(dreq);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 54e0f9f2dd94..2f3822a4a7d5 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2315,8 +2315,6 @@ inline void nfs_initialise_sb(struct super_block *sb)
sb->s_blocksize = nfs_block_bits(server->wsize,
&sb->s_blocksize_bits);
- sb->s_bdi = &server->backing_dev_info;
-
nfs_super_set_maxbytes(sb, server->maxfilesize);
}
@@ -2522,11 +2520,6 @@ static void nfs_get_cache_cookie(struct super_block *sb,
}
#endif
-static int nfs_bdi_register(struct nfs_server *server)
-{
- return bdi_register_dev(&server->backing_dev_info, server->s_dev);
-}
-
int nfs_set_sb_security(struct super_block *s, struct dentry *mntroot,
struct nfs_mount_info *mount_info)
{
@@ -2594,11 +2587,13 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server,
nfs_free_server(server);
server = NULL;
} else {
- error = nfs_bdi_register(server);
+ error = super_setup_bdi_name(s, "%u:%u", MAJOR(server->s_dev),
+ MINOR(server->s_dev));
if (error) {
mntroot = ERR_PTR(error);
goto error_splat_super;
}
+ s->s_bdi->ra_pages = server->rpages * NFS_MAX_READAHEAD;
server->super = s;
}
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 85bfa416fcc8..db7ba542559e 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -279,16 +279,15 @@ int nfs_congestion_kb;
static void nfs_set_page_writeback(struct page *page)
{
- struct nfs_server *nfss = NFS_SERVER(page_file_mapping(page)->host);
+ struct inode *inode = page_file_mapping(page)->host;
+ struct nfs_server *nfss = NFS_SERVER(inode);
int ret = test_set_page_writeback(page);
WARN_ON_ONCE(ret != 0);
if (atomic_long_inc_return(&nfss->writeback) >
- NFS_CONGESTION_ON_THRESH) {
- set_bdi_congested(&nfss->backing_dev_info,
- BLK_RW_ASYNC);
- }
+ NFS_CONGESTION_ON_THRESH)
+ set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
}
static void nfs_end_page_writeback(struct nfs_page *req)
@@ -301,7 +300,7 @@ static void nfs_end_page_writeback(struct nfs_page *req)
end_page_writeback(req->wb_page);
if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
- clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
+ clear_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
}
@@ -1800,7 +1799,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
}
nfss = NFS_SERVER(data->inode);
if (atomic_long_read(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
- clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
+ clear_bdi_congested(inode_to_bdi(data->inode), BLK_RW_ASYNC);
nfs_init_cinfo(&cinfo, data->inode, data->dreq);
nfs_commit_end(cinfo.mds);
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index 92b4b41d19d2..fb5213afc854 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -242,10 +242,11 @@ static int nfsd4_scsi_identify_device(struct block_device *bdev,
req->cmd[4] = bufflen & 0xff;
req->cmd_len = COMMAND_SIZE(INQUIRY);
- error = blk_execute_rq(rq->q, NULL, rq, 1);
- if (error) {
+ blk_execute_rq(rq->q, NULL, rq, 1);
+ if (req->result) {
pr_err("pNFS: INQUIRY 0x83 failed with: %x\n",
- rq->errors);
+ req->result);
+ error = -EIO;
goto out_put_request;
}
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index dba2ff8eaa68..452334694a5d 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -358,6 +358,8 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
{
unsigned int len, v, hdr, dlen;
u32 max_blocksize = svc_max_payload(rqstp);
+ struct kvec *head = rqstp->rq_arg.head;
+ struct kvec *tail = rqstp->rq_arg.tail;
p = decode_fh(p, &args->fh);
if (!p)
@@ -367,6 +369,8 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
args->count = ntohl(*p++);
args->stable = ntohl(*p++);
len = args->len = ntohl(*p++);
+ if ((void *)p > head->iov_base + head->iov_len)
+ return 0;
/*
* The count must equal the amount of data passed.
*/
@@ -377,9 +381,8 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
* Check to make sure that we got the right number of
* bytes.
*/
- hdr = (void*)p - rqstp->rq_arg.head[0].iov_base;
- dlen = rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len
- + rqstp->rq_arg.tail[0].iov_len - hdr;
+ hdr = (void*)p - head->iov_base;
+ dlen = head->iov_len + rqstp->rq_arg.page_len + tail->iov_len - hdr;
/*
* Round the length of the data which was specified up to
* the next multiple of XDR units and then compare that
@@ -396,7 +399,7 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
len = args->len = max_blocksize;
}
rqstp->rq_vec[0].iov_base = (void*)p;
- rqstp->rq_vec[0].iov_len = rqstp->rq_arg.head[0].iov_len - hdr;
+ rqstp->rq_vec[0].iov_len = head->iov_len - hdr;
v = 0;
while (len > rqstp->rq_vec[v].iov_len) {
len -= rqstp->rq_vec[v].iov_len;
@@ -471,6 +474,8 @@ nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p,
/* first copy and check from the first page */
old = (char*)p;
vec = &rqstp->rq_arg.head[0];
+ if ((void *)old > vec->iov_base + vec->iov_len)
+ return 0;
avail = vec->iov_len - (old - (char*)vec->iov_base);
while (len && avail && *old) {
*new++ = *old++;
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index cbeeda1e94a2..d86031b6ad79 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -2489,7 +2489,7 @@ bool nfsd4_spo_must_allow(struct svc_rqst *rqstp)
int nfsd4_max_reply(struct svc_rqst *rqstp, struct nfsd4_op *op)
{
- if (op->opnum == OP_ILLEGAL)
+ if (op->opnum == OP_ILLEGAL || op->status == nfserr_notsupp)
return op_encode_hdr_size * sizeof(__be32);
BUG_ON(OPDESC(op)->op_rsize_bop == NULL);
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 8bf8f667a8cf..6493df6b1bd5 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1146,7 +1146,7 @@ static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size)
static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
{
- static struct tree_descr nfsd_files[] = {
+ static const struct tree_descr nfsd_files[] = {
[NFSD_List] = {"exports", &exports_nfsd_operations, S_IRUGO},
[NFSD_Export_features] = {"export_features",
&export_features_operations, S_IRUGO},
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 31e1f9593457..59979f0bbd4b 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -747,6 +747,37 @@ static __be32 map_new_errors(u32 vers, __be32 nfserr)
return nfserr;
}
+/*
+ * A write procedure can have a large argument, and a read procedure can
+ * have a large reply, but no NFSv2 or NFSv3 procedure has argument and
+ * reply that can both be larger than a page. The xdr code has taken
+ * advantage of this assumption to be a sloppy about bounds checking in
+ * some cases. Pending a rewrite of the NFSv2/v3 xdr code to fix that
+ * problem, we enforce these assumptions here:
+ */
+static bool nfs_request_too_big(struct svc_rqst *rqstp,
+ struct svc_procedure *proc)
+{
+ /*
+ * The ACL code has more careful bounds-checking and is not
+ * susceptible to this problem:
+ */
+ if (rqstp->rq_prog != NFS_PROGRAM)
+ return false;
+ /*
+ * Ditto NFSv4 (which can in theory have argument and reply both
+ * more than a page):
+ */
+ if (rqstp->rq_vers >= 4)
+ return false;
+ /* The reply will be small, we're OK: */
+ if (proc->pc_xdrressize > 0 &&
+ proc->pc_xdrressize < XDR_QUADLEN(PAGE_SIZE))
+ return false;
+
+ return rqstp->rq_arg.len > PAGE_SIZE;
+}
+
int
nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
{
@@ -759,6 +790,11 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
rqstp->rq_vers, rqstp->rq_proc);
proc = rqstp->rq_procinfo;
+ if (nfs_request_too_big(rqstp, proc)) {
+ dprintk("nfsd: NFSv%d argument too large\n", rqstp->rq_vers);
+ *statp = rpc_garbage_args;
+ return 1;
+ }
/*
* Give the xdr decoder a chance to change this if it wants
* (necessary in the NFSv4.0 compound case)
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index 41b468a6a90f..de07ff625777 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -280,6 +280,7 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd_writeargs *args)
{
unsigned int len, hdr, dlen;
+ struct kvec *head = rqstp->rq_arg.head;
int v;
p = decode_fh(p, &args->fh);
@@ -300,9 +301,10 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
* Check to make sure that we got the right number of
* bytes.
*/
- hdr = (void*)p - rqstp->rq_arg.head[0].iov_base;
- dlen = rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len
- - hdr;
+ hdr = (void*)p - head->iov_base;
+ if (hdr > head->iov_len)
+ return 0;
+ dlen = head->iov_len + rqstp->rq_arg.page_len - hdr;
/*
* Round the length of the data which was specified up to
@@ -316,7 +318,7 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
return 0;
rqstp->rq_vec[0].iov_base = (void*)p;
- rqstp->rq_vec[0].iov_len = rqstp->rq_arg.head[0].iov_len - hdr;
+ rqstp->rq_vec[0].iov_len = head->iov_len - hdr;
v = 0;
while (len > rqstp->rq_vec[v].iov_len) {
len -= rqstp->rq_vec[v].iov_len;
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 19d50f600e8d..9aaf6ca77569 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1004,7 +1004,7 @@ out_nfserr:
else
err = nfserrno(host_err);
if (test_bit(RQ_LOCAL, &rqstp->rq_flags))
- tsk_restore_flags(current, pflags, PF_LESS_THROTTLE);
+ current_restore_flags(pflags, PF_LESS_THROTTLE);
return err;
}
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index e1872f36147f..926682981d61 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -1068,7 +1068,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_time_gran = 1;
sb->s_max_links = NILFS_LINK_MAX;
- sb->s_bdi = bdev_get_queue(sb->s_bdev)->backing_dev_info;
+ sb->s_bdi = bdi_get(sb->s_bdev->bd_bdi);
err = load_nilfs(nilfs, sb);
if (err)
diff --git a/fs/notify/Makefile b/fs/notify/Makefile
index 96d3420d0242..3e969ae91b60 100644
--- a/fs/notify/Makefile
+++ b/fs/notify/Makefile
@@ -1,5 +1,5 @@
-obj-$(CONFIG_FSNOTIFY) += fsnotify.o notification.o group.o inode_mark.o \
- mark.o vfsmount_mark.o fdinfo.o
+obj-$(CONFIG_FSNOTIFY) += fsnotify.o notification.o group.o mark.o \
+ fdinfo.o
obj-y += dnotify/
obj-y += inotify/
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 5a4ec309e283..2430a0415995 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -52,7 +52,7 @@ struct dnotify_mark {
*/
static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark)
{
- __u32 new_mask, old_mask;
+ __u32 new_mask = 0;
struct dnotify_struct *dn;
struct dnotify_mark *dn_mark = container_of(fsn_mark,
struct dnotify_mark,
@@ -60,17 +60,13 @@ static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark)
assert_spin_locked(&fsn_mark->lock);
- old_mask = fsn_mark->mask;
- new_mask = 0;
for (dn = dn_mark->dn; dn != NULL; dn = dn->dn_next)
new_mask |= (dn->dn_mask & ~FS_DN_MULTISHOT);
- fsnotify_set_mark_mask_locked(fsn_mark, new_mask);
-
- if (old_mask == new_mask)
+ if (fsn_mark->mask == new_mask)
return;
+ fsn_mark->mask = new_mask;
- if (fsn_mark->inode)
- fsnotify_recalc_inode_mask(fsn_mark->inode);
+ fsnotify_recalc_mask(fsn_mark->connector);
}
/*
@@ -86,7 +82,8 @@ static int dnotify_handle_event(struct fsnotify_group *group,
struct fsnotify_mark *inode_mark,
struct fsnotify_mark *vfsmount_mark,
u32 mask, const void *data, int data_type,
- const unsigned char *file_name, u32 cookie)
+ const unsigned char *file_name, u32 cookie,
+ struct fsnotify_iter_info *iter_info)
{
struct dnotify_mark *dn_mark;
struct dnotify_struct *dn;
@@ -138,6 +135,7 @@ static void dnotify_free_mark(struct fsnotify_mark *fsn_mark)
static struct fsnotify_ops dnotify_fsnotify_ops = {
.handle_event = dnotify_handle_event,
+ .free_mark = dnotify_free_mark,
};
/*
@@ -160,7 +158,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
if (!S_ISDIR(inode->i_mode))
return;
- fsn_mark = fsnotify_find_inode_mark(dnotify_group, inode);
+ fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, dnotify_group);
if (!fsn_mark)
return;
dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);
@@ -308,7 +306,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
/* set up the new_fsn_mark and new_dn_mark */
new_fsn_mark = &new_dn_mark->fsn_mark;
- fsnotify_init_mark(new_fsn_mark, dnotify_free_mark);
+ fsnotify_init_mark(new_fsn_mark, dnotify_group);
new_fsn_mark->mask = mask;
new_dn_mark->dn = NULL;
@@ -316,13 +314,12 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
mutex_lock(&dnotify_group->mark_mutex);
/* add the new_fsn_mark or find an old one. */
- fsn_mark = fsnotify_find_inode_mark(dnotify_group, inode);
+ fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, dnotify_group);
if (fsn_mark) {
dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);
spin_lock(&fsn_mark->lock);
} else {
- fsnotify_add_mark_locked(new_fsn_mark, dnotify_group, inode,
- NULL, 0);
+ fsnotify_add_mark_locked(new_fsn_mark, inode, NULL, 0);
spin_lock(&new_fsn_mark->lock);
fsn_mark = new_fsn_mark;
dn_mark = new_dn_mark;
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index e5f7e47de68e..2fa99aeaa095 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -57,14 +57,26 @@ static int fanotify_merge(struct list_head *list, struct fsnotify_event *event)
#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
static int fanotify_get_response(struct fsnotify_group *group,
- struct fanotify_perm_event_info *event)
+ struct fanotify_perm_event_info *event,
+ struct fsnotify_iter_info *iter_info)
{
int ret;
pr_debug("%s: group=%p event=%p\n", __func__, group, event);
+ /*
+ * fsnotify_prepare_user_wait() fails if we race with mark deletion.
+ * Just let the operation pass in that case.
+ */
+ if (!fsnotify_prepare_user_wait(iter_info)) {
+ event->response = FAN_ALLOW;
+ goto out;
+ }
+
wait_event(group->fanotify_data.access_waitq, event->response);
+ fsnotify_finish_user_wait(iter_info);
+out:
/* userspace responded, convert to something usable */
switch (event->response) {
case FAN_ALLOW:
@@ -174,7 +186,8 @@ static int fanotify_handle_event(struct fsnotify_group *group,
struct fsnotify_mark *inode_mark,
struct fsnotify_mark *fanotify_mark,
u32 mask, const void *data, int data_type,
- const unsigned char *file_name, u32 cookie)
+ const unsigned char *file_name, u32 cookie,
+ struct fsnotify_iter_info *iter_info)
{
int ret = 0;
struct fanotify_event_info *event;
@@ -215,7 +228,8 @@ static int fanotify_handle_event(struct fsnotify_group *group,
#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
if (mask & FAN_ALL_PERM_EVENTS) {
- ret = fanotify_get_response(group, FANOTIFY_PE(fsn_event));
+ ret = fanotify_get_response(group, FANOTIFY_PE(fsn_event),
+ iter_info);
fsnotify_destroy_event(group, fsn_event);
}
#endif
@@ -248,8 +262,14 @@ static void fanotify_free_event(struct fsnotify_event *fsn_event)
kmem_cache_free(fanotify_event_cachep, event);
}
+static void fanotify_free_mark(struct fsnotify_mark *fsn_mark)
+{
+ kmem_cache_free(fanotify_mark_cache, fsn_mark);
+}
+
const struct fsnotify_ops fanotify_fsnotify_ops = {
.handle_event = fanotify_handle_event,
.free_group_priv = fanotify_free_group_priv,
.free_event = fanotify_free_event,
+ .free_mark = fanotify_free_mark,
};
diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
index 4500a74f8d38..4eb6f5efa282 100644
--- a/fs/notify/fanotify/fanotify.h
+++ b/fs/notify/fanotify/fanotify.h
@@ -2,6 +2,7 @@
#include <linux/path.h>
#include <linux/slab.h>
+extern struct kmem_cache *fanotify_mark_cache;
extern struct kmem_cache *fanotify_event_cachep;
extern struct kmem_cache *fanotify_perm_event_cachep;
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 2b37f2785834..907a481ac781 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -41,7 +41,7 @@
extern const struct fsnotify_ops fanotify_fsnotify_ops;
-static struct kmem_cache *fanotify_mark_cache __read_mostly;
+struct kmem_cache *fanotify_mark_cache __read_mostly;
struct kmem_cache *fanotify_event_cachep __read_mostly;
struct kmem_cache *fanotify_perm_event_cachep __read_mostly;
@@ -295,27 +295,37 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
}
ret = copy_event_to_user(group, kevent, buf);
+ if (unlikely(ret == -EOPENSTALE)) {
+ /*
+ * We cannot report events with stale fd so drop it.
+ * Setting ret to 0 will continue the event loop and
+ * do the right thing if there are no more events to
+ * read (i.e. return bytes read, -EAGAIN or wait).
+ */
+ ret = 0;
+ }
+
/*
* Permission events get queued to wait for response. Other
* events can be destroyed now.
*/
if (!(kevent->mask & FAN_ALL_PERM_EVENTS)) {
fsnotify_destroy_event(group, kevent);
- if (ret < 0)
- break;
} else {
#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
- if (ret < 0) {
+ if (ret <= 0) {
FANOTIFY_PE(kevent)->response = FAN_DENY;
wake_up(&group->fanotify_data.access_waitq);
- break;
+ } else {
+ spin_lock(&group->notification_lock);
+ list_add_tail(&kevent->list,
+ &group->fanotify_data.access_list);
+ spin_unlock(&group->notification_lock);
}
- spin_lock(&group->notification_lock);
- list_add_tail(&kevent->list,
- &group->fanotify_data.access_list);
- spin_unlock(&group->notification_lock);
#endif
}
+ if (ret < 0)
+ break;
buf += ret;
count -= ret;
}
@@ -445,11 +455,6 @@ static const struct file_operations fanotify_fops = {
.llseek = noop_llseek,
};
-static void fanotify_free_mark(struct fsnotify_mark *fsn_mark)
-{
- kmem_cache_free(fanotify_mark_cache, fsn_mark);
-}
-
static int fanotify_find_path(int dfd, const char __user *filename,
struct path *path, unsigned int flags)
{
@@ -511,13 +516,12 @@ static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark,
tmask &= ~FAN_ONDIR;
oldmask = fsn_mark->mask;
- fsnotify_set_mark_mask_locked(fsn_mark, tmask);
+ fsn_mark->mask = tmask;
} else {
__u32 tmask = fsn_mark->ignored_mask & ~mask;
if (flags & FAN_MARK_ONDIR)
tmask &= ~FAN_ONDIR;
-
- fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask);
+ fsn_mark->ignored_mask = tmask;
}
*destroy = !(fsn_mark->mask | fsn_mark->ignored_mask);
spin_unlock(&fsn_mark->lock);
@@ -534,7 +538,8 @@ static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group,
int destroy_mark;
mutex_lock(&group->mark_mutex);
- fsn_mark = fsnotify_find_vfsmount_mark(group, mnt);
+ fsn_mark = fsnotify_find_mark(&real_mount(mnt)->mnt_fsnotify_marks,
+ group);
if (!fsn_mark) {
mutex_unlock(&group->mark_mutex);
return -ENOENT;
@@ -542,6 +547,8 @@ static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group,
removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags,
&destroy_mark);
+ if (removed & real_mount(mnt)->mnt_fsnotify_mask)
+ fsnotify_recalc_mask(real_mount(mnt)->mnt_fsnotify_marks);
if (destroy_mark)
fsnotify_detach_mark(fsn_mark);
mutex_unlock(&group->mark_mutex);
@@ -549,9 +556,6 @@ static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group,
fsnotify_free_mark(fsn_mark);
fsnotify_put_mark(fsn_mark);
- if (removed & real_mount(mnt)->mnt_fsnotify_mask)
- fsnotify_recalc_vfsmount_mask(mnt);
-
return 0;
}
@@ -564,7 +568,7 @@ static int fanotify_remove_inode_mark(struct fsnotify_group *group,
int destroy_mark;
mutex_lock(&group->mark_mutex);
- fsn_mark = fsnotify_find_inode_mark(group, inode);
+ fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group);
if (!fsn_mark) {
mutex_unlock(&group->mark_mutex);
return -ENOENT;
@@ -572,16 +576,16 @@ static int fanotify_remove_inode_mark(struct fsnotify_group *group,
removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags,
&destroy_mark);
+ if (removed & inode->i_fsnotify_mask)
+ fsnotify_recalc_mask(inode->i_fsnotify_marks);
if (destroy_mark)
fsnotify_detach_mark(fsn_mark);
mutex_unlock(&group->mark_mutex);
if (destroy_mark)
fsnotify_free_mark(fsn_mark);
- /* matches the fsnotify_find_inode_mark() */
+ /* matches the fsnotify_find_mark() */
fsnotify_put_mark(fsn_mark);
- if (removed & inode->i_fsnotify_mask)
- fsnotify_recalc_inode_mask(inode);
return 0;
}
@@ -600,13 +604,13 @@ static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
tmask |= FAN_ONDIR;
oldmask = fsn_mark->mask;
- fsnotify_set_mark_mask_locked(fsn_mark, tmask);
+ fsn_mark->mask = tmask;
} else {
__u32 tmask = fsn_mark->ignored_mask | mask;
if (flags & FAN_MARK_ONDIR)
tmask |= FAN_ONDIR;
- fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask);
+ fsn_mark->ignored_mask = tmask;
if (flags & FAN_MARK_IGNORED_SURV_MODIFY)
fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY;
}
@@ -629,8 +633,8 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
if (!mark)
return ERR_PTR(-ENOMEM);
- fsnotify_init_mark(mark, fanotify_free_mark);
- ret = fsnotify_add_mark_locked(mark, group, inode, mnt, 0);
+ fsnotify_init_mark(mark, group);
+ ret = fsnotify_add_mark_locked(mark, inode, mnt, 0);
if (ret) {
fsnotify_put_mark(mark);
return ERR_PTR(ret);
@@ -648,7 +652,8 @@ static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
__u32 added;
mutex_lock(&group->mark_mutex);
- fsn_mark = fsnotify_find_vfsmount_mark(group, mnt);
+ fsn_mark = fsnotify_find_mark(&real_mount(mnt)->mnt_fsnotify_marks,
+ group);
if (!fsn_mark) {
fsn_mark = fanotify_add_new_mark(group, NULL, mnt);
if (IS_ERR(fsn_mark)) {
@@ -657,10 +662,9 @@ static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
}
}
added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
- mutex_unlock(&group->mark_mutex);
-
if (added & ~real_mount(mnt)->mnt_fsnotify_mask)
- fsnotify_recalc_vfsmount_mask(mnt);
+ fsnotify_recalc_mask(real_mount(mnt)->mnt_fsnotify_marks);
+ mutex_unlock(&group->mark_mutex);
fsnotify_put_mark(fsn_mark);
return 0;
@@ -686,7 +690,7 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
return 0;
mutex_lock(&group->mark_mutex);
- fsn_mark = fsnotify_find_inode_mark(group, inode);
+ fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group);
if (!fsn_mark) {
fsn_mark = fanotify_add_new_mark(group, inode, NULL);
if (IS_ERR(fsn_mark)) {
@@ -695,10 +699,9 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
}
}
added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
- mutex_unlock(&group->mark_mutex);
-
if (added & ~inode->i_fsnotify_mask)
- fsnotify_recalc_inode_mask(inode);
+ fsnotify_recalc_mask(inode->i_fsnotify_marks);
+ mutex_unlock(&group->mark_mutex);
fsnotify_put_mark(fsn_mark);
return 0;
diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c
index fd98e5100cab..dd63aa9a6f9a 100644
--- a/fs/notify/fdinfo.c
+++ b/fs/notify/fdinfo.c
@@ -76,12 +76,11 @@ static void inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
struct inotify_inode_mark *inode_mark;
struct inode *inode;
- if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE) ||
- !(mark->flags & FSNOTIFY_MARK_FLAG_INODE))
+ if (!(mark->connector->flags & FSNOTIFY_OBJ_TYPE_INODE))
return;
inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark);
- inode = igrab(mark->inode);
+ inode = igrab(mark->connector->inode);
if (inode) {
/*
* IN_ALL_EVENTS represents all of the mask bits
@@ -113,14 +112,11 @@ static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
unsigned int mflags = 0;
struct inode *inode;
- if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE))
- return;
-
if (mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)
mflags |= FAN_MARK_IGNORED_SURV_MODIFY;
- if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) {
- inode = igrab(mark->inode);
+ if (mark->connector->flags & FSNOTIFY_OBJ_TYPE_INODE) {
+ inode = igrab(mark->connector->inode);
if (!inode)
return;
seq_printf(m, "fanotify ino:%lx sdev:%x mflags:%x mask:%x ignored_mask:%x ",
@@ -129,8 +125,8 @@ static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
show_mark_fhandle(m, inode);
seq_putc(m, '\n');
iput(inode);
- } else if (mark->flags & FSNOTIFY_MARK_FLAG_VFSMOUNT) {
- struct mount *mnt = real_mount(mark->mnt);
+ } else if (mark->connector->flags & FSNOTIFY_OBJ_TYPE_VFSMOUNT) {
+ struct mount *mnt = real_mount(mark->connector->mnt);
seq_printf(m, "fanotify mnt_id:%x mflags:%x mask:%x ignored_mask:%x\n",
mnt->mnt_id, mflags, mark->mask, mark->ignored_mask);
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index b41515d3f081..01a9f0f007d4 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -41,6 +41,63 @@ void __fsnotify_vfsmount_delete(struct vfsmount *mnt)
fsnotify_clear_marks_by_mount(mnt);
}
+/**
+ * fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes.
+ * @sb: superblock being unmounted.
+ *
+ * Called during unmount with no locks held, so needs to be safe against
+ * concurrent modifiers. We temporarily drop sb->s_inode_list_lock and CAN block.
+ */
+void fsnotify_unmount_inodes(struct super_block *sb)
+{
+ struct inode *inode, *iput_inode = NULL;
+
+ spin_lock(&sb->s_inode_list_lock);
+ list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+ /*
+ * We cannot __iget() an inode in state I_FREEING,
+ * I_WILL_FREE, or I_NEW which is fine because by that point
+ * the inode cannot have any associated watches.
+ */
+ spin_lock(&inode->i_lock);
+ if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
+ spin_unlock(&inode->i_lock);
+ continue;
+ }
+
+ /*
+ * If i_count is zero, the inode cannot have any watches and
+ * doing an __iget/iput with MS_ACTIVE clear would actually
+ * evict all inodes with zero i_count from icache which is
+ * unnecessarily violent and may in fact be illegal to do.
+ */
+ if (!atomic_read(&inode->i_count)) {
+ spin_unlock(&inode->i_lock);
+ continue;
+ }
+
+ __iget(inode);
+ spin_unlock(&inode->i_lock);
+ spin_unlock(&sb->s_inode_list_lock);
+
+ if (iput_inode)
+ iput(iput_inode);
+
+ /* for each watch, send FS_UNMOUNT and then remove it */
+ fsnotify(inode, FS_UNMOUNT, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
+
+ fsnotify_inode_delete(inode);
+
+ iput_inode = inode;
+
+ spin_lock(&sb->s_inode_list_lock);
+ }
+ spin_unlock(&sb->s_inode_list_lock);
+
+ if (iput_inode)
+ iput(iput_inode);
+}
+
/*
* Given an inode, first check if we care what happens to our children. Inotify
* and dnotify both tell their parents about events. If we care about any event
@@ -127,7 +184,8 @@ static int send_to_group(struct inode *to_tell,
struct fsnotify_mark *vfsmount_mark,
__u32 mask, const void *data,
int data_is, u32 cookie,
- const unsigned char *file_name)
+ const unsigned char *file_name,
+ struct fsnotify_iter_info *iter_info)
{
struct fsnotify_group *group = NULL;
__u32 inode_test_mask = 0;
@@ -178,7 +236,7 @@ static int send_to_group(struct inode *to_tell,
return group->ops->handle_event(group, to_tell, inode_mark,
vfsmount_mark, mask, data, data_is,
- file_name, cookie);
+ file_name, cookie, iter_info);
}
/*
@@ -193,8 +251,10 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is,
struct hlist_node *inode_node = NULL, *vfsmount_node = NULL;
struct fsnotify_mark *inode_mark = NULL, *vfsmount_mark = NULL;
struct fsnotify_group *inode_group, *vfsmount_group;
+ struct fsnotify_mark_connector *inode_conn, *vfsmount_conn;
+ struct fsnotify_iter_info iter_info;
struct mount *mnt;
- int idx, ret = 0;
+ int ret = 0;
/* global tests shouldn't care about events on child only the specific event */
__u32 test_mask = (mask & ~FS_EVENT_ON_CHILD);
@@ -210,8 +270,8 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is,
* SRCU because we have no references to any objects and do not
* need SRCU to keep them "alive".
*/
- if (hlist_empty(&to_tell->i_fsnotify_marks) &&
- (!mnt || hlist_empty(&mnt->mnt_fsnotify_marks)))
+ if (!to_tell->i_fsnotify_marks &&
+ (!mnt || !mnt->mnt_fsnotify_marks))
return 0;
/*
* if this is a modify event we may need to clear the ignored masks
@@ -223,19 +283,30 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is,
!(mnt && test_mask & mnt->mnt_fsnotify_mask))
return 0;
- idx = srcu_read_lock(&fsnotify_mark_srcu);
+ iter_info.srcu_idx = srcu_read_lock(&fsnotify_mark_srcu);
if ((mask & FS_MODIFY) ||
- (test_mask & to_tell->i_fsnotify_mask))
- inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first,
+ (test_mask & to_tell->i_fsnotify_mask)) {
+ inode_conn = srcu_dereference(to_tell->i_fsnotify_marks,
&fsnotify_mark_srcu);
+ if (inode_conn)
+ inode_node = srcu_dereference(inode_conn->list.first,
+ &fsnotify_mark_srcu);
+ }
if (mnt && ((mask & FS_MODIFY) ||
(test_mask & mnt->mnt_fsnotify_mask))) {
- vfsmount_node = srcu_dereference(mnt->mnt_fsnotify_marks.first,
- &fsnotify_mark_srcu);
- inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first,
+ inode_conn = srcu_dereference(to_tell->i_fsnotify_marks,
&fsnotify_mark_srcu);
+ if (inode_conn)
+ inode_node = srcu_dereference(inode_conn->list.first,
+ &fsnotify_mark_srcu);
+ vfsmount_conn = srcu_dereference(mnt->mnt_fsnotify_marks,
+ &fsnotify_mark_srcu);
+ if (vfsmount_conn)
+ vfsmount_node = srcu_dereference(
+ vfsmount_conn->list.first,
+ &fsnotify_mark_srcu);
}
/*
@@ -272,8 +343,13 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is,
vfsmount_mark = NULL;
}
}
+
+ iter_info.inode_mark = inode_mark;
+ iter_info.vfsmount_mark = vfsmount_mark;
+
ret = send_to_group(to_tell, inode_mark, vfsmount_mark, mask,
- data, data_is, cookie, file_name);
+ data, data_is, cookie, file_name,
+ &iter_info);
if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS))
goto out;
@@ -287,12 +363,14 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is,
}
ret = 0;
out:
- srcu_read_unlock(&fsnotify_mark_srcu, idx);
+ srcu_read_unlock(&fsnotify_mark_srcu, iter_info.srcu_idx);
return ret;
}
EXPORT_SYMBOL_GPL(fsnotify);
+extern struct kmem_cache *fsnotify_mark_connector_cachep;
+
static __init int fsnotify_init(void)
{
int ret;
@@ -303,6 +381,9 @@ static __init int fsnotify_init(void)
if (ret)
panic("initializing fsnotify_mark_srcu");
+ fsnotify_mark_connector_cachep = KMEM_CACHE(fsnotify_mark_connector,
+ SLAB_PANIC);
+
return 0;
}
core_initcall(fsnotify_init);
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index 0a3bc2cf192c..bf012e8ecd14 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -8,60 +8,36 @@
#include "../mount.h"
+struct fsnotify_iter_info {
+ struct fsnotify_mark *inode_mark;
+ struct fsnotify_mark *vfsmount_mark;
+ int srcu_idx;
+};
+
/* destroy all events sitting in this groups notification queue */
extern void fsnotify_flush_notify(struct fsnotify_group *group);
/* protects reads of inode and vfsmount marks list */
extern struct srcu_struct fsnotify_mark_srcu;
-/* Calculate mask of events for a list of marks */
-extern u32 fsnotify_recalc_mask(struct hlist_head *head);
-
/* compare two groups for sorting of marks lists */
extern int fsnotify_compare_groups(struct fsnotify_group *a,
struct fsnotify_group *b);
-extern void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *fsn_mark,
- __u32 mask);
-/* Add mark to a proper place in mark list */
-extern int fsnotify_add_mark_list(struct hlist_head *head,
- struct fsnotify_mark *mark,
- int allow_dups);
-/* add a mark to an inode */
-extern int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
- struct fsnotify_group *group, struct inode *inode,
- int allow_dups);
-/* add a mark to a vfsmount */
-extern int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
- struct fsnotify_group *group, struct vfsmount *mnt,
- int allow_dups);
-
-/* vfsmount specific destruction of a mark */
-extern void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark);
-/* inode specific destruction of a mark */
-extern void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark);
-/* Find mark belonging to given group in the list of marks */
-extern struct fsnotify_mark *fsnotify_find_mark(struct hlist_head *head,
- struct fsnotify_group *group);
-/* Destroy all marks in the given list protected by 'lock' */
-extern void fsnotify_destroy_marks(struct hlist_head *head, spinlock_t *lock);
+/* Destroy all marks connected via given connector */
+extern void fsnotify_destroy_marks(struct fsnotify_mark_connector __rcu **connp);
/* run the list of all marks associated with inode and destroy them */
static inline void fsnotify_clear_marks_by_inode(struct inode *inode)
{
- fsnotify_destroy_marks(&inode->i_fsnotify_marks, &inode->i_lock);
+ fsnotify_destroy_marks(&inode->i_fsnotify_marks);
}
/* run the list of all marks associated with vfsmount and destroy them */
static inline void fsnotify_clear_marks_by_mount(struct vfsmount *mnt)
{
- fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks,
- &mnt->mnt_root->d_lock);
+ fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks);
}
-/* prepare for freeing all marks associated with given group */
-extern void fsnotify_detach_group_marks(struct fsnotify_group *group);
-/*
- * wait for fsnotify_mark_srcu period to end and free all marks in destroy_list
- */
-extern void fsnotify_mark_destroy_list(void);
+/* Wait until all marks queued for destruction are destroyed */
+extern void fsnotify_wait_marks_destroyed(void);
/*
* update the dentry->d_flags of all of inode's children to indicate if inode cares
diff --git a/fs/notify/group.c b/fs/notify/group.c
index fbe3cbebec16..32357534de18 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -66,14 +66,23 @@ void fsnotify_destroy_group(struct fsnotify_group *group)
*/
fsnotify_group_stop_queueing(group);
- /* clear all inode marks for this group, attach them to destroy_list */
- fsnotify_detach_group_marks(group);
+ /* Clear all marks for this group and queue them for destruction */
+ fsnotify_clear_marks_by_group(group, FSNOTIFY_OBJ_ALL_TYPES);
/*
- * Wait for fsnotify_mark_srcu period to end and free all marks in
- * destroy_list
+ * Some marks can still be pinned when waiting for response from
+ * userspace. Wait for those now. fsnotify_prepare_user_wait() will
+ * not succeed now so this wait is race-free.
*/
- fsnotify_mark_destroy_list();
+ wait_event(group->notification_waitq, !atomic_read(&group->user_waits));
+
+ /*
+ * Wait until all marks get really destroyed. We could actually destroy
+ * them ourselves instead of waiting for worker to do it, however that
+ * would be racy as worker can already be processing some marks before
+ * we even entered fsnotify_destroy_group().
+ */
+ fsnotify_wait_marks_destroyed();
/*
* Since we have waited for fsnotify_mark_srcu in
@@ -124,6 +133,7 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops)
/* set to 0 when there a no external references to this group */
atomic_set(&group->refcnt, 1);
atomic_set(&group->num_marks, 0);
+ atomic_set(&group->user_waits, 0);
spin_lock_init(&group->notification_lock);
INIT_LIST_HEAD(&group->notification_list);
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
deleted file mode 100644
index a3645249f7ec..000000000000
--- a/fs/notify/inode_mark.c
+++ /dev/null
@@ -1,199 +0,0 @@
-/*
- * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2, or (at your option)
- * any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; see the file COPYING. If not, write to
- * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#include <linux/fs.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/mutex.h>
-#include <linux/spinlock.h>
-
-#include <linux/atomic.h>
-
-#include <linux/fsnotify_backend.h>
-#include "fsnotify.h"
-
-#include "../internal.h"
-
-/*
- * Recalculate the inode->i_fsnotify_mask, or the mask of all FS_* event types
- * any notifier is interested in hearing for this inode.
- */
-void fsnotify_recalc_inode_mask(struct inode *inode)
-{
- spin_lock(&inode->i_lock);
- inode->i_fsnotify_mask = fsnotify_recalc_mask(&inode->i_fsnotify_marks);
- spin_unlock(&inode->i_lock);
-
- __fsnotify_update_child_dentry_flags(inode);
-}
-
-void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark)
-{
- struct inode *inode = mark->inode;
-
- BUG_ON(!mutex_is_locked(&mark->group->mark_mutex));
- assert_spin_locked(&mark->lock);
-
- spin_lock(&inode->i_lock);
-
- hlist_del_init_rcu(&mark->obj_list);
- mark->inode = NULL;
-
- /*
- * this mark is now off the inode->i_fsnotify_marks list and we
- * hold the inode->i_lock, so this is the perfect time to update the
- * inode->i_fsnotify_mask
- */
- inode->i_fsnotify_mask = fsnotify_recalc_mask(&inode->i_fsnotify_marks);
- spin_unlock(&inode->i_lock);
-}
-
-/*
- * Given a group clear all of the inode marks associated with that group.
- */
-void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group)
-{
- fsnotify_clear_marks_by_group_flags(group, FSNOTIFY_MARK_FLAG_INODE);
-}
-
-/*
- * given a group and inode, find the mark associated with that combination.
- * if found take a reference to that mark and return it, else return NULL
- */
-struct fsnotify_mark *fsnotify_find_inode_mark(struct fsnotify_group *group,
- struct inode *inode)
-{
- struct fsnotify_mark *mark;
-
- spin_lock(&inode->i_lock);
- mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group);
- spin_unlock(&inode->i_lock);
-
- return mark;
-}
-
-/*
- * If we are setting a mark mask on an inode mark we should pin the inode
- * in memory.
- */
-void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *mark,
- __u32 mask)
-{
- struct inode *inode;
-
- assert_spin_locked(&mark->lock);
-
- if (mask &&
- mark->inode &&
- !(mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED)) {
- mark->flags |= FSNOTIFY_MARK_FLAG_OBJECT_PINNED;
- inode = igrab(mark->inode);
- /*
- * we shouldn't be able to get here if the inode wasn't
- * already safely held in memory. But bug in case it
- * ever is wrong.
- */
- BUG_ON(!inode);
- }
-}
-
-/*
- * Attach an initialized mark to a given inode.
- * These marks may be used for the fsnotify backend to determine which
- * event types should be delivered to which group and for which inodes. These
- * marks are ordered according to priority, highest number first, and then by
- * the group's location in memory.
- */
-int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
- struct fsnotify_group *group, struct inode *inode,
- int allow_dups)
-{
- int ret;
-
- mark->flags |= FSNOTIFY_MARK_FLAG_INODE;
-
- BUG_ON(!mutex_is_locked(&group->mark_mutex));
- assert_spin_locked(&mark->lock);
-
- spin_lock(&inode->i_lock);
- mark->inode = inode;
- ret = fsnotify_add_mark_list(&inode->i_fsnotify_marks, mark,
- allow_dups);
- inode->i_fsnotify_mask = fsnotify_recalc_mask(&inode->i_fsnotify_marks);
- spin_unlock(&inode->i_lock);
-
- return ret;
-}
-
-/**
- * fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes.
- * @sb: superblock being unmounted.
- *
- * Called during unmount with no locks held, so needs to be safe against
- * concurrent modifiers. We temporarily drop sb->s_inode_list_lock and CAN block.
- */
-void fsnotify_unmount_inodes(struct super_block *sb)
-{
- struct inode *inode, *iput_inode = NULL;
-
- spin_lock(&sb->s_inode_list_lock);
- list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
- /*
- * We cannot __iget() an inode in state I_FREEING,
- * I_WILL_FREE, or I_NEW which is fine because by that point
- * the inode cannot have any associated watches.
- */
- spin_lock(&inode->i_lock);
- if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
- spin_unlock(&inode->i_lock);
- continue;
- }
-
- /*
- * If i_count is zero, the inode cannot have any watches and
- * doing an __iget/iput with MS_ACTIVE clear would actually
- * evict all inodes with zero i_count from icache which is
- * unnecessarily violent and may in fact be illegal to do.
- */
- if (!atomic_read(&inode->i_count)) {
- spin_unlock(&inode->i_lock);
- continue;
- }
-
- __iget(inode);
- spin_unlock(&inode->i_lock);
- spin_unlock(&sb->s_inode_list_lock);
-
- if (iput_inode)
- iput(iput_inode);
-
- /* for each watch, send FS_UNMOUNT and then remove it */
- fsnotify(inode, FS_UNMOUNT, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
-
- fsnotify_inode_delete(inode);
-
- iput_inode = inode;
-
- spin_lock(&sb->s_inode_list_lock);
- }
- spin_unlock(&sb->s_inode_list_lock);
-
- if (iput_inode)
- iput(iput_inode);
-}
diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h
index 7c461fd49c4c..9ff67b61da8a 100644
--- a/fs/notify/inotify/inotify.h
+++ b/fs/notify/inotify/inotify.h
@@ -27,9 +27,11 @@ extern int inotify_handle_event(struct fsnotify_group *group,
struct fsnotify_mark *inode_mark,
struct fsnotify_mark *vfsmount_mark,
u32 mask, const void *data, int data_type,
- const unsigned char *file_name, u32 cookie);
+ const unsigned char *file_name, u32 cookie,
+ struct fsnotify_iter_info *iter_info);
extern const struct fsnotify_ops inotify_fsnotify_ops;
+extern struct kmem_cache *inotify_inode_mark_cachep;
#ifdef CONFIG_INOTIFY_USER
static inline void dec_inotify_instances(struct ucounts *ucounts)
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 1aeb837ae414..8b73332735ba 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -68,7 +68,8 @@ int inotify_handle_event(struct fsnotify_group *group,
struct fsnotify_mark *inode_mark,
struct fsnotify_mark *vfsmount_mark,
u32 mask, const void *data, int data_type,
- const unsigned char *file_name, u32 cookie)
+ const unsigned char *file_name, u32 cookie,
+ struct fsnotify_iter_info *iter_info)
{
struct inotify_inode_mark *i_mark;
struct inotify_event_info *event;
@@ -156,8 +157,8 @@ static int idr_callback(int id, void *p, void *data)
* BUG() that was here.
*/
if (fsn_mark)
- printk(KERN_WARNING "fsn_mark->group=%p inode=%p wd=%d\n",
- fsn_mark->group, fsn_mark->inode, i_mark->wd);
+ printk(KERN_WARNING "fsn_mark->group=%p wd=%d\n",
+ fsn_mark->group, i_mark->wd);
return 0;
}
@@ -175,9 +176,20 @@ static void inotify_free_event(struct fsnotify_event *fsn_event)
kfree(INOTIFY_E(fsn_event));
}
+/* ding dong the mark is dead */
+static void inotify_free_mark(struct fsnotify_mark *fsn_mark)
+{
+ struct inotify_inode_mark *i_mark;
+
+ i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
+
+ kmem_cache_free(inotify_inode_mark_cachep, i_mark);
+}
+
const struct fsnotify_ops inotify_fsnotify_ops = {
.handle_event = inotify_handle_event,
.free_group_priv = inotify_free_group_priv,
.free_event = inotify_free_event,
.freeing_mark = inotify_freeing_mark,
+ .free_mark = inotify_free_mark,
};
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 498d609b26c7..7cc7d3fb1862 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -47,7 +47,7 @@
/* configurable via /proc/sys/fs/inotify/ */
static int inotify_max_queued_events __read_mostly;
-static struct kmem_cache *inotify_inode_mark_cachep __read_mostly;
+struct kmem_cache *inotify_inode_mark_cachep __read_mostly;
#ifdef CONFIG_SYSCTL
@@ -395,21 +395,6 @@ static struct inotify_inode_mark *inotify_idr_find(struct fsnotify_group *group,
return i_mark;
}
-static void do_inotify_remove_from_idr(struct fsnotify_group *group,
- struct inotify_inode_mark *i_mark)
-{
- struct idr *idr = &group->inotify_data.idr;
- spinlock_t *idr_lock = &group->inotify_data.idr_lock;
- int wd = i_mark->wd;
-
- assert_spin_locked(idr_lock);
-
- idr_remove(idr, wd);
-
- /* removed from the idr, drop that ref */
- fsnotify_put_mark(&i_mark->fsn_mark);
-}
-
/*
* Remove the mark from the idr (if present) and drop the reference
* on the mark because it was in the idr.
@@ -417,6 +402,7 @@ static void do_inotify_remove_from_idr(struct fsnotify_group *group,
static void inotify_remove_from_idr(struct fsnotify_group *group,
struct inotify_inode_mark *i_mark)
{
+ struct idr *idr = &group->inotify_data.idr;
spinlock_t *idr_lock = &group->inotify_data.idr_lock;
struct inotify_inode_mark *found_i_mark = NULL;
int wd;
@@ -429,18 +415,16 @@ static void inotify_remove_from_idr(struct fsnotify_group *group,
* if it wasn't....
*/
if (wd == -1) {
- WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p"
- " i_mark->inode=%p\n", __func__, i_mark, i_mark->wd,
- i_mark->fsn_mark.group, i_mark->fsn_mark.inode);
+ WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p\n",
+ __func__, i_mark, i_mark->wd, i_mark->fsn_mark.group);
goto out;
}
/* Lets look in the idr to see if we find it */
found_i_mark = inotify_idr_find_locked(group, wd);
if (unlikely(!found_i_mark)) {
- WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p"
- " i_mark->inode=%p\n", __func__, i_mark, i_mark->wd,
- i_mark->fsn_mark.group, i_mark->fsn_mark.inode);
+ WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p\n",
+ __func__, i_mark, i_mark->wd, i_mark->fsn_mark.group);
goto out;
}
@@ -451,35 +435,33 @@ static void inotify_remove_from_idr(struct fsnotify_group *group,
*/
if (unlikely(found_i_mark != i_mark)) {
WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p "
- "mark->inode=%p found_i_mark=%p found_i_mark->wd=%d "
- "found_i_mark->group=%p found_i_mark->inode=%p\n",
- __func__, i_mark, i_mark->wd, i_mark->fsn_mark.group,
- i_mark->fsn_mark.inode, found_i_mark, found_i_mark->wd,
- found_i_mark->fsn_mark.group,
- found_i_mark->fsn_mark.inode);
+ "found_i_mark=%p found_i_mark->wd=%d "
+ "found_i_mark->group=%p\n", __func__, i_mark,
+ i_mark->wd, i_mark->fsn_mark.group, found_i_mark,
+ found_i_mark->wd, found_i_mark->fsn_mark.group);
goto out;
}
/*
* One ref for being in the idr
- * one ref held by the caller trying to kill us
* one ref grabbed by inotify_idr_find
*/
- if (unlikely(atomic_read(&i_mark->fsn_mark.refcnt) < 3)) {
- printk(KERN_ERR "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p"
- " i_mark->inode=%p\n", __func__, i_mark, i_mark->wd,
- i_mark->fsn_mark.group, i_mark->fsn_mark.inode);
+ if (unlikely(atomic_read(&i_mark->fsn_mark.refcnt) < 2)) {
+ printk(KERN_ERR "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p\n",
+ __func__, i_mark, i_mark->wd, i_mark->fsn_mark.group);
/* we can't really recover with bad ref cnting.. */
BUG();
}
- do_inotify_remove_from_idr(group, i_mark);
+ idr_remove(idr, wd);
+ /* Removed from the idr, drop that ref. */
+ fsnotify_put_mark(&i_mark->fsn_mark);
out:
+ i_mark->wd = -1;
+ spin_unlock(idr_lock);
/* match the ref taken by inotify_idr_find_locked() */
if (found_i_mark)
fsnotify_put_mark(&found_i_mark->fsn_mark);
- i_mark->wd = -1;
- spin_unlock(idr_lock);
}
/*
@@ -492,7 +474,7 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,
/* Queue ignore event for the watch */
inotify_handle_event(group, NULL, fsn_mark, NULL, FS_IN_IGNORED,
- NULL, FSNOTIFY_EVENT_NONE, NULL, 0);
+ NULL, FSNOTIFY_EVENT_NONE, NULL, 0, NULL);
i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
/* remove this mark from the idr */
@@ -501,16 +483,6 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,
dec_inotify_watches(group->inotify_data.ucounts);
}
-/* ding dong the mark is dead */
-static void inotify_free_mark(struct fsnotify_mark *fsn_mark)
-{
- struct inotify_inode_mark *i_mark;
-
- i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
-
- kmem_cache_free(inotify_inode_mark_cachep, i_mark);
-}
-
static int inotify_update_existing_watch(struct fsnotify_group *group,
struct inode *inode,
u32 arg)
@@ -524,21 +496,19 @@ static int inotify_update_existing_watch(struct fsnotify_group *group,
mask = inotify_arg_to_mask(arg);
- fsn_mark = fsnotify_find_inode_mark(group, inode);
+ fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group);
if (!fsn_mark)
return -ENOENT;
i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
spin_lock(&fsn_mark->lock);
-
old_mask = fsn_mark->mask;
if (add)
- fsnotify_set_mark_mask_locked(fsn_mark, (fsn_mark->mask | mask));
+ fsn_mark->mask |= mask;
else
- fsnotify_set_mark_mask_locked(fsn_mark, mask);
+ fsn_mark->mask = mask;
new_mask = fsn_mark->mask;
-
spin_unlock(&fsn_mark->lock);
if (old_mask != new_mask) {
@@ -549,7 +519,7 @@ static int inotify_update_existing_watch(struct fsnotify_group *group,
/* update the inode with this new fsn_mark */
if (dropped || do_inode)
- fsnotify_recalc_inode_mask(inode);
+ fsnotify_recalc_mask(inode->i_fsnotify_marks);
}
@@ -578,7 +548,7 @@ static int inotify_new_watch(struct fsnotify_group *group,
if (unlikely(!tmp_i_mark))
return -ENOMEM;
- fsnotify_init_mark(&tmp_i_mark->fsn_mark, inotify_free_mark);
+ fsnotify_init_mark(&tmp_i_mark->fsn_mark, group);
tmp_i_mark->fsn_mark.mask = mask;
tmp_i_mark->wd = -1;
@@ -594,8 +564,7 @@ static int inotify_new_watch(struct fsnotify_group *group,
}
/* we are on the idr, now get on the inode */
- ret = fsnotify_add_mark_locked(&tmp_i_mark->fsn_mark, group, inode,
- NULL, 0);
+ ret = fsnotify_add_mark_locked(&tmp_i_mark->fsn_mark, inode, NULL, 0);
if (ret) {
/* we failed to get on the inode, get off the idr */
inotify_remove_from_idr(group, tmp_i_mark);
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 6043306e8e21..9991f8826734 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -33,7 +33,7 @@
*
* group->mark_mutex
* mark->lock
- * inode->i_lock
+ * mark->connector->lock
*
* group->mark_mutex protects the marks_list anchored inside a given group and
* each mark is hooked via the g_list. It also protects the groups private
@@ -44,14 +44,22 @@
* is assigned to as well as the access to a reference of the inode/vfsmount
* that is being watched by the mark.
*
- * inode->i_lock protects the i_fsnotify_marks list anchored inside a
- * given inode and each mark is hooked via the i_list. (and sorta the
- * free_i_list)
+ * mark->connector->lock protects the list of marks anchored inside an
+ * inode / vfsmount and each mark is hooked via the i_list.
*
+ * A list of notification marks relating to inode / mnt is contained in
+ * fsnotify_mark_connector. That structure is alive as long as there are any
+ * marks in the list and is also protected by fsnotify_mark_srcu. A mark gets
+ * detached from fsnotify_mark_connector when last reference to the mark is
+ * dropped. Thus having mark reference is enough to protect mark->connector
+ * pointer and to make sure fsnotify_mark_connector cannot disappear. Also
+ * because we remove mark from g_list before dropping mark reference associated
+ * with that, any mark found through g_list is guaranteed to have
+ * mark->connector set until we drop group->mark_mutex.
*
* LIFETIME:
* Inode marks survive between when they are added to an inode and when their
- * refcnt==0.
+ * refcnt==0. Marks are also protected by fsnotify_mark_srcu.
*
* The inode mark can be cleared for a number of different reasons including:
* - The inode is unlinked for the last time. (fsnotify_inode_remove)
@@ -61,17 +69,6 @@
* - The fsnotify_group associated with the mark is going away and all such marks
* need to be cleaned up. (fsnotify_clear_marks_by_group)
*
- * Worst case we are given an inode and need to clean up all the marks on that
- * inode. We take i_lock and walk the i_fsnotify_marks safely. For each
- * mark on the list we take a reference (so the mark can't disappear under us).
- * We remove that mark form the inode's list of marks and we add this mark to a
- * private list anchored on the stack using i_free_list; we walk i_free_list
- * and before we destroy the mark we make sure that we dont race with a
- * concurrent destroy_group by getting a ref to the marks group and taking the
- * groups mutex.
-
- * Very similarly for freeing by group, except we use free_g_list.
- *
* This has the very interesting property of being able to run concurrently with
* any (or all) other directions.
*/
@@ -94,94 +91,281 @@
#define FSNOTIFY_REAPER_DELAY (1) /* 1 jiffy */
struct srcu_struct fsnotify_mark_srcu;
+struct kmem_cache *fsnotify_mark_connector_cachep;
+
static DEFINE_SPINLOCK(destroy_lock);
static LIST_HEAD(destroy_list);
+static struct fsnotify_mark_connector *connector_destroy_list;
static void fsnotify_mark_destroy_workfn(struct work_struct *work);
static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy_workfn);
+static void fsnotify_connector_destroy_workfn(struct work_struct *work);
+static DECLARE_WORK(connector_reaper_work, fsnotify_connector_destroy_workfn);
+
void fsnotify_get_mark(struct fsnotify_mark *mark)
{
+ WARN_ON_ONCE(!atomic_read(&mark->refcnt));
atomic_inc(&mark->refcnt);
}
-void fsnotify_put_mark(struct fsnotify_mark *mark)
+/*
+ * Get mark reference when we found the mark via lockless traversal of object
+ * list. Mark can be already removed from the list by now and on its way to be
+ * destroyed once SRCU period ends.
+ */
+static bool fsnotify_get_mark_safe(struct fsnotify_mark *mark)
{
- if (atomic_dec_and_test(&mark->refcnt)) {
- if (mark->group)
- fsnotify_put_group(mark->group);
- mark->free_mark(mark);
- }
+ return atomic_inc_not_zero(&mark->refcnt);
}
-/* Calculate mask of events for a list of marks */
-u32 fsnotify_recalc_mask(struct hlist_head *head)
+static void __fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
{
u32 new_mask = 0;
struct fsnotify_mark *mark;
- hlist_for_each_entry(mark, head, obj_list)
- new_mask |= mark->mask;
- return new_mask;
+ assert_spin_locked(&conn->lock);
+ hlist_for_each_entry(mark, &conn->list, obj_list) {
+ if (mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)
+ new_mask |= mark->mask;
+ }
+ if (conn->flags & FSNOTIFY_OBJ_TYPE_INODE)
+ conn->inode->i_fsnotify_mask = new_mask;
+ else if (conn->flags & FSNOTIFY_OBJ_TYPE_VFSMOUNT)
+ real_mount(conn->mnt)->mnt_fsnotify_mask = new_mask;
}
/*
- * Remove mark from inode / vfsmount list, group list, drop inode reference
- * if we got one.
- *
- * Must be called with group->mark_mutex held.
+ * Calculate mask of events for a list of marks. The caller must make sure
+ * connector and connector->inode cannot disappear under us. Callers achieve
+ * this by holding a mark->lock or mark->group->mark_mutex for a mark on this
+ * list.
*/
-void fsnotify_detach_mark(struct fsnotify_mark *mark)
+void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
+{
+ if (!conn)
+ return;
+
+ spin_lock(&conn->lock);
+ __fsnotify_recalc_mask(conn);
+ spin_unlock(&conn->lock);
+ if (conn->flags & FSNOTIFY_OBJ_TYPE_INODE)
+ __fsnotify_update_child_dentry_flags(conn->inode);
+}
+
+/* Free all connectors queued for freeing once SRCU period ends */
+static void fsnotify_connector_destroy_workfn(struct work_struct *work)
+{
+ struct fsnotify_mark_connector *conn, *free;
+
+ spin_lock(&destroy_lock);
+ conn = connector_destroy_list;
+ connector_destroy_list = NULL;
+ spin_unlock(&destroy_lock);
+
+ synchronize_srcu(&fsnotify_mark_srcu);
+ while (conn) {
+ free = conn;
+ conn = conn->destroy_next;
+ kmem_cache_free(fsnotify_mark_connector_cachep, free);
+ }
+}
+
+static struct inode *fsnotify_detach_connector_from_object(
+ struct fsnotify_mark_connector *conn)
{
struct inode *inode = NULL;
+
+ if (conn->flags & FSNOTIFY_OBJ_TYPE_INODE) {
+ inode = conn->inode;
+ rcu_assign_pointer(inode->i_fsnotify_marks, NULL);
+ inode->i_fsnotify_mask = 0;
+ conn->inode = NULL;
+ conn->flags &= ~FSNOTIFY_OBJ_TYPE_INODE;
+ } else if (conn->flags & FSNOTIFY_OBJ_TYPE_VFSMOUNT) {
+ rcu_assign_pointer(real_mount(conn->mnt)->mnt_fsnotify_marks,
+ NULL);
+ real_mount(conn->mnt)->mnt_fsnotify_mask = 0;
+ conn->mnt = NULL;
+ conn->flags &= ~FSNOTIFY_OBJ_TYPE_VFSMOUNT;
+ }
+
+ return inode;
+}
+
+static void fsnotify_final_mark_destroy(struct fsnotify_mark *mark)
+{
struct fsnotify_group *group = mark->group;
- BUG_ON(!mutex_is_locked(&group->mark_mutex));
+ if (WARN_ON_ONCE(!group))
+ return;
+ group->ops->free_mark(mark);
+ fsnotify_put_group(group);
+}
- spin_lock(&mark->lock);
+void fsnotify_put_mark(struct fsnotify_mark *mark)
+{
+ struct fsnotify_mark_connector *conn;
+ struct inode *inode = NULL;
+ bool free_conn = false;
- /* something else already called this function on this mark */
- if (!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) {
- spin_unlock(&mark->lock);
+ /* Catch marks that were actually never attached to object */
+ if (!mark->connector) {
+ if (atomic_dec_and_test(&mark->refcnt))
+ fsnotify_final_mark_destroy(mark);
return;
}
- mark->flags &= ~FSNOTIFY_MARK_FLAG_ATTACHED;
+ /*
+ * We have to be careful so that traversals of obj_list under lock can
+ * safely grab mark reference.
+ */
+ if (!atomic_dec_and_lock(&mark->refcnt, &mark->connector->lock))
+ return;
- if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) {
- inode = mark->inode;
- fsnotify_destroy_inode_mark(mark);
- } else if (mark->flags & FSNOTIFY_MARK_FLAG_VFSMOUNT)
- fsnotify_destroy_vfsmount_mark(mark);
- else
- BUG();
+ conn = mark->connector;
+ hlist_del_init_rcu(&mark->obj_list);
+ if (hlist_empty(&conn->list)) {
+ inode = fsnotify_detach_connector_from_object(conn);
+ free_conn = true;
+ } else {
+ __fsnotify_recalc_mask(conn);
+ }
+ mark->connector = NULL;
+ spin_unlock(&conn->lock);
+
+ iput(inode);
+
+ if (free_conn) {
+ spin_lock(&destroy_lock);
+ conn->destroy_next = connector_destroy_list;
+ connector_destroy_list = conn;
+ spin_unlock(&destroy_lock);
+ queue_work(system_unbound_wq, &connector_reaper_work);
+ }
/*
* Note that we didn't update flags telling whether inode cares about
* what's happening with children. We update these flags from
* __fsnotify_parent() lazily when next event happens on one of our
* children.
*/
+ spin_lock(&destroy_lock);
+ list_add(&mark->g_list, &destroy_list);
+ spin_unlock(&destroy_lock);
+ queue_delayed_work(system_unbound_wq, &reaper_work,
+ FSNOTIFY_REAPER_DELAY);
+}
- list_del_init(&mark->g_list);
+bool fsnotify_prepare_user_wait(struct fsnotify_iter_info *iter_info)
+{
+ struct fsnotify_group *group;
- spin_unlock(&mark->lock);
+ if (WARN_ON_ONCE(!iter_info->inode_mark && !iter_info->vfsmount_mark))
+ return false;
+
+ if (iter_info->inode_mark)
+ group = iter_info->inode_mark->group;
+ else
+ group = iter_info->vfsmount_mark->group;
+
+ /*
+ * Since acquisition of mark reference is an atomic op as well, we can
+ * be sure this inc is seen before any effect of refcount increment.
+ */
+ atomic_inc(&group->user_waits);
+
+ if (iter_info->inode_mark) {
+ /* This can fail if mark is being removed */
+ if (!fsnotify_get_mark_safe(iter_info->inode_mark))
+ goto out_wait;
+ }
+ if (iter_info->vfsmount_mark) {
+ if (!fsnotify_get_mark_safe(iter_info->vfsmount_mark))
+ goto out_inode;
+ }
- if (inode && (mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED))
- iput(inode);
+ /*
+ * Now that both marks are pinned by refcount in the inode / vfsmount
+ * lists, we can drop SRCU lock, and safely resume the list iteration
+ * once userspace returns.
+ */
+ srcu_read_unlock(&fsnotify_mark_srcu, iter_info->srcu_idx);
+
+ return true;
+out_inode:
+ if (iter_info->inode_mark)
+ fsnotify_put_mark(iter_info->inode_mark);
+out_wait:
+ if (atomic_dec_and_test(&group->user_waits) && group->shutdown)
+ wake_up(&group->notification_waitq);
+ return false;
+}
+
+void fsnotify_finish_user_wait(struct fsnotify_iter_info *iter_info)
+{
+ struct fsnotify_group *group = NULL;
+
+ iter_info->srcu_idx = srcu_read_lock(&fsnotify_mark_srcu);
+ if (iter_info->inode_mark) {
+ group = iter_info->inode_mark->group;
+ fsnotify_put_mark(iter_info->inode_mark);
+ }
+ if (iter_info->vfsmount_mark) {
+ group = iter_info->vfsmount_mark->group;
+ fsnotify_put_mark(iter_info->vfsmount_mark);
+ }
+ /*
+ * We abuse notification_waitq on group shutdown for waiting for all
+ * marks pinned when waiting for userspace.
+ */
+ if (atomic_dec_and_test(&group->user_waits) && group->shutdown)
+ wake_up(&group->notification_waitq);
+}
+
+/*
+ * Mark mark as detached, remove it from group list. Mark still stays in object
+ * list until its last reference is dropped. Note that we rely on mark being
+ * removed from group list before corresponding reference to it is dropped. In
+ * particular we rely on mark->connector being valid while we hold
+ * group->mark_mutex if we found the mark through g_list.
+ *
+ * Must be called with group->mark_mutex held. The caller must either hold
+ * reference to the mark or be protected by fsnotify_mark_srcu.
+ */
+void fsnotify_detach_mark(struct fsnotify_mark *mark)
+{
+ struct fsnotify_group *group = mark->group;
+
+ WARN_ON_ONCE(!mutex_is_locked(&group->mark_mutex));
+ WARN_ON_ONCE(!srcu_read_lock_held(&fsnotify_mark_srcu) &&
+ atomic_read(&mark->refcnt) < 1 +
+ !!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED));
+
+ spin_lock(&mark->lock);
+ /* something else already called this function on this mark */
+ if (!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) {
+ spin_unlock(&mark->lock);
+ return;
+ }
+ mark->flags &= ~FSNOTIFY_MARK_FLAG_ATTACHED;
+ list_del_init(&mark->g_list);
+ spin_unlock(&mark->lock);
atomic_dec(&group->num_marks);
+
+ /* Drop mark reference acquired in fsnotify_add_mark_locked() */
+ fsnotify_put_mark(mark);
}
/*
- * Prepare mark for freeing and add it to the list of marks prepared for
- * freeing. The actual freeing must happen after SRCU period ends and the
- * caller is responsible for this.
+ * Free fsnotify mark. The mark is actually only marked as being freed. The
+ * freeing is actually happening only once last reference to the mark is
+ * dropped from a workqueue which first waits for srcu period end.
*
- * The function returns true if the mark was added to the list of marks for
- * freeing. The function returns false if someone else has already called
- * __fsnotify_free_mark() for the mark.
+ * Caller must have a reference to the mark or be protected by
+ * fsnotify_mark_srcu.
*/
-static bool __fsnotify_free_mark(struct fsnotify_mark *mark)
+void fsnotify_free_mark(struct fsnotify_mark *mark)
{
struct fsnotify_group *group = mark->group;
@@ -189,7 +373,7 @@ static bool __fsnotify_free_mark(struct fsnotify_mark *mark)
/* something else already called this function on this mark */
if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) {
spin_unlock(&mark->lock);
- return false;
+ return;
}
mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
spin_unlock(&mark->lock);
@@ -201,25 +385,6 @@ static bool __fsnotify_free_mark(struct fsnotify_mark *mark)
*/
if (group->ops->freeing_mark)
group->ops->freeing_mark(mark, group);
-
- spin_lock(&destroy_lock);
- list_add(&mark->g_list, &destroy_list);
- spin_unlock(&destroy_lock);
-
- return true;
-}
-
-/*
- * Free fsnotify mark. The freeing is actually happening from a workqueue which
- * first waits for srcu period end. Caller must have a reference to the mark
- * or be protected by fsnotify_mark_srcu.
- */
-void fsnotify_free_mark(struct fsnotify_mark *mark)
-{
- if (__fsnotify_free_mark(mark)) {
- queue_delayed_work(system_unbound_wq, &reaper_work,
- FSNOTIFY_REAPER_DELAY);
- }
}
void fsnotify_destroy_mark(struct fsnotify_mark *mark,
@@ -231,54 +396,6 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark,
fsnotify_free_mark(mark);
}
-void fsnotify_destroy_marks(struct hlist_head *head, spinlock_t *lock)
-{
- struct fsnotify_mark *mark;
-
- while (1) {
- /*
- * We have to be careful since we can race with e.g.
- * fsnotify_clear_marks_by_group() and once we drop 'lock',
- * mark can get removed from the obj_list and destroyed. But
- * we are holding mark reference so mark cannot be freed and
- * calling fsnotify_destroy_mark() more than once is fine.
- */
- spin_lock(lock);
- if (hlist_empty(head)) {
- spin_unlock(lock);
- break;
- }
- mark = hlist_entry(head->first, struct fsnotify_mark, obj_list);
- /*
- * We don't update i_fsnotify_mask / mnt_fsnotify_mask here
- * since inode / mount is going away anyway. So just remove
- * mark from the list.
- */
- hlist_del_init_rcu(&mark->obj_list);
- fsnotify_get_mark(mark);
- spin_unlock(lock);
- fsnotify_destroy_mark(mark, mark->group);
- fsnotify_put_mark(mark);
- }
-}
-
-void fsnotify_set_mark_mask_locked(struct fsnotify_mark *mark, __u32 mask)
-{
- assert_spin_locked(&mark->lock);
-
- mark->mask = mask;
-
- if (mark->flags & FSNOTIFY_MARK_FLAG_INODE)
- fsnotify_set_inode_mark_mask_locked(mark, mask);
-}
-
-void fsnotify_set_mark_ignored_mask_locked(struct fsnotify_mark *mark, __u32 mask)
-{
- assert_spin_locked(&mark->lock);
-
- mark->ignored_mask = mask;
-}
-
/*
* Sorting function for lists of fsnotify marks.
*
@@ -315,37 +432,133 @@ int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b)
return -1;
}
-/* Add mark into proper place in given list of marks */
-int fsnotify_add_mark_list(struct hlist_head *head, struct fsnotify_mark *mark,
- int allow_dups)
+static int fsnotify_attach_connector_to_object(
+ struct fsnotify_mark_connector __rcu **connp,
+ struct inode *inode,
+ struct vfsmount *mnt)
+{
+ struct fsnotify_mark_connector *conn;
+
+ conn = kmem_cache_alloc(fsnotify_mark_connector_cachep, GFP_KERNEL);
+ if (!conn)
+ return -ENOMEM;
+ spin_lock_init(&conn->lock);
+ INIT_HLIST_HEAD(&conn->list);
+ if (inode) {
+ conn->flags = FSNOTIFY_OBJ_TYPE_INODE;
+ conn->inode = igrab(inode);
+ } else {
+ conn->flags = FSNOTIFY_OBJ_TYPE_VFSMOUNT;
+ conn->mnt = mnt;
+ }
+ /*
+ * cmpxchg() provides the barrier so that readers of *connp can see
+ * only initialized structure
+ */
+ if (cmpxchg(connp, NULL, conn)) {
+ /* Someone else created list structure for us */
+ if (inode)
+ iput(inode);
+ kmem_cache_free(fsnotify_mark_connector_cachep, conn);
+ }
+
+ return 0;
+}
+
+/*
+ * Get mark connector, make sure it is alive and return with its lock held.
+ * This is for users that get connector pointer from inode or mount. Users that
+ * hold reference to a mark on the list may directly lock connector->lock as
+ * they are sure list cannot go away under them.
+ */
+static struct fsnotify_mark_connector *fsnotify_grab_connector(
+ struct fsnotify_mark_connector __rcu **connp)
+{
+ struct fsnotify_mark_connector *conn;
+ int idx;
+
+ idx = srcu_read_lock(&fsnotify_mark_srcu);
+ conn = srcu_dereference(*connp, &fsnotify_mark_srcu);
+ if (!conn)
+ goto out;
+ spin_lock(&conn->lock);
+ if (!(conn->flags & (FSNOTIFY_OBJ_TYPE_INODE |
+ FSNOTIFY_OBJ_TYPE_VFSMOUNT))) {
+ spin_unlock(&conn->lock);
+ srcu_read_unlock(&fsnotify_mark_srcu, idx);
+ return NULL;
+ }
+out:
+ srcu_read_unlock(&fsnotify_mark_srcu, idx);
+ return conn;
+}
+
+/*
+ * Add mark into proper place in given list of marks. These marks may be used
+ * for the fsnotify backend to determine which event types should be delivered
+ * to which group and for which inodes. These marks are ordered according to
+ * priority, highest number first, and then by the group's location in memory.
+ */
+static int fsnotify_add_mark_list(struct fsnotify_mark *mark,
+ struct inode *inode, struct vfsmount *mnt,
+ int allow_dups)
{
struct fsnotify_mark *lmark, *last = NULL;
+ struct fsnotify_mark_connector *conn;
+ struct fsnotify_mark_connector __rcu **connp;
int cmp;
+ int err = 0;
+
+ if (WARN_ON(!inode && !mnt))
+ return -EINVAL;
+ if (inode)
+ connp = &inode->i_fsnotify_marks;
+ else
+ connp = &real_mount(mnt)->mnt_fsnotify_marks;
+restart:
+ spin_lock(&mark->lock);
+ conn = fsnotify_grab_connector(connp);
+ if (!conn) {
+ spin_unlock(&mark->lock);
+ err = fsnotify_attach_connector_to_object(connp, inode, mnt);
+ if (err)
+ return err;
+ goto restart;
+ }
/* is mark the first mark? */
- if (hlist_empty(head)) {
- hlist_add_head_rcu(&mark->obj_list, head);
- return 0;
+ if (hlist_empty(&conn->list)) {
+ hlist_add_head_rcu(&mark->obj_list, &conn->list);
+ goto added;
}
/* should mark be in the middle of the current list? */
- hlist_for_each_entry(lmark, head, obj_list) {
+ hlist_for_each_entry(lmark, &conn->list, obj_list) {
last = lmark;
- if ((lmark->group == mark->group) && !allow_dups)
- return -EEXIST;
+ if ((lmark->group == mark->group) &&
+ (lmark->flags & FSNOTIFY_MARK_FLAG_ATTACHED) &&
+ !allow_dups) {
+ err = -EEXIST;
+ goto out_err;
+ }
cmp = fsnotify_compare_groups(lmark->group, mark->group);
if (cmp >= 0) {
hlist_add_before_rcu(&mark->obj_list, &lmark->obj_list);
- return 0;
+ goto added;
}
}
BUG_ON(last == NULL);
/* mark should be the last entry. last is the current last entry */
hlist_add_behind_rcu(&mark->obj_list, &last->obj_list);
- return 0;
+added:
+ mark->connector = conn;
+out_err:
+ spin_unlock(&conn->lock);
+ spin_unlock(&mark->lock);
+ return err;
}
/*
@@ -353,10 +566,10 @@ int fsnotify_add_mark_list(struct hlist_head *head, struct fsnotify_mark *mark,
* These marks may be used for the fsnotify backend to determine which
* event types should be delivered to which group.
*/
-int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
- struct fsnotify_group *group, struct inode *inode,
+int fsnotify_add_mark_locked(struct fsnotify_mark *mark, struct inode *inode,
struct vfsmount *mnt, int allow_dups)
{
+ struct fsnotify_group *group = mark->group;
int ret = 0;
BUG_ON(inode && mnt);
@@ -367,61 +580,42 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
* LOCKING ORDER!!!!
* group->mark_mutex
* mark->lock
- * inode->i_lock
+ * mark->connector->lock
*/
spin_lock(&mark->lock);
mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_ATTACHED;
- fsnotify_get_group(group);
- mark->group = group;
list_add(&mark->g_list, &group->marks_list);
atomic_inc(&group->num_marks);
- fsnotify_get_mark(mark); /* for i_list and g_list */
-
- if (inode) {
- ret = fsnotify_add_inode_mark(mark, group, inode, allow_dups);
- if (ret)
- goto err;
- } else if (mnt) {
- ret = fsnotify_add_vfsmount_mark(mark, group, mnt, allow_dups);
- if (ret)
- goto err;
- } else {
- BUG();
- }
-
- /* this will pin the object if appropriate */
- fsnotify_set_mark_mask_locked(mark, mark->mask);
+ fsnotify_get_mark(mark); /* for g_list */
spin_unlock(&mark->lock);
- if (inode)
- __fsnotify_update_child_dentry_flags(inode);
+ ret = fsnotify_add_mark_list(mark, inode, mnt, allow_dups);
+ if (ret)
+ goto err;
+
+ if (mark->mask)
+ fsnotify_recalc_mask(mark->connector);
return ret;
err:
- mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
+ mark->flags &= ~(FSNOTIFY_MARK_FLAG_ALIVE |
+ FSNOTIFY_MARK_FLAG_ATTACHED);
list_del_init(&mark->g_list);
- fsnotify_put_group(group);
- mark->group = NULL;
atomic_dec(&group->num_marks);
- spin_unlock(&mark->lock);
-
- spin_lock(&destroy_lock);
- list_add(&mark->g_list, &destroy_list);
- spin_unlock(&destroy_lock);
- queue_delayed_work(system_unbound_wq, &reaper_work,
- FSNOTIFY_REAPER_DELAY);
-
+ fsnotify_put_mark(mark);
return ret;
}
-int fsnotify_add_mark(struct fsnotify_mark *mark, struct fsnotify_group *group,
- struct inode *inode, struct vfsmount *mnt, int allow_dups)
+int fsnotify_add_mark(struct fsnotify_mark *mark, struct inode *inode,
+ struct vfsmount *mnt, int allow_dups)
{
int ret;
+ struct fsnotify_group *group = mark->group;
+
mutex_lock(&group->mark_mutex);
- ret = fsnotify_add_mark_locked(mark, group, inode, mnt, allow_dups);
+ ret = fsnotify_add_mark_locked(mark, inode, mnt, allow_dups);
mutex_unlock(&group->mark_mutex);
return ret;
}
@@ -430,29 +624,42 @@ int fsnotify_add_mark(struct fsnotify_mark *mark, struct fsnotify_group *group,
* Given a list of marks, find the mark associated with given group. If found
* take a reference to that mark and return it, else return NULL.
*/
-struct fsnotify_mark *fsnotify_find_mark(struct hlist_head *head,
- struct fsnotify_group *group)
+struct fsnotify_mark *fsnotify_find_mark(
+ struct fsnotify_mark_connector __rcu **connp,
+ struct fsnotify_group *group)
{
+ struct fsnotify_mark_connector *conn;
struct fsnotify_mark *mark;
- hlist_for_each_entry(mark, head, obj_list) {
- if (mark->group == group) {
+ conn = fsnotify_grab_connector(connp);
+ if (!conn)
+ return NULL;
+
+ hlist_for_each_entry(mark, &conn->list, obj_list) {
+ if (mark->group == group &&
+ (mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) {
fsnotify_get_mark(mark);
+ spin_unlock(&conn->lock);
return mark;
}
}
+ spin_unlock(&conn->lock);
return NULL;
}
-/*
- * clear any marks in a group in which mark->flags & flags is true
- */
-void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group,
- unsigned int flags)
+/* Clear any marks in a group with given type */
+void fsnotify_clear_marks_by_group(struct fsnotify_group *group,
+ unsigned int type)
{
struct fsnotify_mark *lmark, *mark;
LIST_HEAD(to_free);
+ struct list_head *head = &to_free;
+ /* Skip selection step if we want to clear all marks. */
+ if (type == FSNOTIFY_OBJ_ALL_TYPES) {
+ head = &group->marks_list;
+ goto clear;
+ }
/*
* We have to be really careful here. Anytime we drop mark_mutex, e.g.
* fsnotify_clear_marks_by_inode() can come and free marks. Even in our
@@ -464,18 +671,19 @@ void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group,
*/
mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) {
- if (mark->flags & flags)
+ if (mark->connector->flags & type)
list_move(&mark->g_list, &to_free);
}
mutex_unlock(&group->mark_mutex);
+clear:
while (1) {
mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
- if (list_empty(&to_free)) {
+ if (list_empty(head)) {
mutex_unlock(&group->mark_mutex);
break;
}
- mark = list_first_entry(&to_free, struct fsnotify_mark, g_list);
+ mark = list_first_entry(head, struct fsnotify_mark, g_list);
fsnotify_get_mark(mark);
fsnotify_detach_mark(mark);
mutex_unlock(&group->mark_mutex);
@@ -484,49 +692,62 @@ void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group,
}
}
-/*
- * Given a group, prepare for freeing all the marks associated with that group.
- * The marks are attached to the list of marks prepared for destruction, the
- * caller is responsible for freeing marks in that list after SRCU period has
- * ended.
- */
-void fsnotify_detach_group_marks(struct fsnotify_group *group)
+/* Destroy all marks attached to inode / vfsmount */
+void fsnotify_destroy_marks(struct fsnotify_mark_connector __rcu **connp)
{
- struct fsnotify_mark *mark;
+ struct fsnotify_mark_connector *conn;
+ struct fsnotify_mark *mark, *old_mark = NULL;
+ struct inode *inode;
- while (1) {
- mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
- if (list_empty(&group->marks_list)) {
- mutex_unlock(&group->mark_mutex);
- break;
- }
- mark = list_first_entry(&group->marks_list,
- struct fsnotify_mark, g_list);
+ conn = fsnotify_grab_connector(connp);
+ if (!conn)
+ return;
+ /*
+ * We have to be careful since we can race with e.g.
+ * fsnotify_clear_marks_by_group() and once we drop the conn->lock, the
+ * list can get modified. However we are holding mark reference and
+ * thus our mark cannot be removed from obj_list so we can continue
+ * iteration after regaining conn->lock.
+ */
+ hlist_for_each_entry(mark, &conn->list, obj_list) {
fsnotify_get_mark(mark);
- fsnotify_detach_mark(mark);
- mutex_unlock(&group->mark_mutex);
- __fsnotify_free_mark(mark);
- fsnotify_put_mark(mark);
+ spin_unlock(&conn->lock);
+ if (old_mark)
+ fsnotify_put_mark(old_mark);
+ old_mark = mark;
+ fsnotify_destroy_mark(mark, mark->group);
+ spin_lock(&conn->lock);
}
+ /*
+ * Detach list from object now so that we don't pin inode until all
+ * mark references get dropped. It would lead to strange results such
+ * as delaying inode deletion or blocking unmount.
+ */
+ inode = fsnotify_detach_connector_from_object(conn);
+ spin_unlock(&conn->lock);
+ if (old_mark)
+ fsnotify_put_mark(old_mark);
+ iput(inode);
}
/*
* Nothing fancy, just initialize lists and locks and counters.
*/
void fsnotify_init_mark(struct fsnotify_mark *mark,
- void (*free_mark)(struct fsnotify_mark *mark))
+ struct fsnotify_group *group)
{
memset(mark, 0, sizeof(*mark));
spin_lock_init(&mark->lock);
atomic_set(&mark->refcnt, 1);
- mark->free_mark = free_mark;
+ fsnotify_get_group(group);
+ mark->group = group;
}
/*
* Destroy all marks in destroy_list, waits for SRCU period to finish before
* actually freeing marks.
*/
-void fsnotify_mark_destroy_list(void)
+static void fsnotify_mark_destroy_workfn(struct work_struct *work)
{
struct fsnotify_mark *mark, *next;
struct list_head private_destroy_list;
@@ -540,11 +761,12 @@ void fsnotify_mark_destroy_list(void)
list_for_each_entry_safe(mark, next, &private_destroy_list, g_list) {
list_del_init(&mark->g_list);
- fsnotify_put_mark(mark);
+ fsnotify_final_mark_destroy(mark);
}
}
-static void fsnotify_mark_destroy_workfn(struct work_struct *work)
+/* Wait for all marks queued for destruction to be actually destroyed */
+void fsnotify_wait_marks_destroyed(void)
{
- fsnotify_mark_destroy_list();
+ flush_delayed_work(&reaper_work);
}
diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c
deleted file mode 100644
index a8fcab68faef..000000000000
--- a/fs/notify/vfsmount_mark.c
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2, or (at your option)
- * any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; see the file COPYING. If not, write to
- * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#include <linux/fs.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/mount.h>
-#include <linux/mutex.h>
-#include <linux/spinlock.h>
-
-#include <linux/atomic.h>
-
-#include <linux/fsnotify_backend.h>
-#include "fsnotify.h"
-
-void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group)
-{
- fsnotify_clear_marks_by_group_flags(group, FSNOTIFY_MARK_FLAG_VFSMOUNT);
-}
-
-/*
- * Recalculate the mnt->mnt_fsnotify_mask, or the mask of all FS_* event types
- * any notifier is interested in hearing for this mount point
- */
-void fsnotify_recalc_vfsmount_mask(struct vfsmount *mnt)
-{
- struct mount *m = real_mount(mnt);
-
- spin_lock(&mnt->mnt_root->d_lock);
- m->mnt_fsnotify_mask = fsnotify_recalc_mask(&m->mnt_fsnotify_marks);
- spin_unlock(&mnt->mnt_root->d_lock);
-}
-
-void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark)
-{
- struct vfsmount *mnt = mark->mnt;
- struct mount *m = real_mount(mnt);
-
- BUG_ON(!mutex_is_locked(&mark->group->mark_mutex));
- assert_spin_locked(&mark->lock);
-
- spin_lock(&mnt->mnt_root->d_lock);
-
- hlist_del_init_rcu(&mark->obj_list);
- mark->mnt = NULL;
-
- m->mnt_fsnotify_mask = fsnotify_recalc_mask(&m->mnt_fsnotify_marks);
- spin_unlock(&mnt->mnt_root->d_lock);
-}
-
-/*
- * given a group and vfsmount, find the mark associated with that combination.
- * if found take a reference to that mark and return it, else return NULL
- */
-struct fsnotify_mark *fsnotify_find_vfsmount_mark(struct fsnotify_group *group,
- struct vfsmount *mnt)
-{
- struct mount *m = real_mount(mnt);
- struct fsnotify_mark *mark;
-
- spin_lock(&mnt->mnt_root->d_lock);
- mark = fsnotify_find_mark(&m->mnt_fsnotify_marks, group);
- spin_unlock(&mnt->mnt_root->d_lock);
-
- return mark;
-}
-
-/*
- * Attach an initialized mark to a given group and vfsmount.
- * These marks may be used for the fsnotify backend to determine which
- * event types should be delivered to which groups.
- */
-int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
- struct fsnotify_group *group, struct vfsmount *mnt,
- int allow_dups)
-{
- struct mount *m = real_mount(mnt);
- int ret;
-
- mark->flags |= FSNOTIFY_MARK_FLAG_VFSMOUNT;
-
- BUG_ON(!mutex_is_locked(&group->mark_mutex));
- assert_spin_locked(&mark->lock);
-
- spin_lock(&mnt->mnt_root->d_lock);
- mark->mnt = mnt;
- ret = fsnotify_add_mark_list(&m->mnt_fsnotify_marks, mark, allow_dups);
- m->mnt_fsnotify_mask = fsnotify_recalc_mask(&m->mnt_fsnotify_marks);
- spin_unlock(&mnt->mnt_root->d_lock);
-
- return ret;
-}
diff --git a/fs/nsfs.c b/fs/nsfs.c
index 1656843e87d2..f3db56e83dd2 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -91,6 +91,7 @@ slow:
return ERR_PTR(-ENOMEM);
}
d_instantiate(dentry, inode);
+ dentry->d_flags |= DCACHE_RCUACCESS;
dentry->d_fsdata = (void *)ns->ops;
d = atomic_long_cmpxchg(&ns->stashed, 0, (unsigned long)dentry);
if (d) {
@@ -195,9 +196,11 @@ int ns_get_name(char *buf, size_t size, struct task_struct *task,
{
struct ns_common *ns;
int res = -ENOENT;
+ const char *name;
ns = ns_ops->get(task);
if (ns) {
- res = snprintf(buf, size, "%s:[%u]", ns_ops->name, ns->inum);
+ name = ns_ops->real_ns_name ? : ns_ops->name;
+ res = snprintf(buf, size, "%s:[%u]", name, ns->inum);
ns_ops->put(ns);
}
return res;
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index f6e871760f8d..0da0332725aa 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -2242,13 +2242,13 @@ unlock:
spin_unlock(&o2hb_live_lock);
}
-static ssize_t o2hb_heartbeat_group_threshold_show(struct config_item *item,
+static ssize_t o2hb_heartbeat_group_dead_threshold_show(struct config_item *item,
char *page)
{
return sprintf(page, "%u\n", o2hb_dead_threshold);
}
-static ssize_t o2hb_heartbeat_group_threshold_store(struct config_item *item,
+static ssize_t o2hb_heartbeat_group_dead_threshold_store(struct config_item *item,
const char *page, size_t count)
{
unsigned long tmp;
@@ -2297,11 +2297,11 @@ static ssize_t o2hb_heartbeat_group_mode_store(struct config_item *item,
}
-CONFIGFS_ATTR(o2hb_heartbeat_group_, threshold);
+CONFIGFS_ATTR(o2hb_heartbeat_group_, dead_threshold);
CONFIGFS_ATTR(o2hb_heartbeat_group_, mode);
static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = {
- &o2hb_heartbeat_group_attr_threshold,
+ &o2hb_heartbeat_group_attr_dead_threshold,
&o2hb_heartbeat_group_attr_mode,
NULL,
};
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index d0ab7e56d0b4..8d779227370a 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -450,9 +450,8 @@ static struct o2net_sock_container *sc_alloc(struct o2nm_node *node)
INIT_WORK(&sc->sc_shutdown_work, o2net_shutdown_sc);
INIT_DELAYED_WORK(&sc->sc_keepalive_work, o2net_sc_send_keep_req);
- init_timer(&sc->sc_idle_timeout);
- sc->sc_idle_timeout.function = o2net_idle_timer;
- sc->sc_idle_timeout.data = (unsigned long)sc;
+ setup_timer(&sc->sc_idle_timeout, o2net_idle_timer,
+ (unsigned long)sc);
sclog(sc, "alloced\n");
@@ -956,7 +955,7 @@ static void o2net_sendpage(struct o2net_sock_container *sc,
mutex_lock(&sc->sc_send_lock);
ret = sc->sc_sock->ops->sendpage(sc->sc_sock,
virt_to_page(kmalloced_virt),
- (long)kmalloced_virt & ~PAGE_MASK,
+ offset_in_page(kmalloced_virt),
size, MSG_DONTWAIT);
mutex_unlock(&sc->sc_send_lock);
if (ret == size)
@@ -1460,27 +1459,10 @@ static void o2net_rx_until_empty(struct work_struct *work)
static int o2net_set_nodelay(struct socket *sock)
{
- int ret, val = 1;
- mm_segment_t oldfs;
+ int val = 1;
- oldfs = get_fs();
- set_fs(KERNEL_DS);
-
- /*
- * Dear unsuspecting programmer,
- *
- * Don't use sock_setsockopt() for SOL_TCP. It doesn't check its level
- * argument and assumes SOL_SOCKET so, say, your TCP_NODELAY will
- * silently turn into SO_DEBUG.
- *
- * Yours,
- * Keeper of hilariously fragile interfaces.
- */
- ret = sock->ops->setsockopt(sock, SOL_TCP, TCP_NODELAY,
- (char __user *)&val, sizeof(val));
-
- set_fs(oldfs);
- return ret;
+ return kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY,
+ (void *)&val, sizeof(val));
}
static int o2net_set_usertimeout(struct socket *sock)
@@ -1488,7 +1470,7 @@ static int o2net_set_usertimeout(struct socket *sock)
int user_timeout = O2NET_TCP_USER_TIMEOUT;
return kernel_setsockopt(sock, SOL_TCP, TCP_USER_TIMEOUT,
- (char *)&user_timeout, sizeof(user_timeout));
+ (void *)&user_timeout, sizeof(user_timeout));
}
static void o2net_initialize_handshake(void)
diff --git a/fs/open.c b/fs/open.c
index 949cef29c3bb..6d2d2b33ac54 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -193,7 +193,8 @@ static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
goto out_putf;
error = -EPERM;
- if (IS_APPEND(inode))
+ /* Check IS_APPEND on real upper inode */
+ if (IS_APPEND(file_inode(f.file)))
goto out_putf;
sb_start_write(inode->i_sb);
@@ -900,6 +901,12 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o
int lookup_flags = 0;
int acc_mode = ACC_MODE(flags);
+ /*
+ * Clear out all open flags we don't know about so that we don't report
+ * them in fcntl(F_GETFD) or similar interfaces.
+ */
+ flags &= VALID_OPEN_FLAGS;
+
if (flags & (O_CREAT | __O_TMPFILE))
op->mode = (mode & S_IALLUGO) | S_IFREG;
else
@@ -1078,6 +1085,26 @@ SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags,
return do_sys_open(dfd, filename, flags, mode);
}
+#ifdef CONFIG_COMPAT
+/*
+ * Exactly like sys_open(), except that it doesn't set the
+ * O_LARGEFILE flag.
+ */
+COMPAT_SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
+{
+ return do_sys_open(AT_FDCWD, filename, flags, mode);
+}
+
+/*
+ * Exactly like sys_openat(), except that it doesn't set the
+ * O_LARGEFILE flag.
+ */
+COMPAT_SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, umode_t, mode)
+{
+ return do_sys_open(dfd, filename, flags, mode);
+}
+#endif
+
#ifndef __alpha__
/*
diff --git a/fs/orangefs/devorangefs-req.c b/fs/orangefs/devorangefs-req.c
index e1534c9bab16..c19f0787c9c6 100644
--- a/fs/orangefs/devorangefs-req.c
+++ b/fs/orangefs/devorangefs-req.c
@@ -180,6 +180,10 @@ static ssize_t orangefs_devreq_read(struct file *file,
return -EINVAL;
}
+ /* Check for an empty list before locking. */
+ if (list_empty(&orangefs_request_list))
+ return -EAGAIN;
+
restart:
/* Get next op (if any) from top of list. */
spin_lock(&orangefs_request_list_lock);
diff --git a/fs/orangefs/dir.c b/fs/orangefs/dir.c
index 284373a57a08..d327cbd17756 100644
--- a/fs/orangefs/dir.c
+++ b/fs/orangefs/dir.c
@@ -1,396 +1,404 @@
/*
- * (C) 2001 Clemson University and The University of Chicago
- *
- * See COPYING in top-level directory.
+ * Copyright 2017 Omnibond Systems, L.L.C.
*/
#include "protocol.h"
#include "orangefs-kernel.h"
#include "orangefs-bufmap.h"
+struct orangefs_dir_part {
+ struct orangefs_dir_part *next;
+ size_t len;
+};
+
+struct orangefs_dir {
+ __u64 token;
+ struct orangefs_dir_part *part;
+ loff_t end;
+ int error;
+};
+
+#define PART_SHIFT (24)
+#define PART_SIZE (1<<24)
+#define PART_MASK (~(PART_SIZE - 1))
+
/*
- * decode routine used by kmod to deal with the blob sent from
- * userspace for readdirs. The blob contains zero or more of these
- * sub-blobs:
- * __u32 - represents length of the character string that follows.
- * string - between 1 and ORANGEFS_NAME_MAX bytes long.
- * padding - (if needed) to cause the __u32 plus the string to be
- * eight byte aligned.
- * khandle - sizeof(khandle) bytes.
+ * There can be up to 512 directory entries. Each entry is encoded as
+ * follows:
+ * 4 bytes: string size (n)
+ * n bytes: string
+ * 1 byte: trailing zero
+ * padding to 8 bytes
+ * 16 bytes: khandle
+ * padding to 8 bytes
+ *
+ * The trailer_buf starts with a struct orangefs_readdir_response_s
+ * which must be skipped to get to the directory data.
+ *
+ * The data which is received from the userspace daemon is termed a
+ * part and is stored in a linked list in case more than one part is
+ * needed for a large directory.
+ *
+ * The position pointer (ctx->pos) encodes the part and offset on which
+ * to begin reading at. Bits above PART_SHIFT encode the part and bits
+ * below PART_SHIFT encode the offset. Parts are stored in a linked
+ * list which grows as data is received from the server. The overhead
+ * associated with managing the list is presumed to be small compared to
+ * the overhead of communicating with the server.
+ *
+ * As data is received from the server, it is placed at the end of the
+ * part list. Data is parsed from the current position as it is needed.
+ * When data is determined to be corrupt, it is either because the
+ * userspace component has sent back corrupt data or because the file
+ * pointer has been moved to an invalid location. Since the two cannot
+ * be differentiated, return EIO.
+ *
+ * Part zero is synthesized to contains `.' and `..'. Part one is the
+ * first part of the part list.
*/
-static long decode_dirents(char *ptr, size_t size,
- struct orangefs_readdir_response_s *readdir)
+
+static int do_readdir(struct orangefs_inode_s *oi,
+ struct orangefs_dir *od, struct dentry *dentry,
+ struct orangefs_kernel_op_s *op)
{
- int i;
- struct orangefs_readdir_response_s *rd =
- (struct orangefs_readdir_response_s *) ptr;
- char *buf = ptr;
- int khandle_size = sizeof(struct orangefs_khandle);
- size_t offset = offsetof(struct orangefs_readdir_response_s,
- dirent_array);
- /* 8 reflects eight byte alignment */
- int smallest_blob = khandle_size + 8;
- __u32 len;
- int aligned_len;
- int sizeof_u32 = sizeof(__u32);
- long ret;
-
- gossip_debug(GOSSIP_DIR_DEBUG, "%s: size:%zu:\n", __func__, size);
-
- /* size is = offset on empty dirs, > offset on non-empty dirs... */
- if (size < offset) {
- gossip_err("%s: size:%zu: offset:%zu:\n",
- __func__,
- size,
- offset);
- ret = -EINVAL;
- goto out;
- }
+ struct orangefs_readdir_response_s *resp;
+ int bufi, r;
- if ((size == offset) && (readdir->orangefs_dirent_outcount != 0)) {
- gossip_err("%s: size:%zu: dirent_outcount:%d:\n",
- __func__,
- size,
- readdir->orangefs_dirent_outcount);
- ret = -EINVAL;
- goto out;
- }
+ /*
+ * Despite the badly named field, readdir does not use shared
+ * memory. However, there are a limited number of readdir
+ * slots, which must be allocated here. This flag simply tells
+ * the op scheduler to return the op here for retry.
+ */
+ op->uses_shared_memory = 1;
+ op->upcall.req.readdir.refn = oi->refn;
+ op->upcall.req.readdir.token = od->token;
+ op->upcall.req.readdir.max_dirent_count =
+ ORANGEFS_MAX_DIRENT_COUNT_READDIR;
- readdir->token = rd->token;
- readdir->orangefs_dirent_outcount = rd->orangefs_dirent_outcount;
- readdir->dirent_array = kcalloc(readdir->orangefs_dirent_outcount,
- sizeof(*readdir->dirent_array),
- GFP_KERNEL);
- if (readdir->dirent_array == NULL) {
- gossip_err("%s: kcalloc failed.\n", __func__);
- ret = -ENOMEM;
- goto out;
+again:
+ bufi = orangefs_readdir_index_get();
+ if (bufi < 0) {
+ od->error = bufi;
+ return bufi;
}
- buf += offset;
- size -= offset;
-
- for (i = 0; i < readdir->orangefs_dirent_outcount; i++) {
- if (size < smallest_blob) {
- gossip_err("%s: size:%zu: smallest_blob:%d:\n",
- __func__,
- size,
- smallest_blob);
- ret = -EINVAL;
- goto free;
- }
+ op->upcall.req.readdir.buf_index = bufi;
- len = *(__u32 *)buf;
- if ((len < 1) || (len > ORANGEFS_NAME_MAX)) {
- gossip_err("%s: len:%d:\n", __func__, len);
- ret = -EINVAL;
- goto free;
- }
+ r = service_operation(op, "orangefs_readdir",
+ get_interruptible_flag(dentry->d_inode));
- gossip_debug(GOSSIP_DIR_DEBUG,
- "%s: size:%zu: len:%d:\n",
- __func__,
- size,
- len);
+ orangefs_readdir_index_put(bufi);
- readdir->dirent_array[i].d_name = buf + sizeof_u32;
- readdir->dirent_array[i].d_length = len;
+ if (op_state_purged(op)) {
+ if (r == -EAGAIN) {
+ vfree(op->downcall.trailer_buf);
+ goto again;
+ } else if (r == -EIO) {
+ vfree(op->downcall.trailer_buf);
+ od->error = r;
+ return r;
+ }
+ }
- /*
- * Calculate "aligned" length of this string and its
- * associated __u32 descriptor.
- */
- aligned_len = ((sizeof_u32 + len + 1) + 7) & ~7;
- gossip_debug(GOSSIP_DIR_DEBUG,
- "%s: aligned_len:%d:\n",
- __func__,
- aligned_len);
+ if (r < 0) {
+ vfree(op->downcall.trailer_buf);
+ od->error = r;
+ return r;
+ } else if (op->downcall.status) {
+ vfree(op->downcall.trailer_buf);
+ od->error = op->downcall.status;
+ return op->downcall.status;
+ }
- /*
- * The end of the blob should coincide with the end
- * of the last sub-blob.
- */
- if (size < aligned_len + khandle_size) {
- gossip_err("%s: ran off the end of the blob.\n",
- __func__);
- ret = -EINVAL;
- goto free;
- }
- size -= aligned_len + khandle_size;
+ /*
+ * The maximum size is size per entry times the 512 entries plus
+ * the header. This is well under the limit.
+ */
+ if (op->downcall.trailer_size > PART_SIZE) {
+ vfree(op->downcall.trailer_buf);
+ od->error = -EIO;
+ return -EIO;
+ }
- buf += aligned_len;
+ resp = (struct orangefs_readdir_response_s *)
+ op->downcall.trailer_buf;
+ od->token = resp->token;
+ return 0;
+}
- readdir->dirent_array[i].khandle =
- *(struct orangefs_khandle *) buf;
- buf += khandle_size;
+static int parse_readdir(struct orangefs_dir *od,
+ struct orangefs_kernel_op_s *op)
+{
+ struct orangefs_dir_part *part, *new;
+ size_t count;
+
+ count = 1;
+ part = od->part;
+ while (part) {
+ count++;
+ if (part->next)
+ part = part->next;
+ else
+ break;
}
- ret = buf - ptr;
- gossip_debug(GOSSIP_DIR_DEBUG, "%s: returning:%ld:\n", __func__, ret);
- goto out;
-free:
- kfree(readdir->dirent_array);
- readdir->dirent_array = NULL;
+ new = (void *)op->downcall.trailer_buf;
+ new->next = NULL;
+ new->len = op->downcall.trailer_size -
+ sizeof(struct orangefs_readdir_response_s);
+ if (!od->part)
+ od->part = new;
+ else
+ part->next = new;
+ count++;
+ od->end = count << PART_SHIFT;
-out:
- return ret;
+ return 0;
}
-/*
- * Read directory entries from an instance of an open directory.
- */
-static int orangefs_readdir(struct file *file, struct dir_context *ctx)
+static int orangefs_dir_more(struct orangefs_inode_s *oi,
+ struct orangefs_dir *od, struct dentry *dentry)
{
- int ret = 0;
- int buffer_index;
- /*
- * ptoken supports Orangefs' distributed directory logic, added
- * in 2.9.2.
- */
- __u64 *ptoken = file->private_data;
- __u64 pos = 0;
- ino_t ino = 0;
- struct dentry *dentry = file->f_path.dentry;
- struct orangefs_kernel_op_s *new_op = NULL;
- struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(dentry->d_inode);
- struct orangefs_readdir_response_s readdir_response;
- void *dents_buf;
- int i = 0;
- int len = 0;
- ino_t current_ino = 0;
- char *current_entry = NULL;
- long bytes_decoded;
-
- gossip_debug(GOSSIP_DIR_DEBUG,
- "%s: ctx->pos:%lld, ptoken = %llu\n",
- __func__,
- lld(ctx->pos),
- llu(*ptoken));
-
- pos = (__u64) ctx->pos;
-
- /* are we done? */
- if (pos == ORANGEFS_READDIR_END) {
- gossip_debug(GOSSIP_DIR_DEBUG,
- "Skipping to termination path\n");
- return 0;
+ struct orangefs_kernel_op_s *op;
+ int r;
+
+ op = op_alloc(ORANGEFS_VFS_OP_READDIR);
+ if (!op) {
+ od->error = -ENOMEM;
+ return -ENOMEM;
+ }
+ r = do_readdir(oi, od, dentry, op);
+ if (r) {
+ od->error = r;
+ goto out;
+ }
+ r = parse_readdir(od, op);
+ if (r) {
+ od->error = r;
+ goto out;
}
- gossip_debug(GOSSIP_DIR_DEBUG,
- "orangefs_readdir called on %pd (pos=%llu)\n",
- dentry, llu(pos));
+ od->error = 0;
+out:
+ op_release(op);
+ return od->error;
+}
- memset(&readdir_response, 0, sizeof(readdir_response));
+static int fill_from_part(struct orangefs_dir_part *part,
+ struct dir_context *ctx)
+{
+ const int offset = sizeof(struct orangefs_readdir_response_s);
+ struct orangefs_khandle *khandle;
+ __u32 *len, padlen;
+ loff_t i;
+ char *s;
+ i = ctx->pos & ~PART_MASK;
- new_op = op_alloc(ORANGEFS_VFS_OP_READDIR);
- if (!new_op)
- return -ENOMEM;
+ /* The file offset from userspace is too large. */
+ if (i > part->len)
+ return 1;
/*
- * Only the indices are shared. No memory is actually shared, but the
- * mechanism is used.
+ * If the seek pointer is positioned just before an entry it
+ * should find the next entry.
*/
- new_op->uses_shared_memory = 1;
- new_op->upcall.req.readdir.refn = orangefs_inode->refn;
- new_op->upcall.req.readdir.max_dirent_count =
- ORANGEFS_MAX_DIRENT_COUNT_READDIR;
-
- gossip_debug(GOSSIP_DIR_DEBUG,
- "%s: upcall.req.readdir.refn.khandle: %pU\n",
- __func__,
- &new_op->upcall.req.readdir.refn.khandle);
+ if (i % 8)
+ i = i + (8 - i%8)%8;
- new_op->upcall.req.readdir.token = *ptoken;
-
-get_new_buffer_index:
- buffer_index = orangefs_readdir_index_get();
- if (buffer_index < 0) {
- ret = buffer_index;
- gossip_lerr("orangefs_readdir: orangefs_readdir_index_get() failure (%d)\n",
- ret);
- goto out_free_op;
+ while (i < part->len) {
+ if (part->len < i + sizeof *len)
+ break;
+ len = (void *)part + offset + i;
+ /*
+ * len is the size of the string itself. padlen is the
+ * total size of the encoded string.
+ */
+ padlen = (sizeof *len + *len + 1) +
+ (8 - (sizeof *len + *len + 1)%8)%8;
+ if (part->len < i + padlen + sizeof *khandle)
+ goto next;
+ s = (void *)part + offset + i + sizeof *len;
+ if (s[*len] != 0)
+ goto next;
+ khandle = (void *)part + offset + i + padlen;
+ if (!dir_emit(ctx, s, *len,
+ orangefs_khandle_to_ino(khandle),
+ DT_UNKNOWN))
+ return 0;
+ i += padlen + sizeof *khandle;
+ i = i + (8 - i%8)%8;
+ BUG_ON(i > part->len);
+ ctx->pos = (ctx->pos & PART_MASK) | i;
+ continue;
+next:
+ i += 8;
}
- new_op->upcall.req.readdir.buf_index = buffer_index;
-
- ret = service_operation(new_op,
- "orangefs_readdir",
- get_interruptible_flag(dentry->d_inode));
+ return 1;
+}
- gossip_debug(GOSSIP_DIR_DEBUG,
- "Readdir downcall status is %d. ret:%d\n",
- new_op->downcall.status,
- ret);
+static int orangefs_dir_fill(struct orangefs_inode_s *oi,
+ struct orangefs_dir *od, struct dentry *dentry,
+ struct dir_context *ctx)
+{
+ struct orangefs_dir_part *part;
+ size_t count;
- orangefs_readdir_index_put(buffer_index);
+ count = ((ctx->pos & PART_MASK) >> PART_SHIFT) - 1;
- if (ret == -EAGAIN && op_state_purged(new_op)) {
- /* Client-core indices are invalid after it restarted. */
- gossip_debug(GOSSIP_DIR_DEBUG,
- "%s: Getting new buffer_index for retry of readdir..\n",
- __func__);
- goto get_new_buffer_index;
+ part = od->part;
+ while (part->next && count) {
+ count--;
+ part = part->next;
}
-
- if (ret == -EIO && op_state_purged(new_op)) {
- gossip_err("%s: Client is down. Aborting readdir call.\n",
- __func__);
- goto out_free_op;
+ /* This means the userspace file offset is invalid. */
+ if (count) {
+ od->error = -EIO;
+ return -EIO;
}
- if (ret < 0 || new_op->downcall.status != 0) {
- gossip_debug(GOSSIP_DIR_DEBUG,
- "Readdir request failed. Status:%d\n",
- new_op->downcall.status);
- if (ret >= 0)
- ret = new_op->downcall.status;
- goto out_free_op;
+ while (part && part->len) {
+ int r;
+ r = fill_from_part(part, ctx);
+ if (r < 0) {
+ od->error = r;
+ return r;
+ } else if (r == 0) {
+ /* Userspace buffer is full. */
+ break;
+ } else {
+ /*
+ * The part ran out of data. Move to the next
+ * part. */
+ ctx->pos = (ctx->pos & PART_MASK) +
+ (1 << PART_SHIFT);
+ part = part->next;
+ }
}
+ return 0;
+}
- dents_buf = new_op->downcall.trailer_buf;
- if (dents_buf == NULL) {
- gossip_err("Invalid NULL buffer in readdir response\n");
- ret = -ENOMEM;
- goto out_free_op;
+static loff_t orangefs_dir_llseek(struct file *file, loff_t offset,
+ int whence)
+{
+ struct orangefs_dir *od = file->private_data;
+ /*
+ * Delete the stored data so userspace sees new directory
+ * entries.
+ */
+ if (!whence && offset < od->end) {
+ struct orangefs_dir_part *part = od->part;
+ while (part) {
+ struct orangefs_dir_part *next = part->next;
+ vfree(part);
+ part = next;
+ }
+ od->token = ORANGEFS_ITERATE_START;
+ od->part = NULL;
+ od->end = 1 << PART_SHIFT;
}
+ return default_llseek(file, offset, whence);
+}
- bytes_decoded = decode_dirents(dents_buf, new_op->downcall.trailer_size,
- &readdir_response);
- if (bytes_decoded < 0) {
- ret = bytes_decoded;
- gossip_err("Could not decode readdir from buffer %d\n", ret);
- goto out_vfree;
- }
+static int orangefs_dir_iterate(struct file *file,
+ struct dir_context *ctx)
+{
+ struct orangefs_inode_s *oi;
+ struct orangefs_dir *od;
+ struct dentry *dentry;
+ int r;
- if (bytes_decoded != new_op->downcall.trailer_size) {
- gossip_err("orangefs_readdir: # bytes decoded (%ld) "
- "!= trailer size (%ld)\n",
- bytes_decoded,
- (long)new_op->downcall.trailer_size);
- ret = -EINVAL;
- goto out_destroy_handle;
- }
+ dentry = file->f_path.dentry;
+ oi = ORANGEFS_I(dentry->d_inode);
+ od = file->private_data;
- /*
- * orangefs doesn't actually store dot and dot-dot, but
- * we need to have them represented.
- */
- if (pos == 0) {
- ino = get_ino_from_khandle(dentry->d_inode);
- gossip_debug(GOSSIP_DIR_DEBUG,
- "%s: calling dir_emit of \".\" with pos = %llu\n",
- __func__,
- llu(pos));
- ret = dir_emit(ctx, ".", 1, ino, DT_DIR);
- pos += 1;
- }
+ if (od->error)
+ return od->error;
- if (pos == 1) {
- ino = get_parent_ino_from_dentry(dentry);
- gossip_debug(GOSSIP_DIR_DEBUG,
- "%s: calling dir_emit of \"..\" with pos = %llu\n",
- __func__,
- llu(pos));
- ret = dir_emit(ctx, "..", 2, ino, DT_DIR);
- pos += 1;
+ if (ctx->pos == 0) {
+ if (!dir_emit_dot(file, ctx))
+ return 0;
+ ctx->pos++;
+ }
+ if (ctx->pos == 1) {
+ if (!dir_emit_dotdot(file, ctx))
+ return 0;
+ ctx->pos = 1 << PART_SHIFT;
}
/*
- * we stored ORANGEFS_ITERATE_NEXT in ctx->pos last time around
- * to prevent "finding" dot and dot-dot on any iteration
- * other than the first.
+ * The seek position is in the first synthesized part but is not
+ * valid.
*/
- if (ctx->pos == ORANGEFS_ITERATE_NEXT)
- ctx->pos = 0;
-
- gossip_debug(GOSSIP_DIR_DEBUG,
- "%s: dirent_outcount:%d:\n",
- __func__,
- readdir_response.orangefs_dirent_outcount);
- for (i = ctx->pos;
- i < readdir_response.orangefs_dirent_outcount;
- i++) {
- len = readdir_response.dirent_array[i].d_length;
- current_entry = readdir_response.dirent_array[i].d_name;
- current_ino = orangefs_khandle_to_ino(
- &readdir_response.dirent_array[i].khandle);
-
- gossip_debug(GOSSIP_DIR_DEBUG,
- "calling dir_emit for %s with len %d"
- ", ctx->pos %ld\n",
- current_entry,
- len,
- (unsigned long)ctx->pos);
- /*
- * type is unknown. We don't return object type
- * in the dirent_array. This leaves getdents
- * clueless about type.
- */
- ret =
- dir_emit(ctx, current_entry, len, current_ino, DT_UNKNOWN);
- if (!ret)
- break;
- ctx->pos++;
- gossip_debug(GOSSIP_DIR_DEBUG,
- "%s: ctx->pos:%lld\n",
- __func__,
- lld(ctx->pos));
+ if ((ctx->pos & PART_MASK) == 0)
+ return -EIO;
- }
+ r = 0;
/*
- * we ran all the way through the last batch, set up for
- * getting another batch...
+ * Must read more if the user has sought past what has been read
+ * so far. Stop a user who has sought past the end.
*/
- if (ret) {
- *ptoken = readdir_response.token;
- ctx->pos = ORANGEFS_ITERATE_NEXT;
+ while (od->token != ORANGEFS_ITERATE_END &&
+ ctx->pos > od->end) {
+ r = orangefs_dir_more(oi, od, dentry);
+ if (r)
+ return r;
+ }
+ if (od->token == ORANGEFS_ITERATE_END && ctx->pos > od->end)
+ return -EIO;
+
+ /* Then try to fill if there's any left in the buffer. */
+ if (ctx->pos < od->end) {
+ r = orangefs_dir_fill(oi, od, dentry, ctx);
+ if (r)
+ return r;
}
- /*
- * Did we hit the end of the directory?
- */
- if (readdir_response.token == ORANGEFS_READDIR_END) {
- gossip_debug(GOSSIP_DIR_DEBUG,
- "End of dir detected; setting ctx->pos to ORANGEFS_READDIR_END.\n");
- ctx->pos = ORANGEFS_READDIR_END;
+ /* Finally get some more and try to fill. */
+ if (od->token != ORANGEFS_ITERATE_END) {
+ r = orangefs_dir_more(oi, od, dentry);
+ if (r)
+ return r;
+ r = orangefs_dir_fill(oi, od, dentry, ctx);
}
-out_destroy_handle:
- /* kfree(NULL) is safe */
- kfree(readdir_response.dirent_array);
-out_vfree:
- gossip_debug(GOSSIP_DIR_DEBUG, "vfree %p\n", dents_buf);
- vfree(dents_buf);
-out_free_op:
- op_release(new_op);
- gossip_debug(GOSSIP_DIR_DEBUG, "orangefs_readdir returning %d\n", ret);
- return ret;
+ return r;
}
static int orangefs_dir_open(struct inode *inode, struct file *file)
{
- __u64 *ptoken;
-
- file->private_data = kmalloc(sizeof(__u64), GFP_KERNEL);
+ struct orangefs_dir *od;
+ file->private_data = kmalloc(sizeof(struct orangefs_dir),
+ GFP_KERNEL);
if (!file->private_data)
return -ENOMEM;
-
- ptoken = file->private_data;
- *ptoken = ORANGEFS_READDIR_START;
+ od = file->private_data;
+ od->token = ORANGEFS_ITERATE_START;
+ od->part = NULL;
+ od->end = 1 << PART_SHIFT;
+ od->error = 0;
return 0;
}
static int orangefs_dir_release(struct inode *inode, struct file *file)
{
+ struct orangefs_dir *od = file->private_data;
+ struct orangefs_dir_part *part = od->part;
orangefs_flush_inode(inode);
- kfree(file->private_data);
+ while (part) {
+ struct orangefs_dir_part *next = part->next;
+ vfree(part);
+ part = next;
+ }
+ kfree(od);
return 0;
}
-/** ORANGEFS implementation of VFS directory operations */
const struct file_operations orangefs_dir_operations = {
+ .llseek = orangefs_dir_llseek,
.read = generic_read_dir,
- .iterate = orangefs_readdir,
+ .iterate = orangefs_dir_iterate,
.open = orangefs_dir_open,
- .release = orangefs_dir_release,
+ .release = orangefs_dir_release
};
diff --git a/fs/orangefs/downcall.h b/fs/orangefs/downcall.h
index 3b8923f8bf21..163001c95501 100644
--- a/fs/orangefs/downcall.h
+++ b/fs/orangefs/downcall.h
@@ -40,16 +40,6 @@ struct orangefs_mkdir_response {
struct orangefs_object_kref refn;
};
-/*
- * duplication of some system interface structures so that I don't have
- * to allocate extra memory
- */
-struct orangefs_dirent {
- char *d_name;
- int d_length;
- struct orangefs_khandle khandle;
-};
-
struct orangefs_statfs_response {
__s64 block_size;
__s64 blocks_total;
@@ -131,12 +121,16 @@ struct orangefs_downcall_s {
} resp;
};
+/*
+ * The readdir response comes in the trailer. It is followed by the
+ * directory entries as described in dir.c.
+ */
+
struct orangefs_readdir_response_s {
__u64 token;
__u64 directory_version;
__u32 __pad2;
__u32 orangefs_dirent_outcount;
- struct orangefs_dirent *dirent_array;
};
#endif /* __DOWNCALL_H */
diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c
index e6bbc8083d77..28f38d813ad2 100644
--- a/fs/orangefs/file.c
+++ b/fs/orangefs/file.c
@@ -114,7 +114,6 @@ static ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inod
struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
struct orangefs_khandle *handle = &orangefs_inode->refn.khandle;
struct orangefs_kernel_op_s *new_op = NULL;
- struct iov_iter saved = *iter;
int buffer_index = -1;
ssize_t ret;
@@ -193,7 +192,7 @@ populate_shared_memory:
orangefs_bufmap_put(buffer_index);
buffer_index = -1;
if (type == ORANGEFS_IO_WRITE)
- *iter = saved;
+ iov_iter_revert(iter, total_size);
gossip_debug(GOSSIP_FILE_DEBUG,
"%s:going to repopulate_shared_memory.\n",
__func__);
@@ -475,7 +474,8 @@ static ssize_t orangefs_file_write_iter(struct kiocb *iocb, struct iov_iter *ite
/* Make sure generic_write_checks sees an up to date inode size. */
if (file->f_flags & O_APPEND) {
- rc = orangefs_inode_getattr(file->f_mapping->host, 0, 1);
+ rc = orangefs_inode_getattr(file->f_mapping->host, 0, 1,
+ STATX_SIZE);
if (rc == -ESTALE)
rc = -EIO;
if (rc) {
@@ -693,7 +693,8 @@ static loff_t orangefs_file_llseek(struct file *file, loff_t offset, int origin)
* NOTE: We are only interested in file size here,
* so we set mask accordingly.
*/
- ret = orangefs_inode_getattr(file->f_mapping->host, 0, 1);
+ ret = orangefs_inode_getattr(file->f_mapping->host, 0, 1,
+ STATX_SIZE);
if (ret == -ESTALE)
ret = -EIO;
if (ret) {
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index a304bf34b212..9428ea0aac16 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -161,7 +161,7 @@ static int orangefs_setattr_size(struct inode *inode, struct iattr *iattr)
iattr->ia_size);
/* Ensure that we have a up to date size, so we know if it changed. */
- ret = orangefs_inode_getattr(inode, 0, 1);
+ ret = orangefs_inode_getattr(inode, 0, 1, STATX_SIZE);
if (ret == -ESTALE)
ret = -EIO;
if (ret) {
@@ -218,8 +218,7 @@ int orangefs_setattr(struct dentry *dentry, struct iattr *iattr)
if (ret)
goto out;
- if ((iattr->ia_valid & ATTR_SIZE) &&
- iattr->ia_size != i_size_read(inode)) {
+ if (iattr->ia_valid & ATTR_SIZE) {
ret = orangefs_setattr_size(inode, iattr);
if (ret)
goto out;
@@ -256,13 +255,19 @@ int orangefs_getattr(const struct path *path, struct kstat *stat,
"orangefs_getattr: called on %pd\n",
path->dentry);
- ret = orangefs_inode_getattr(inode, 0, 0);
+ ret = orangefs_inode_getattr(inode, 0, 0, request_mask);
if (ret == 0) {
generic_fillattr(inode, stat);
/* override block size reported to stat */
orangefs_inode = ORANGEFS_I(inode);
stat->blksize = orangefs_inode->blksize;
+
+ if (request_mask & STATX_SIZE)
+ stat->result_mask = STATX_BASIC_STATS;
+ else
+ stat->result_mask = STATX_BASIC_STATS &
+ ~STATX_SIZE;
}
return ret;
}
@@ -277,7 +282,7 @@ int orangefs_permission(struct inode *inode, int mask)
gossip_debug(GOSSIP_INODE_DEBUG, "%s: refreshing\n", __func__);
/* Make sure the permission (and other common attrs) are up to date. */
- ret = orangefs_inode_getattr(inode, 0, 0);
+ ret = orangefs_inode_getattr(inode, 0, 0, STATX_MODE);
if (ret < 0)
return ret;
@@ -375,7 +380,7 @@ struct inode *orangefs_iget(struct super_block *sb, struct orangefs_object_kref
if (!inode || !(inode->i_state & I_NEW))
return inode;
- error = orangefs_inode_getattr(inode, 1, 1);
+ error = orangefs_inode_getattr(inode, 1, 1, STATX_ALL);
if (error) {
iget_failed(inode);
return ERR_PTR(error);
@@ -420,7 +425,7 @@ struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir,
orangefs_set_inode(inode, ref);
inode->i_ino = hash; /* needed for stat etc */
- error = orangefs_inode_getattr(inode, 1, 1);
+ error = orangefs_inode_getattr(inode, 1, 1, STATX_ALL);
if (error)
goto out_iput;
diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c
index a290ff6ec756..478e88bd7f9d 100644
--- a/fs/orangefs/namei.c
+++ b/fs/orangefs/namei.c
@@ -74,6 +74,7 @@ static int orangefs_create(struct inode *dir,
unlock_new_inode(inode);
orangefs_set_timeout(dentry);
ORANGEFS_I(inode)->getattr_time = jiffies - 1;
+ ORANGEFS_I(inode)->getattr_mask = STATX_BASIC_STATS;
gossip_debug(GOSSIP_NAME_DEBUG,
"%s: dentry instantiated for %pd\n",
@@ -193,8 +194,6 @@ static struct dentry *orangefs_lookup(struct inode *dir, struct dentry *dentry,
goto out;
}
- ORANGEFS_I(inode)->getattr_time = jiffies - 1;
-
gossip_debug(GOSSIP_NAME_DEBUG,
"%s:%s:%d "
"Found good inode [%lu] with count [%d]\n",
@@ -324,6 +323,7 @@ static int orangefs_symlink(struct inode *dir,
unlock_new_inode(inode);
orangefs_set_timeout(dentry);
ORANGEFS_I(inode)->getattr_time = jiffies - 1;
+ ORANGEFS_I(inode)->getattr_mask = STATX_BASIC_STATS;
gossip_debug(GOSSIP_NAME_DEBUG,
"Inode (Symlink) %pU -> %pd\n",
@@ -388,6 +388,7 @@ static int orangefs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
unlock_new_inode(inode);
orangefs_set_timeout(dentry);
ORANGEFS_I(inode)->getattr_time = jiffies - 1;
+ ORANGEFS_I(inode)->getattr_mask = STATX_BASIC_STATS;
gossip_debug(GOSSIP_NAME_DEBUG,
"Inode (Directory) %pU -> %pd\n",
diff --git a/fs/orangefs/orangefs-bufmap.c b/fs/orangefs/orangefs-bufmap.c
index 6333cbbdfef7..83b506020718 100644
--- a/fs/orangefs/orangefs-bufmap.c
+++ b/fs/orangefs/orangefs-bufmap.c
@@ -521,13 +521,11 @@ int orangefs_bufmap_copy_from_iovec(struct iov_iter *iter,
size_t n = size;
if (n > PAGE_SIZE)
n = PAGE_SIZE;
- n = copy_page_from_iter(page, 0, n, iter);
- if (!n)
+ if (copy_page_from_iter(page, 0, n, iter) != n)
return -EFAULT;
size -= n;
}
return 0;
-
}
/*
diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c
index 791912da97d7..716ed337f166 100644
--- a/fs/orangefs/orangefs-debugfs.c
+++ b/fs/orangefs/orangefs-debugfs.c
@@ -440,6 +440,9 @@ static ssize_t orangefs_debug_write(struct file *file,
"orangefs_debug_write: %pD\n",
file);
+ if (count == 0)
+ return 0;
+
/*
* Thwart users who try to jamb a ridiculous number
* of bytes into the debug file...
diff --git a/fs/orangefs/orangefs-dev-proto.h b/fs/orangefs/orangefs-dev-proto.h
index f380f9ed1b28..efe08c763e56 100644
--- a/fs/orangefs/orangefs-dev-proto.h
+++ b/fs/orangefs/orangefs-dev-proto.h
@@ -52,12 +52,7 @@
*/
#define ORANGEFS_MAX_DEBUG_STRING_LEN 0x00000800
-/*
- * The maximum number of directory entries in a single request is 96.
- * XXX: Why can this not be higher. The client-side code can handle up to 512.
- * XXX: What happens if we expect more than the client can return?
- */
-#define ORANGEFS_MAX_DIRENT_COUNT_READDIR 96
+#define ORANGEFS_MAX_DIRENT_COUNT_READDIR 512
#include "upcall.h"
#include "downcall.h"
diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h
index 8afac46fcc87..ea0ce507a6ab 100644
--- a/fs/orangefs/orangefs-kernel.h
+++ b/fs/orangefs/orangefs-kernel.h
@@ -215,6 +215,7 @@ struct orangefs_inode_s {
unsigned long pinode_flags;
unsigned long getattr_time;
+ u32 getattr_mask;
};
#define P_ATIME_FLAG 0
@@ -340,11 +341,6 @@ static inline struct orangefs_khandle *get_khandle_from_ino(struct inode *inode)
return &(ORANGEFS_I(inode)->refn.khandle);
}
-static inline __s32 get_fsid_from_ino(struct inode *inode)
-{
- return ORANGEFS_I(inode)->refn.fs_id;
-}
-
static inline ino_t get_ino_from_khandle(struct inode *inode)
{
struct orangefs_khandle *khandle;
@@ -500,7 +496,8 @@ int orangefs_inode_setxattr(struct inode *inode,
size_t size,
int flags);
-int orangefs_inode_getattr(struct inode *inode, int new, int bypass);
+int orangefs_inode_getattr(struct inode *inode, int new, int bypass,
+ u32 request_mask);
int orangefs_inode_check_changed(struct inode *inode);
diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c
index 9b96b99539d6..aab6f1842963 100644
--- a/fs/orangefs/orangefs-utils.c
+++ b/fs/orangefs/orangefs-utils.c
@@ -251,7 +251,8 @@ static int orangefs_inode_is_stale(struct inode *inode, int new,
return 0;
}
-int orangefs_inode_getattr(struct inode *inode, int new, int bypass)
+int orangefs_inode_getattr(struct inode *inode, int new, int bypass,
+ u32 request_mask)
{
struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
struct orangefs_kernel_op_s *new_op;
@@ -262,7 +263,13 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass)
get_khandle_from_ino(inode));
if (!new && !bypass) {
- if (time_before(jiffies, orangefs_inode->getattr_time))
+ /*
+ * Must have all the attributes in the mask and be within cache
+ * time.
+ */
+ if ((request_mask & orangefs_inode->getattr_mask) ==
+ request_mask &&
+ time_before(jiffies, orangefs_inode->getattr_time))
return 0;
}
@@ -270,7 +277,15 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass)
if (!new_op)
return -ENOMEM;
new_op->upcall.req.getattr.refn = orangefs_inode->refn;
- new_op->upcall.req.getattr.mask = ORANGEFS_ATTR_SYS_ALL_NOHINT;
+ /*
+ * Size is the hardest attribute to get. The incremental cost of any
+ * other attribute is essentially zero.
+ */
+ if (request_mask & STATX_SIZE || new)
+ new_op->upcall.req.getattr.mask = ORANGEFS_ATTR_SYS_ALL_NOHINT;
+ else
+ new_op->upcall.req.getattr.mask =
+ ORANGEFS_ATTR_SYS_ALL_NOHINT & ~ORANGEFS_ATTR_SYS_SIZE;
ret = service_operation(new_op, __func__,
get_interruptible_flag(inode));
@@ -291,25 +306,29 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass)
case S_IFREG:
inode->i_flags = orangefs_inode_flags(&new_op->
downcall.resp.getattr.attributes);
- inode_size = (loff_t)new_op->
- downcall.resp.getattr.attributes.size;
- rounded_up_size =
- (inode_size + (4096 - (inode_size % 4096)));
- inode->i_size = inode_size;
- orangefs_inode->blksize =
- new_op->downcall.resp.getattr.attributes.blksize;
- spin_lock(&inode->i_lock);
- inode->i_bytes = inode_size;
- inode->i_blocks =
- (unsigned long)(rounded_up_size / 512);
- spin_unlock(&inode->i_lock);
+ if (request_mask & STATX_SIZE || new) {
+ inode_size = (loff_t)new_op->
+ downcall.resp.getattr.attributes.size;
+ rounded_up_size =
+ (inode_size + (4096 - (inode_size % 4096)));
+ inode->i_size = inode_size;
+ orangefs_inode->blksize =
+ new_op->downcall.resp.getattr.attributes.blksize;
+ spin_lock(&inode->i_lock);
+ inode->i_bytes = inode_size;
+ inode->i_blocks =
+ (unsigned long)(rounded_up_size / 512);
+ spin_unlock(&inode->i_lock);
+ }
break;
case S_IFDIR:
- inode->i_size = PAGE_SIZE;
- orangefs_inode->blksize = i_blocksize(inode);
- spin_lock(&inode->i_lock);
- inode_set_bytes(inode, inode->i_size);
- spin_unlock(&inode->i_lock);
+ if (request_mask & STATX_SIZE || new) {
+ inode->i_size = PAGE_SIZE;
+ orangefs_inode->blksize = i_blocksize(inode);
+ spin_lock(&inode->i_lock);
+ inode_set_bytes(inode, inode->i_size);
+ spin_unlock(&inode->i_lock);
+ }
set_nlink(inode, 1);
break;
case S_IFLNK:
@@ -349,6 +368,10 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass)
orangefs_inode->getattr_time = jiffies +
orangefs_getattr_timeout_msecs*HZ/1000;
+ if (request_mask & STATX_SIZE || new)
+ orangefs_inode->getattr_mask = STATX_BASIC_STATS;
+ else
+ orangefs_inode->getattr_mask = STATX_BASIC_STATS & ~STATX_SIZE;
ret = 0;
out:
op_release(new_op);
@@ -500,41 +523,6 @@ int orangefs_flush_inode(struct inode *inode)
return ret;
}
-int orangefs_unmount_sb(struct super_block *sb)
-{
- int ret = -EINVAL;
- struct orangefs_kernel_op_s *new_op = NULL;
-
- gossip_debug(GOSSIP_UTILS_DEBUG,
- "orangefs_unmount_sb called on sb %p\n",
- sb);
-
- new_op = op_alloc(ORANGEFS_VFS_OP_FS_UMOUNT);
- if (!new_op)
- return -ENOMEM;
- new_op->upcall.req.fs_umount.id = ORANGEFS_SB(sb)->id;
- new_op->upcall.req.fs_umount.fs_id = ORANGEFS_SB(sb)->fs_id;
- strncpy(new_op->upcall.req.fs_umount.orangefs_config_server,
- ORANGEFS_SB(sb)->devname,
- ORANGEFS_MAX_SERVER_ADDR_LEN);
-
- gossip_debug(GOSSIP_UTILS_DEBUG,
- "Attempting ORANGEFS Unmount via host %s\n",
- new_op->upcall.req.fs_umount.orangefs_config_server);
-
- ret = service_operation(new_op, "orangefs_fs_umount", 0);
-
- gossip_debug(GOSSIP_UTILS_DEBUG,
- "orangefs_unmount: got return value of %d\n", ret);
- if (ret)
- sb = ERR_PTR(ret);
- else
- ORANGEFS_SB(sb)->mount_pending = 1;
-
- op_release(new_op);
- return ret;
-}
-
void orangefs_make_bad_inode(struct inode *inode)
{
if (is_root_handle(inode)) {
diff --git a/fs/orangefs/protocol.h b/fs/orangefs/protocol.h
index 971307ad69be..48bcc1bbe415 100644
--- a/fs/orangefs/protocol.h
+++ b/fs/orangefs/protocol.h
@@ -138,13 +138,8 @@ typedef __s64 ORANGEFS_offset;
#define ORANGEFS_G_SGID (1 << 10)
#define ORANGEFS_U_SUID (1 << 11)
-/* definition taken from stdint.h */
-#define INT32_MAX (2147483647)
-#define ORANGEFS_ITERATE_START (INT32_MAX - 1)
-#define ORANGEFS_ITERATE_END (INT32_MAX - 2)
-#define ORANGEFS_ITERATE_NEXT (INT32_MAX - 3)
-#define ORANGEFS_READDIR_START ORANGEFS_ITERATE_START
-#define ORANGEFS_READDIR_END ORANGEFS_ITERATE_END
+#define ORANGEFS_ITERATE_START 2147483646
+#define ORANGEFS_ITERATE_END 2147483645
#define ORANGEFS_IMMUTABLE_FL FS_IMMUTABLE_FL
#define ORANGEFS_APPEND_FL FS_APPEND_FL
#define ORANGEFS_NOATIME_FL FS_NOATIME_FL
diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c
index 629d8c917fa6..5c7c273e17ec 100644
--- a/fs/orangefs/super.c
+++ b/fs/orangefs/super.c
@@ -376,6 +376,25 @@ static const struct export_operations orangefs_export_ops = {
.fh_to_dentry = orangefs_fh_to_dentry,
};
+static int orangefs_unmount(int id, __s32 fs_id, const char *devname)
+{
+ struct orangefs_kernel_op_s *op;
+ int r;
+ op = op_alloc(ORANGEFS_VFS_OP_FS_UMOUNT);
+ if (!op)
+ return -ENOMEM;
+ op->upcall.req.fs_umount.id = id;
+ op->upcall.req.fs_umount.fs_id = fs_id;
+ strncpy(op->upcall.req.fs_umount.orangefs_config_server,
+ devname, ORANGEFS_MAX_SERVER_ADDR_LEN);
+ r = service_operation(op, "orangefs_fs_umount", 0);
+ /* Not much to do about an error here. */
+ if (r)
+ gossip_err("orangefs_unmount: service_operation %d\n", r);
+ op_release(op);
+ return r;
+}
+
static int orangefs_fill_sb(struct super_block *sb,
struct orangefs_fs_mount_response *fs_mount,
void *data, int silent)
@@ -484,6 +503,8 @@ struct dentry *orangefs_mount(struct file_system_type *fst,
if (IS_ERR(sb)) {
d = ERR_CAST(sb);
+ orangefs_unmount(new_op->downcall.resp.fs_mount.id,
+ new_op->downcall.resp.fs_mount.fs_id, devname);
goto free_op;
}
@@ -539,6 +560,7 @@ struct dentry *orangefs_mount(struct file_system_type *fst,
free_sb_and_op:
/* Will call orangefs_kill_sb with sb not in list. */
ORANGEFS_SB(sb)->no_list = 1;
+ /* ORANGEFS_VFS_OP_FS_UMOUNT is done by orangefs_kill_sb. */
deactivate_locked_super(sb);
free_op:
gossip_err("orangefs_mount: mount request failed with %d\n", ret);
@@ -554,6 +576,7 @@ free_op:
void orangefs_kill_sb(struct super_block *sb)
{
+ int r;
gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_kill_sb: called\n");
/* provided sb cleanup */
@@ -563,7 +586,10 @@ void orangefs_kill_sb(struct super_block *sb)
* issue the unmount to userspace to tell it to remove the
* dynamic mount info it has for this superblock
*/
- orangefs_unmount_sb(sb);
+ r = orangefs_unmount(ORANGEFS_SB(sb)->id, ORANGEFS_SB(sb)->fs_id,
+ ORANGEFS_SB(sb)->devname);
+ if (!r)
+ ORANGEFS_SB(sb)->mount_pending = 1;
if (!ORANGEFS_SB(sb)->no_list) {
/* remove the sb from our list of orangefs specific sb's */
diff --git a/fs/orangefs/waitqueue.c b/fs/orangefs/waitqueue.c
index abcfa3fa9992..61e2ca7fec55 100644
--- a/fs/orangefs/waitqueue.c
+++ b/fs/orangefs/waitqueue.c
@@ -124,7 +124,14 @@ retry_servicing:
gossip_debug(GOSSIP_WAIT_DEBUG,
"%s:client core is NOT in service.\n",
__func__);
- timeout = op_timeout_secs * HZ;
+ /*
+ * Don't wait for the userspace component to return if
+ * the filesystem is being umounted anyway.
+ */
+ if (op->upcall.type == ORANGEFS_VFS_OP_FS_UMOUNT)
+ timeout = 0;
+ else
+ timeout = op_timeout_secs * HZ;
}
spin_unlock(&orangefs_request_list_lock);
diff --git a/fs/orangefs/xattr.c b/fs/orangefs/xattr.c
index 74a81b1daaac..237c9c04dc3b 100644
--- a/fs/orangefs/xattr.c
+++ b/fs/orangefs/xattr.c
@@ -76,11 +76,8 @@ ssize_t orangefs_inode_getxattr(struct inode *inode, const char *name,
if (S_ISLNK(inode->i_mode))
return -EOPNOTSUPP;
- if (strlen(name) >= ORANGEFS_MAX_XATTR_NAMELEN) {
- gossip_err("Invalid key length (%d)\n",
- (int)strlen(name));
+ if (strlen(name) > ORANGEFS_MAX_XATTR_NAMELEN)
return -EINVAL;
- }
fsuid = from_kuid(&init_user_ns, current_fsuid());
fsgid = from_kgid(&init_user_ns, current_fsgid());
@@ -172,6 +169,9 @@ static int orangefs_inode_removexattr(struct inode *inode, const char *name,
struct orangefs_kernel_op_s *new_op = NULL;
int ret = -ENOMEM;
+ if (strlen(name) > ORANGEFS_MAX_XATTR_NAMELEN)
+ return -EINVAL;
+
down_write(&orangefs_inode->xattr_sem);
new_op = op_alloc(ORANGEFS_VFS_OP_REMOVEXATTR);
if (!new_op)
@@ -231,23 +231,13 @@ int orangefs_inode_setxattr(struct inode *inode, const char *name,
"%s: name %s, buffer_size %zd\n",
__func__, name, size);
- if (size >= ORANGEFS_MAX_XATTR_VALUELEN ||
- flags < 0) {
- gossip_err("orangefs_inode_setxattr: bogus values of size(%d), flags(%d)\n",
- (int)size,
- flags);
+ if (size > ORANGEFS_MAX_XATTR_VALUELEN)
+ return -EINVAL;
+ if (strlen(name) > ORANGEFS_MAX_XATTR_NAMELEN)
return -EINVAL;
- }
internal_flag = convert_to_internal_xattr_flags(flags);
- if (strlen(name) >= ORANGEFS_MAX_XATTR_NAMELEN) {
- gossip_err
- ("orangefs_inode_setxattr: bogus key size (%d)\n",
- (int)(strlen(name)));
- return -EINVAL;
- }
-
/* This is equivalent to a removexattr */
if (size == 0 && value == NULL) {
gossip_debug(GOSSIP_XATTR_DEBUG,
@@ -358,7 +348,7 @@ try_again:
returned_count = new_op->downcall.resp.listxattr.returned_count;
if (returned_count < 0 ||
- returned_count >= ORANGEFS_MAX_XATTR_LISTLEN) {
+ returned_count > ORANGEFS_MAX_XATTR_LISTLEN) {
gossip_err("%s: impossible value for returned_count:%d:\n",
__func__,
returned_count);
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 906ea6c93260..9008ab9fbd2e 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -20,6 +20,7 @@
#include <linux/namei.h>
#include <linux/fdtable.h>
#include <linux/ratelimit.h>
+#include <linux/exportfs.h>
#include "overlayfs.h"
#include "ovl_entry.h"
@@ -232,6 +233,79 @@ int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat)
return err;
}
+static struct ovl_fh *ovl_encode_fh(struct dentry *lower, uuid_be *uuid)
+{
+ struct ovl_fh *fh;
+ int fh_type, fh_len, dwords;
+ void *buf;
+ int buflen = MAX_HANDLE_SZ;
+
+ buf = kmalloc(buflen, GFP_TEMPORARY);
+ if (!buf)
+ return ERR_PTR(-ENOMEM);
+
+ /*
+ * We encode a non-connectable file handle for non-dir, because we
+ * only need to find the lower inode number and we don't want to pay
+ * the price or reconnecting the dentry.
+ */
+ dwords = buflen >> 2;
+ fh_type = exportfs_encode_fh(lower, buf, &dwords, 0);
+ buflen = (dwords << 2);
+
+ fh = ERR_PTR(-EIO);
+ if (WARN_ON(fh_type < 0) ||
+ WARN_ON(buflen > MAX_HANDLE_SZ) ||
+ WARN_ON(fh_type == FILEID_INVALID))
+ goto out;
+
+ BUILD_BUG_ON(MAX_HANDLE_SZ + offsetof(struct ovl_fh, fid) > 255);
+ fh_len = offsetof(struct ovl_fh, fid) + buflen;
+ fh = kmalloc(fh_len, GFP_KERNEL);
+ if (!fh) {
+ fh = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+
+ fh->version = OVL_FH_VERSION;
+ fh->magic = OVL_FH_MAGIC;
+ fh->type = fh_type;
+ fh->flags = OVL_FH_FLAG_CPU_ENDIAN;
+ fh->len = fh_len;
+ fh->uuid = *uuid;
+ memcpy(fh->fid, buf, buflen);
+
+out:
+ kfree(buf);
+ return fh;
+}
+
+static int ovl_set_origin(struct dentry *dentry, struct dentry *lower,
+ struct dentry *upper)
+{
+ struct super_block *sb = lower->d_sb;
+ uuid_be *uuid = (uuid_be *) &sb->s_uuid;
+ const struct ovl_fh *fh = NULL;
+ int err;
+
+ /*
+ * When lower layer doesn't support export operations store a 'null' fh,
+ * so we can use the overlay.origin xattr to distignuish between a copy
+ * up and a pure upper inode.
+ */
+ if (sb->s_export_op && sb->s_export_op->fh_to_dentry &&
+ uuid_be_cmp(*uuid, NULL_UUID_BE)) {
+ fh = ovl_encode_fh(lower, uuid);
+ if (IS_ERR(fh))
+ return PTR_ERR(fh);
+ }
+
+ err = ovl_do_setxattr(upper, OVL_XATTR_ORIGIN, fh, fh ? fh->len : 0, 0);
+ kfree(fh);
+
+ return err;
+}
+
static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
struct dentry *dentry, struct path *lowerpath,
struct kstat *stat, const char *link,
@@ -316,6 +390,14 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
if (err)
goto out_cleanup;
+ /*
+ * Store identifier of lower inode in upper inode xattr to
+ * allow lookup of the copy up origin inode.
+ */
+ err = ovl_set_origin(dentry, lowerpath->dentry, temp);
+ if (err)
+ goto out_cleanup;
+
if (tmpfile)
err = ovl_do_link(temp, udir, upper, true);
else
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 6515796460df..723b98b90698 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -138,36 +138,6 @@ static int ovl_set_opaque(struct dentry *dentry, struct dentry *upperdentry)
return err;
}
-static int ovl_dir_getattr(const struct path *path, struct kstat *stat,
- u32 request_mask, unsigned int flags)
-{
- struct dentry *dentry = path->dentry;
- int err;
- enum ovl_path_type type;
- struct path realpath;
- const struct cred *old_cred;
-
- type = ovl_path_real(dentry, &realpath);
- old_cred = ovl_override_creds(dentry->d_sb);
- err = vfs_getattr(&realpath, stat, request_mask, flags);
- revert_creds(old_cred);
- if (err)
- return err;
-
- stat->dev = dentry->d_sb->s_dev;
- stat->ino = dentry->d_inode->i_ino;
-
- /*
- * It's probably not worth it to count subdirs to get the
- * correct link count. nlink=1 seems to pacify 'find' and
- * other utilities.
- */
- if (OVL_TYPE_MERGE(type))
- stat->nlink = 1;
-
- return 0;
-}
-
/* Common operations required to be done after creation of file on upper */
static void ovl_instantiate(struct dentry *dentry, struct inode *inode,
struct dentry *newdentry, bool hardlink)
@@ -182,6 +152,9 @@ static void ovl_instantiate(struct dentry *dentry, struct inode *inode,
inc_nlink(inode);
}
d_instantiate(dentry, inode);
+ /* Force lookup of new upper hardlink to find its lower */
+ if (hardlink)
+ d_drop(dentry);
}
static bool ovl_type_merge(struct dentry *dentry)
@@ -210,7 +183,7 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
if (err)
goto out_dput;
- if (ovl_type_merge(dentry->d_parent)) {
+ if (ovl_type_merge(dentry->d_parent) && d_is_dir(newdentry)) {
/* Setting opaque here is just an optimization, allow to fail */
ovl_set_opaque(dentry, newdentry);
}
@@ -1070,7 +1043,7 @@ const struct inode_operations ovl_dir_inode_operations = {
.create = ovl_create,
.mknod = ovl_mknod,
.permission = ovl_permission,
- .getattr = ovl_dir_getattr,
+ .getattr = ovl_getattr,
.listxattr = ovl_listxattr,
.get_acl = ovl_get_acl,
.update_time = ovl_update_time,
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index f8fe6bf2036d..ad9547f82da5 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -57,18 +57,78 @@ out:
return err;
}
-static int ovl_getattr(const struct path *path, struct kstat *stat,
- u32 request_mask, unsigned int flags)
+int ovl_getattr(const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned int flags)
{
struct dentry *dentry = path->dentry;
+ enum ovl_path_type type;
struct path realpath;
const struct cred *old_cred;
+ bool is_dir = S_ISDIR(dentry->d_inode->i_mode);
int err;
- ovl_path_real(dentry, &realpath);
+ type = ovl_path_real(dentry, &realpath);
old_cred = ovl_override_creds(dentry->d_sb);
err = vfs_getattr(&realpath, stat, request_mask, flags);
+ if (err)
+ goto out;
+
+ /*
+ * When all layers are on the same fs, all real inode number are
+ * unique, so we use the overlay st_dev, which is friendly to du -x.
+ *
+ * We also use st_ino of the copy up origin, if we know it.
+ * This guaranties constant st_dev/st_ino across copy up.
+ *
+ * If filesystem supports NFS export ops, this also guaranties
+ * persistent st_ino across mount cycle.
+ */
+ if (ovl_same_sb(dentry->d_sb)) {
+ if (OVL_TYPE_ORIGIN(type)) {
+ struct kstat lowerstat;
+ u32 lowermask = STATX_INO | (!is_dir ? STATX_NLINK : 0);
+
+ ovl_path_lower(dentry, &realpath);
+ err = vfs_getattr(&realpath, &lowerstat,
+ lowermask, flags);
+ if (err)
+ goto out;
+
+ WARN_ON_ONCE(stat->dev != lowerstat.dev);
+ /*
+ * Lower hardlinks are broken on copy up to different
+ * upper files, so we cannot use the lower origin st_ino
+ * for those different files, even for the same fs case.
+ */
+ if (is_dir || lowerstat.nlink == 1)
+ stat->ino = lowerstat.ino;
+ }
+ stat->dev = dentry->d_sb->s_dev;
+ } else if (is_dir) {
+ /*
+ * If not all layers are on the same fs the pair {real st_ino;
+ * overlay st_dev} is not unique, so use the non persistent
+ * overlay st_ino.
+ *
+ * Always use the overlay st_dev for directories, so 'find
+ * -xdev' will scan the entire overlay mount and won't cross the
+ * overlay mount boundaries.
+ */
+ stat->dev = dentry->d_sb->s_dev;
+ stat->ino = dentry->d_inode->i_ino;
+ }
+
+ /*
+ * It's probably not worth it to count subdirs to get the
+ * correct link count. nlink=1 seems to pacify 'find' and
+ * other utilities.
+ */
+ if (is_dir && OVL_TYPE_MERGE(type))
+ stat->nlink = 1;
+
+out:
revert_creds(old_cred);
+
return err;
}
@@ -303,6 +363,41 @@ static const struct inode_operations ovl_symlink_inode_operations = {
.update_time = ovl_update_time,
};
+/*
+ * It is possible to stack overlayfs instance on top of another
+ * overlayfs instance as lower layer. We need to annonate the
+ * stackable i_mutex locks according to stack level of the super
+ * block instance. An overlayfs instance can never be in stack
+ * depth 0 (there is always a real fs below it). An overlayfs
+ * inode lock will use the lockdep annotaion ovl_i_mutex_key[depth].
+ *
+ * For example, here is a snip from /proc/lockdep_chains after
+ * dir_iterate of nested overlayfs:
+ *
+ * [...] &ovl_i_mutex_dir_key[depth] (stack_depth=2)
+ * [...] &ovl_i_mutex_dir_key[depth]#2 (stack_depth=1)
+ * [...] &type->i_mutex_dir_key (stack_depth=0)
+ */
+#define OVL_MAX_NESTING FILESYSTEM_MAX_STACK_DEPTH
+
+static inline void ovl_lockdep_annotate_inode_mutex_key(struct inode *inode)
+{
+#ifdef CONFIG_LOCKDEP
+ static struct lock_class_key ovl_i_mutex_key[OVL_MAX_NESTING];
+ static struct lock_class_key ovl_i_mutex_dir_key[OVL_MAX_NESTING];
+
+ int depth = inode->i_sb->s_stack_depth - 1;
+
+ if (WARN_ON_ONCE(depth < 0 || depth >= OVL_MAX_NESTING))
+ depth = 0;
+
+ if (S_ISDIR(inode->i_mode))
+ lockdep_set_class(&inode->i_rwsem, &ovl_i_mutex_dir_key[depth]);
+ else
+ lockdep_set_class(&inode->i_rwsem, &ovl_i_mutex_key[depth]);
+#endif
+}
+
static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev)
{
inode->i_ino = get_next_ino();
@@ -312,6 +407,8 @@ static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev)
inode->i_acl = inode->i_default_acl = ACL_DONT_CACHE;
#endif
+ ovl_lockdep_annotate_inode_mutex_key(inode);
+
switch (mode & S_IFMT) {
case S_IFREG:
inode->i_op = &ovl_file_inode_operations;
diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c
index b8b077821fb0..bad0f665a635 100644
--- a/fs/overlayfs/namei.c
+++ b/fs/overlayfs/namei.c
@@ -12,6 +12,8 @@
#include <linux/namei.h>
#include <linux/xattr.h>
#include <linux/ratelimit.h>
+#include <linux/mount.h>
+#include <linux/exportfs.h>
#include "overlayfs.h"
#include "ovl_entry.h"
@@ -81,6 +83,90 @@ invalid:
goto err_free;
}
+static int ovl_acceptable(void *ctx, struct dentry *dentry)
+{
+ return 1;
+}
+
+static struct dentry *ovl_get_origin(struct dentry *dentry,
+ struct vfsmount *mnt)
+{
+ int res;
+ struct ovl_fh *fh = NULL;
+ struct dentry *origin = NULL;
+ int bytes;
+
+ res = vfs_getxattr(dentry, OVL_XATTR_ORIGIN, NULL, 0);
+ if (res < 0) {
+ if (res == -ENODATA || res == -EOPNOTSUPP)
+ return NULL;
+ goto fail;
+ }
+ /* Zero size value means "copied up but origin unknown" */
+ if (res == 0)
+ return NULL;
+
+ fh = kzalloc(res, GFP_TEMPORARY);
+ if (!fh)
+ return ERR_PTR(-ENOMEM);
+
+ res = vfs_getxattr(dentry, OVL_XATTR_ORIGIN, fh, res);
+ if (res < 0)
+ goto fail;
+
+ if (res < sizeof(struct ovl_fh) || res < fh->len)
+ goto invalid;
+
+ if (fh->magic != OVL_FH_MAGIC)
+ goto invalid;
+
+ /* Treat larger version and unknown flags as "origin unknown" */
+ if (fh->version > OVL_FH_VERSION || fh->flags & ~OVL_FH_FLAG_ALL)
+ goto out;
+
+ /* Treat endianness mismatch as "origin unknown" */
+ if (!(fh->flags & OVL_FH_FLAG_ANY_ENDIAN) &&
+ (fh->flags & OVL_FH_FLAG_BIG_ENDIAN) != OVL_FH_FLAG_CPU_ENDIAN)
+ goto out;
+
+ bytes = (fh->len - offsetof(struct ovl_fh, fid));
+
+ /*
+ * Make sure that the stored uuid matches the uuid of the lower
+ * layer where file handle will be decoded.
+ */
+ if (uuid_be_cmp(fh->uuid, *(uuid_be *) &mnt->mnt_sb->s_uuid))
+ goto out;
+
+ origin = exportfs_decode_fh(mnt, (struct fid *)fh->fid,
+ bytes >> 2, (int)fh->type,
+ ovl_acceptable, NULL);
+ if (IS_ERR(origin)) {
+ /* Treat stale file handle as "origin unknown" */
+ if (origin == ERR_PTR(-ESTALE))
+ origin = NULL;
+ goto out;
+ }
+
+ if (ovl_dentry_weird(origin) ||
+ ((d_inode(origin)->i_mode ^ d_inode(dentry)->i_mode) & S_IFMT)) {
+ dput(origin);
+ origin = NULL;
+ goto invalid;
+ }
+
+out:
+ kfree(fh);
+ return origin;
+
+fail:
+ pr_warn_ratelimited("overlayfs: failed to get origin (%i)\n", res);
+ goto out;
+invalid:
+ pr_warn_ratelimited("overlayfs: invalid origin (%*phN)\n", res, fh);
+ goto out;
+}
+
static bool ovl_is_opaquedir(struct dentry *dentry)
{
int res;
@@ -192,6 +278,45 @@ static int ovl_lookup_layer(struct dentry *base, struct ovl_lookup_data *d,
return 0;
}
+
+static int ovl_check_origin(struct dentry *dentry, struct dentry *upperdentry,
+ struct path **stackp, unsigned int *ctrp)
+{
+ struct super_block *same_sb = ovl_same_sb(dentry->d_sb);
+ struct ovl_entry *roe = dentry->d_sb->s_root->d_fsdata;
+ struct vfsmount *mnt;
+ struct dentry *origin;
+
+ if (!same_sb || !roe->numlower)
+ return 0;
+
+ /*
+ * Since all layers are on the same fs, we use the first layer for
+ * decoding the file handle. We may get a disconnected dentry,
+ * which is fine, because we only need to hold the origin inode in
+ * cache and use its inode number. We may even get a connected dentry,
+ * that is not under the first layer's root. That is also fine for
+ * using it's inode number - it's the same as if we held a reference
+ * to a dentry in first layer that was moved under us.
+ */
+ mnt = roe->lowerstack[0].mnt;
+
+ origin = ovl_get_origin(upperdentry, mnt);
+ if (IS_ERR_OR_NULL(origin))
+ return PTR_ERR(origin);
+
+ BUG_ON(*stackp || *ctrp);
+ *stackp = kmalloc(sizeof(struct path), GFP_TEMPORARY);
+ if (!*stackp) {
+ dput(origin);
+ return -ENOMEM;
+ }
+ **stackp = (struct path) { .dentry = origin, .mnt = mnt };
+ *ctrp = 1;
+
+ return 0;
+}
+
/*
* Returns next layer in stack starting from top.
* Returns -1 if this is the last layer.
@@ -220,6 +345,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
const struct cred *old_cred;
struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
struct ovl_entry *poe = dentry->d_parent->d_fsdata;
+ struct ovl_entry *roe = dentry->d_sb->s_root->d_fsdata;
struct path *stack = NULL;
struct dentry *upperdir, *upperdentry = NULL;
unsigned int ctr = 0;
@@ -253,13 +379,20 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
err = -EREMOTE;
goto out;
}
+ if (upperdentry && !d.is_dir) {
+ BUG_ON(!d.stop || d.redirect);
+ err = ovl_check_origin(dentry, upperdentry,
+ &stack, &ctr);
+ if (err)
+ goto out;
+ }
if (d.redirect) {
upperredirect = kstrdup(d.redirect, GFP_KERNEL);
if (!upperredirect)
goto out_put_upper;
if (d.redirect[0] == '/')
- poe = dentry->d_sb->s_root->d_fsdata;
+ poe = roe;
}
upperopaque = d.opaque;
}
@@ -290,10 +423,8 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
if (d.stop)
break;
- if (d.redirect &&
- d.redirect[0] == '/' &&
- poe != dentry->d_sb->s_root->d_fsdata) {
- poe = dentry->d_sb->s_root->d_fsdata;
+ if (d.redirect && d.redirect[0] == '/' && poe != roe) {
+ poe = roe;
/* Find the current layer on the root dentry */
for (i = 0; i < poe->numlower; i++)
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 741dc0b6931f..caa36cb9c46d 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -8,18 +8,56 @@
*/
#include <linux/kernel.h>
+#include <linux/uuid.h>
enum ovl_path_type {
__OVL_PATH_UPPER = (1 << 0),
__OVL_PATH_MERGE = (1 << 1),
+ __OVL_PATH_ORIGIN = (1 << 2),
};
#define OVL_TYPE_UPPER(type) ((type) & __OVL_PATH_UPPER)
#define OVL_TYPE_MERGE(type) ((type) & __OVL_PATH_MERGE)
+#define OVL_TYPE_ORIGIN(type) ((type) & __OVL_PATH_ORIGIN)
#define OVL_XATTR_PREFIX XATTR_TRUSTED_PREFIX "overlay."
#define OVL_XATTR_OPAQUE OVL_XATTR_PREFIX "opaque"
#define OVL_XATTR_REDIRECT OVL_XATTR_PREFIX "redirect"
+#define OVL_XATTR_ORIGIN OVL_XATTR_PREFIX "origin"
+
+/*
+ * The tuple (fh,uuid) is a universal unique identifier for a copy up origin,
+ * where:
+ * origin.fh - exported file handle of the lower file
+ * origin.uuid - uuid of the lower filesystem
+ */
+#define OVL_FH_VERSION 0
+#define OVL_FH_MAGIC 0xfb
+
+/* CPU byte order required for fid decoding: */
+#define OVL_FH_FLAG_BIG_ENDIAN (1 << 0)
+#define OVL_FH_FLAG_ANY_ENDIAN (1 << 1)
+
+#define OVL_FH_FLAG_ALL (OVL_FH_FLAG_BIG_ENDIAN | OVL_FH_FLAG_ANY_ENDIAN)
+
+#if defined(__LITTLE_ENDIAN)
+#define OVL_FH_FLAG_CPU_ENDIAN 0
+#elif defined(__BIG_ENDIAN)
+#define OVL_FH_FLAG_CPU_ENDIAN OVL_FH_FLAG_BIG_ENDIAN
+#else
+#error Endianness not defined
+#endif
+
+/* On-disk and in-memeory format for redirect by file handle */
+struct ovl_fh {
+ u8 version; /* 0 */
+ u8 magic; /* 0xfb */
+ u8 len; /* size of this header + size of fid */
+ u8 flags; /* OVL_FH_FLAG_* */
+ u8 type; /* fid_type of fid */
+ uuid_be uuid; /* uuid of filesystem */
+ u8 fid[0]; /* file identifier */
+} __packed;
#define OVL_ISUPPER_MASK 1UL
@@ -151,6 +189,7 @@ int ovl_want_write(struct dentry *dentry);
void ovl_drop_write(struct dentry *dentry);
struct dentry *ovl_workdir(struct dentry *dentry);
const struct cred *ovl_override_creds(struct super_block *sb);
+struct super_block *ovl_same_sb(struct super_block *sb);
struct ovl_entry *ovl_alloc_entry(unsigned int numlower);
bool ovl_dentry_remote(struct dentry *dentry);
bool ovl_dentry_weird(struct dentry *dentry);
@@ -197,6 +236,8 @@ void ovl_workdir_cleanup(struct inode *dir, struct vfsmount *mnt,
/* inode.c */
int ovl_setattr(struct dentry *dentry, struct iattr *attr);
+int ovl_getattr(const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned int flags);
int ovl_permission(struct inode *inode, int mask);
int ovl_xattr_set(struct dentry *dentry, const char *name, const void *value,
size_t size, int flags);
diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h
index 59614faa14c3..b2023ddb8532 100644
--- a/fs/overlayfs/ovl_entry.h
+++ b/fs/overlayfs/ovl_entry.h
@@ -29,6 +29,8 @@ struct ovl_fs {
const struct cred *creator_cred;
bool tmpfile;
wait_queue_head_t copyup_wq;
+ /* sb common to all layers */
+ struct super_block *same_sb;
};
/* private information held for every overlayfs dentry */
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index c9e70d39c1ea..9828b7de8999 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -49,11 +49,28 @@ static void ovl_dentry_release(struct dentry *dentry)
}
}
+static int ovl_check_append_only(struct inode *inode, int flag)
+{
+ /*
+ * This test was moot in vfs may_open() because overlay inode does
+ * not have the S_APPEND flag, so re-check on real upper inode
+ */
+ if (IS_APPEND(inode)) {
+ if ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND))
+ return -EPERM;
+ if (flag & O_TRUNC)
+ return -EPERM;
+ }
+
+ return 0;
+}
+
static struct dentry *ovl_d_real(struct dentry *dentry,
const struct inode *inode,
unsigned int open_flags)
{
struct dentry *real;
+ int err;
if (!d_is_reg(dentry)) {
if (!inode || inode == d_inode(dentry))
@@ -65,15 +82,20 @@ static struct dentry *ovl_d_real(struct dentry *dentry,
return dentry;
if (open_flags) {
- int err = ovl_open_maybe_copy_up(dentry, open_flags);
-
+ err = ovl_open_maybe_copy_up(dentry, open_flags);
if (err)
return ERR_PTR(err);
}
real = ovl_dentry_upper(dentry);
- if (real && (!inode || inode == d_inode(real)))
+ if (real && (!inode || inode == d_inode(real))) {
+ if (!inode) {
+ err = ovl_check_append_only(d_inode(real), open_flags);
+ if (err)
+ return ERR_PTR(err);
+ }
return real;
+ }
real = ovl_dentry_lower(dentry);
if (!real)
@@ -709,8 +731,8 @@ static const struct xattr_handler *ovl_xattr_handlers[] = {
static int ovl_fill_super(struct super_block *sb, void *data, int silent)
{
- struct path upperpath = { NULL, NULL };
- struct path workpath = { NULL, NULL };
+ struct path upperpath = { };
+ struct path workpath = { };
struct dentry *root_dentry;
struct inode *realinode;
struct ovl_entry *oe;
@@ -892,11 +914,19 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
ufs->lower_mnt[ufs->numlower] = mnt;
ufs->numlower++;
+
+ /* Check if all lower layers are on same sb */
+ if (i == 0)
+ ufs->same_sb = mnt->mnt_sb;
+ else if (ufs->same_sb != mnt->mnt_sb)
+ ufs->same_sb = NULL;
}
/* If the upper fs is nonexistent, we mark overlayfs r/o too */
if (!ufs->upper_mnt)
sb->s_flags |= MS_RDONLY;
+ else if (ufs->upper_mnt->mnt_sb != ufs->same_sb)
+ ufs->same_sb = NULL;
if (remote)
sb->s_d_op = &ovl_reval_dentry_operations;
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index 6e610a205e15..cfdea47313a1 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -40,6 +40,13 @@ const struct cred *ovl_override_creds(struct super_block *sb)
return override_creds(ofs->creator_cred);
}
+struct super_block *ovl_same_sb(struct super_block *sb)
+{
+ struct ovl_fs *ofs = sb->s_fs_info;
+
+ return ofs->same_sb;
+}
+
struct ovl_entry *ovl_alloc_entry(unsigned int numlower)
{
size_t size = offsetof(struct ovl_entry, lowerstack[numlower]);
@@ -75,11 +82,13 @@ enum ovl_path_type ovl_path_type(struct dentry *dentry)
type = __OVL_PATH_UPPER;
/*
- * Non-dir dentry can hold lower dentry from previous
- * location.
+ * Non-dir dentry can hold lower dentry of its copy up origin.
*/
- if (oe->numlower && d_is_dir(dentry))
- type |= __OVL_PATH_MERGE;
+ if (oe->numlower) {
+ type |= __OVL_PATH_ORIGIN;
+ if (d_is_dir(dentry))
+ type |= __OVL_PATH_MERGE;
+ }
} else {
if (oe->numlower > 1)
type |= __OVL_PATH_MERGE;
@@ -100,7 +109,7 @@ void ovl_path_lower(struct dentry *dentry, struct path *path)
{
struct ovl_entry *oe = dentry->d_fsdata;
- *path = oe->numlower ? oe->lowerstack[0] : (struct path) { NULL, NULL };
+ *path = oe->numlower ? oe->lowerstack[0] : (struct path) { };
}
enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path)
diff --git a/fs/proc/base.c b/fs/proc/base.c
index c87b6b9a8a76..45f6bf68fff3 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -821,10 +821,7 @@ static ssize_t mem_rw(struct file *file, char __user *buf,
if (!mmget_not_zero(mm))
goto free;
- /* Maybe we should limit FOLL_FORCE to actual ptrace users? */
- flags = FOLL_FORCE;
- if (write)
- flags |= FOLL_WRITE;
+ flags = write ? FOLL_WRITE : 0;
while (count > 0) {
int this_len = min_t(int, count, PAGE_SIZE);
@@ -2834,6 +2831,15 @@ static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns,
return err;
}
+#ifdef CONFIG_LIVEPATCH
+static int proc_pid_patch_state(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
+{
+ seq_printf(m, "%d\n", task->patch_state);
+ return 0;
+}
+#endif /* CONFIG_LIVEPATCH */
+
/*
* Thread groups
*/
@@ -2933,6 +2939,9 @@ static const struct pid_entry tgid_base_stuff[] = {
REG("timers", S_IRUGO, proc_timers_operations),
#endif
REG("timerslack_ns", S_IRUGO|S_IWUGO, proc_pid_set_timerslack_ns_operations),
+#ifdef CONFIG_LIVEPATCH
+ ONE("patch_state", S_IRUSR, proc_pid_patch_state),
+#endif
};
static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
@@ -3315,6 +3324,9 @@ static const struct pid_entry tid_base_stuff[] = {
REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations),
REG("setgroups", S_IRUGO|S_IWUSR, proc_setgroups_operations),
#endif
+#ifdef CONFIG_LIVEPATCH
+ ONE("patch_state", S_IRUSR, proc_pid_patch_state),
+#endif
};
static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index ee27feb34cf4..9425c0d97262 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -472,6 +472,7 @@ struct proc_dir_entry *proc_create_mount_point(const char *name)
ent->data = NULL;
ent->proc_fops = NULL;
ent->proc_iops = NULL;
+ parent->nlink++;
if (proc_register(parent, ent) < 0) {
kfree(ent);
parent->nlink--;
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 2cc7a8030275..e250910cffc8 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -58,7 +58,7 @@ static struct inode *proc_alloc_inode(struct super_block *sb)
struct proc_inode *ei;
struct inode *inode;
- ei = (struct proc_inode *)kmem_cache_alloc(proc_inode_cachep, GFP_KERNEL);
+ ei = kmem_cache_alloc(proc_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
ei->pid = NULL;
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index 766f0c637ad1..3803b24ca220 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -23,6 +23,7 @@ static const struct proc_ns_operations *ns_entries[] = {
#endif
#ifdef CONFIG_PID_NS
&pidns_operations,
+ &pidns_for_children_operations,
#endif
#ifdef CONFIG_USER_NS
&userns_operations,
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index d04ea4349909..67985a7233c2 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -408,10 +408,6 @@ static void next_entry(struct ctl_table_header **phead, struct ctl_table **pentr
*pentry = entry;
}
-void register_sysctl_root(struct ctl_table_root *root)
-{
-}
-
/*
* sysctl_perm does NOT grant the superuser all rights automatically, because
* some sysctl variables are readonly even to root.
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 312578089544..f0c8b33d99b1 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -441,6 +441,7 @@ struct mem_size_stats {
unsigned long private_dirty;
unsigned long referenced;
unsigned long anonymous;
+ unsigned long lazyfree;
unsigned long anonymous_thp;
unsigned long shmem_thp;
unsigned long swap;
@@ -457,8 +458,11 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
int i, nr = compound ? 1 << compound_order(page) : 1;
unsigned long size = nr * PAGE_SIZE;
- if (PageAnon(page))
+ if (PageAnon(page)) {
mss->anonymous += size;
+ if (!PageSwapBacked(page) && !dirty && !PageDirty(page))
+ mss->lazyfree += size;
+ }
mss->resident += size;
/* Accumulate the size in pages that have been accessed. */
@@ -771,6 +775,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
"Private_Dirty: %8lu kB\n"
"Referenced: %8lu kB\n"
"Anonymous: %8lu kB\n"
+ "LazyFree: %8lu kB\n"
"AnonHugePages: %8lu kB\n"
"ShmemPmdMapped: %8lu kB\n"
"Shared_Hugetlb: %8lu kB\n"
@@ -789,6 +794,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
mss.private_dirty >> 10,
mss.referenced >> 10,
mss.anonymous >> 10,
+ mss.lazyfree >> 10,
mss.anonymous_thp >> 10,
mss.shmem_thp >> 10,
mss.shared_hugetlb >> 10,
diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c
index 899d0ba0bd6c..06aab07b6bb7 100644
--- a/fs/pstore/ftrace.c
+++ b/fs/pstore/ftrace.c
@@ -37,6 +37,12 @@ static void notrace pstore_ftrace_call(unsigned long ip,
{
unsigned long flags;
struct pstore_ftrace_record rec = {};
+ struct pstore_record record = {
+ .type = PSTORE_TYPE_FTRACE,
+ .buf = (char *)&rec,
+ .size = sizeof(rec),
+ .psi = psinfo,
+ };
if (unlikely(oops_in_progress))
return;
@@ -47,8 +53,7 @@ static void notrace pstore_ftrace_call(unsigned long ip,
rec.parent_ip = parent_ip;
pstore_ftrace_write_timestamp(&rec, pstore_ftrace_stamp++);
pstore_ftrace_encode_cpu(&rec, raw_smp_processor_id());
- psinfo->write_buf(PSTORE_TYPE_FTRACE, 0, NULL, 0, (void *)&rec,
- 0, sizeof(rec), psinfo);
+ psinfo->write(&record);
local_irq_restore(flags);
}
@@ -117,7 +122,7 @@ void pstore_register_ftrace(void)
{
struct dentry *file;
- if (!psinfo->write_buf)
+ if (!psinfo->write)
return;
pstore_ftrace_dir = debugfs_create_dir("pstore", NULL);
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 57c0646479f5..792a4e5f9226 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -47,12 +47,8 @@ static LIST_HEAD(allpstore);
struct pstore_private {
struct list_head list;
- struct pstore_info *psi;
- enum pstore_type_id type;
- u64 id;
- int count;
- ssize_t size;
- char data[];
+ struct pstore_record *record;
+ size_t total_size;
};
struct pstore_ftrace_seq_data {
@@ -63,6 +59,17 @@ struct pstore_ftrace_seq_data {
#define REC_SIZE sizeof(struct pstore_ftrace_record)
+static void free_pstore_private(struct pstore_private *private)
+{
+ if (!private)
+ return;
+ if (private->record) {
+ kfree(private->record->buf);
+ kfree(private->record);
+ }
+ kfree(private);
+}
+
static void *pstore_ftrace_seq_start(struct seq_file *s, loff_t *pos)
{
struct pstore_private *ps = s->private;
@@ -72,9 +79,9 @@ static void *pstore_ftrace_seq_start(struct seq_file *s, loff_t *pos)
if (!data)
return NULL;
- data->off = ps->size % REC_SIZE;
+ data->off = ps->total_size % REC_SIZE;
data->off += *pos * REC_SIZE;
- if (data->off + REC_SIZE > ps->size) {
+ if (data->off + REC_SIZE > ps->total_size) {
kfree(data);
return NULL;
}
@@ -94,7 +101,7 @@ static void *pstore_ftrace_seq_next(struct seq_file *s, void *v, loff_t *pos)
struct pstore_ftrace_seq_data *data = v;
data->off += REC_SIZE;
- if (data->off + REC_SIZE > ps->size)
+ if (data->off + REC_SIZE > ps->total_size)
return NULL;
(*pos)++;
@@ -105,7 +112,9 @@ static int pstore_ftrace_seq_show(struct seq_file *s, void *v)
{
struct pstore_private *ps = s->private;
struct pstore_ftrace_seq_data *data = v;
- struct pstore_ftrace_record *rec = (void *)(ps->data + data->off);
+ struct pstore_ftrace_record *rec;
+
+ rec = (struct pstore_ftrace_record *)(ps->record->buf + data->off);
seq_printf(s, "CPU:%d ts:%llu %08lx %08lx %pf <- %pF\n",
pstore_ftrace_decode_cpu(rec),
@@ -125,7 +134,7 @@ static const struct seq_operations pstore_ftrace_seq_ops = {
static int pstore_check_syslog_permissions(struct pstore_private *ps)
{
- switch (ps->type) {
+ switch (ps->record->type) {
case PSTORE_TYPE_DMESG:
case PSTORE_TYPE_CONSOLE:
return check_syslog_permissions(SYSLOG_ACTION_READ_ALL,
@@ -141,9 +150,10 @@ static ssize_t pstore_file_read(struct file *file, char __user *userbuf,
struct seq_file *sf = file->private_data;
struct pstore_private *ps = sf->private;
- if (ps->type == PSTORE_TYPE_FTRACE)
+ if (ps->record->type == PSTORE_TYPE_FTRACE)
return seq_read(file, userbuf, count, ppos);
- return simple_read_from_buffer(userbuf, count, ppos, ps->data, ps->size);
+ return simple_read_from_buffer(userbuf, count, ppos,
+ ps->record->buf, ps->total_size);
}
static int pstore_file_open(struct inode *inode, struct file *file)
@@ -157,7 +167,7 @@ static int pstore_file_open(struct inode *inode, struct file *file)
if (err)
return err;
- if (ps->type == PSTORE_TYPE_FTRACE)
+ if (ps->record->type == PSTORE_TYPE_FTRACE)
sops = &pstore_ftrace_seq_ops;
err = seq_open(file, sops);
@@ -193,20 +203,19 @@ static const struct file_operations pstore_file_operations = {
static int pstore_unlink(struct inode *dir, struct dentry *dentry)
{
struct pstore_private *p = d_inode(dentry)->i_private;
+ struct pstore_record *record = p->record;
int err;
err = pstore_check_syslog_permissions(p);
if (err)
return err;
- if (p->psi->erase) {
- mutex_lock(&p->psi->read_mutex);
- p->psi->erase(p->type, p->id, p->count,
- d_inode(dentry)->i_ctime, p->psi);
- mutex_unlock(&p->psi->read_mutex);
- } else {
+ if (!record->psi->erase)
return -EPERM;
- }
+
+ mutex_lock(&record->psi->read_mutex);
+ record->psi->erase(record);
+ mutex_unlock(&record->psi->read_mutex);
return simple_unlink(dir, dentry);
}
@@ -221,7 +230,7 @@ static void pstore_evict_inode(struct inode *inode)
spin_lock_irqsave(&allpstore_lock, flags);
list_del(&p->list);
spin_unlock_irqrestore(&allpstore_lock, flags);
- kfree(p);
+ free_pstore_private(p);
}
}
@@ -302,23 +311,23 @@ bool pstore_is_mounted(void)
* Load it up with "size" bytes of data from "buf".
* Set the mtime & ctime to the date that this record was originally stored.
*/
-int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count,
- char *data, bool compressed, size_t size,
- struct timespec time, struct pstore_info *psi)
+int pstore_mkfile(struct dentry *root, struct pstore_record *record)
{
- struct dentry *root = pstore_sb->s_root;
struct dentry *dentry;
struct inode *inode;
int rc = 0;
char name[PSTORE_NAMELEN];
struct pstore_private *private, *pos;
unsigned long flags;
+ size_t size = record->size + record->ecc_notice_size;
+
+ WARN_ON(!inode_is_locked(d_inode(root)));
spin_lock_irqsave(&allpstore_lock, flags);
list_for_each_entry(pos, &allpstore, list) {
- if (pos->type == type &&
- pos->id == id &&
- pos->psi == psi) {
+ if (pos->record->type == record->type &&
+ pos->record->id == record->id &&
+ pos->record->psi == record->psi) {
rc = -EEXIST;
break;
}
@@ -328,72 +337,74 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count,
return rc;
rc = -ENOMEM;
- inode = pstore_get_inode(pstore_sb);
+ inode = pstore_get_inode(root->d_sb);
if (!inode)
goto fail;
inode->i_mode = S_IFREG | 0444;
inode->i_fop = &pstore_file_operations;
- private = kmalloc(sizeof *private + size, GFP_KERNEL);
+ private = kzalloc(sizeof(*private), GFP_KERNEL);
if (!private)
goto fail_alloc;
- private->type = type;
- private->id = id;
- private->count = count;
- private->psi = psi;
+ private->record = record;
- switch (type) {
+ switch (record->type) {
case PSTORE_TYPE_DMESG:
scnprintf(name, sizeof(name), "dmesg-%s-%lld%s",
- psname, id, compressed ? ".enc.z" : "");
+ record->psi->name, record->id,
+ record->compressed ? ".enc.z" : "");
break;
case PSTORE_TYPE_CONSOLE:
- scnprintf(name, sizeof(name), "console-%s-%lld", psname, id);
+ scnprintf(name, sizeof(name), "console-%s-%lld",
+ record->psi->name, record->id);
break;
case PSTORE_TYPE_FTRACE:
- scnprintf(name, sizeof(name), "ftrace-%s-%lld", psname, id);
+ scnprintf(name, sizeof(name), "ftrace-%s-%lld",
+ record->psi->name, record->id);
break;
case PSTORE_TYPE_MCE:
- scnprintf(name, sizeof(name), "mce-%s-%lld", psname, id);
+ scnprintf(name, sizeof(name), "mce-%s-%lld",
+ record->psi->name, record->id);
break;
case PSTORE_TYPE_PPC_RTAS:
- scnprintf(name, sizeof(name), "rtas-%s-%lld", psname, id);
+ scnprintf(name, sizeof(name), "rtas-%s-%lld",
+ record->psi->name, record->id);
break;
case PSTORE_TYPE_PPC_OF:
scnprintf(name, sizeof(name), "powerpc-ofw-%s-%lld",
- psname, id);
+ record->psi->name, record->id);
break;
case PSTORE_TYPE_PPC_COMMON:
scnprintf(name, sizeof(name), "powerpc-common-%s-%lld",
- psname, id);
+ record->psi->name, record->id);
break;
case PSTORE_TYPE_PMSG:
- scnprintf(name, sizeof(name), "pmsg-%s-%lld", psname, id);
+ scnprintf(name, sizeof(name), "pmsg-%s-%lld",
+ record->psi->name, record->id);
break;
case PSTORE_TYPE_PPC_OPAL:
- sprintf(name, "powerpc-opal-%s-%lld", psname, id);
+ scnprintf(name, sizeof(name), "powerpc-opal-%s-%lld",
+ record->psi->name, record->id);
break;
case PSTORE_TYPE_UNKNOWN:
- scnprintf(name, sizeof(name), "unknown-%s-%lld", psname, id);
+ scnprintf(name, sizeof(name), "unknown-%s-%lld",
+ record->psi->name, record->id);
break;
default:
scnprintf(name, sizeof(name), "type%d-%s-%lld",
- type, psname, id);
+ record->type, record->psi->name, record->id);
break;
}
- inode_lock(d_inode(root));
-
dentry = d_alloc_name(root, name);
if (!dentry)
- goto fail_lockedalloc;
+ goto fail_private;
- memcpy(private->data, data, size);
- inode->i_size = private->size = size;
+ inode->i_size = private->total_size = size;
inode->i_private = private;
- if (time.tv_sec)
- inode->i_mtime = inode->i_ctime = time;
+ if (record->time.tv_sec)
+ inode->i_mtime = inode->i_ctime = record->time;
d_add(dentry, inode);
@@ -401,13 +412,10 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count,
list_add(&private->list, &allpstore);
spin_unlock_irqrestore(&allpstore_lock, flags);
- inode_unlock(d_inode(root));
-
return 0;
-fail_lockedalloc:
- inode_unlock(d_inode(root));
- kfree(private);
+fail_private:
+ free_pstore_private(private);
fail_alloc:
iput(inode);
@@ -415,6 +423,27 @@ fail:
return rc;
}
+/*
+ * Read all the records from the persistent store. Create
+ * files in our filesystem. Don't warn about -EEXIST errors
+ * when we are re-scanning the backing store looking to add new
+ * error records.
+ */
+void pstore_get_records(int quiet)
+{
+ struct pstore_info *psi = psinfo;
+ struct dentry *root;
+
+ if (!psi || !pstore_sb)
+ return;
+
+ root = pstore_sb->s_root;
+
+ inode_lock(d_inode(root));
+ pstore_get_backend_records(psi, root, quiet);
+ inode_unlock(d_inode(root));
+}
+
static int pstore_fill_super(struct super_block *sb, void *data, int silent)
{
struct inode *inode;
diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h
index da416e6591c9..c416e653dc4f 100644
--- a/fs/pstore/internal.h
+++ b/fs/pstore/internal.h
@@ -25,10 +25,10 @@ extern struct pstore_info *psinfo;
extern void pstore_set_kmsg_bytes(int);
extern void pstore_get_records(int);
-extern int pstore_mkfile(enum pstore_type_id, char *psname, u64 id,
- int count, char *data, bool compressed,
- size_t size, struct timespec time,
- struct pstore_info *psi);
+extern void pstore_get_backend_records(struct pstore_info *psi,
+ struct dentry *root, int quiet);
+extern int pstore_mkfile(struct dentry *root,
+ struct pstore_record *record);
extern bool pstore_is_mounted(void);
#endif
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index efab7b64925b..d468eec9b8a6 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -267,7 +267,7 @@ static void free_zlib(void)
big_oops_buf_sz = 0;
}
-static struct pstore_zbackend backend_zlib = {
+static const struct pstore_zbackend backend_zlib = {
.compress = compress_zlib,
.decompress = decompress_zlib,
.allocate = allocate_zlib,
@@ -328,7 +328,7 @@ static void free_lzo(void)
big_oops_buf_sz = 0;
}
-static struct pstore_zbackend backend_lzo = {
+static const struct pstore_zbackend backend_lzo = {
.compress = compress_lzo,
.decompress = decompress_lzo,
.allocate = allocate_lzo,
@@ -393,7 +393,7 @@ static void free_lz4(void)
big_oops_buf_sz = 0;
}
-static struct pstore_zbackend backend_lz4 = {
+static const struct pstore_zbackend backend_lz4 = {
.compress = compress_lz4,
.decompress = decompress_lz4,
.allocate = allocate_lz4,
@@ -402,7 +402,7 @@ static struct pstore_zbackend backend_lz4 = {
};
#endif
-static struct pstore_zbackend *zbackend =
+static const struct pstore_zbackend *zbackend =
#if defined(CONFIG_PSTORE_ZLIB_COMPRESS)
&backend_zlib;
#elif defined(CONFIG_PSTORE_LZO_COMPRESS)
@@ -484,7 +484,6 @@ static void pstore_dump(struct kmsg_dumper *dumper,
{
unsigned long total = 0;
const char *why;
- u64 id;
unsigned int part = 1;
unsigned long flags = 0;
int is_locked;
@@ -506,48 +505,59 @@ static void pstore_dump(struct kmsg_dumper *dumper,
oopscount++;
while (total < kmsg_bytes) {
char *dst;
- unsigned long size;
- int hsize;
+ size_t dst_size;
+ int header_size;
int zipped_len = -1;
- size_t len;
- bool compressed = false;
- size_t total_len;
+ size_t dump_size;
+ struct pstore_record record = {
+ .type = PSTORE_TYPE_DMESG,
+ .count = oopscount,
+ .reason = reason,
+ .part = part,
+ .compressed = false,
+ .buf = psinfo->buf,
+ .psi = psinfo,
+ };
if (big_oops_buf && is_locked) {
dst = big_oops_buf;
- size = big_oops_buf_sz;
+ dst_size = big_oops_buf_sz;
} else {
dst = psinfo->buf;
- size = psinfo->bufsize;
+ dst_size = psinfo->bufsize;
}
- hsize = sprintf(dst, "%s#%d Part%u\n", why, oopscount, part);
- size -= hsize;
+ /* Write dump header. */
+ header_size = snprintf(dst, dst_size, "%s#%d Part%u\n", why,
+ oopscount, part);
+ dst_size -= header_size;
- if (!kmsg_dump_get_buffer(dumper, true, dst + hsize,
- size, &len))
+ /* Write dump contents. */
+ if (!kmsg_dump_get_buffer(dumper, true, dst + header_size,
+ dst_size, &dump_size))
break;
if (big_oops_buf && is_locked) {
zipped_len = pstore_compress(dst, psinfo->buf,
- hsize + len, psinfo->bufsize);
+ header_size + dump_size,
+ psinfo->bufsize);
if (zipped_len > 0) {
- compressed = true;
- total_len = zipped_len;
+ record.compressed = true;
+ record.size = zipped_len;
} else {
- total_len = copy_kmsg_to_buffer(hsize, len);
+ record.size = copy_kmsg_to_buffer(header_size,
+ dump_size);
}
} else {
- total_len = hsize + len;
+ record.size = header_size + dump_size;
}
- ret = psinfo->write(PSTORE_TYPE_DMESG, reason, &id, part,
- oopscount, compressed, total_len, psinfo);
+ ret = psinfo->write(&record);
if (ret == 0 && reason == KMSG_DUMP_OOPS && pstore_is_mounted())
pstore_new_entry = 1;
- total += total_len;
+ total += record.size;
part++;
}
if (is_locked)
@@ -577,8 +587,11 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c)
const char *e = s + c;
while (s < e) {
+ struct pstore_record record = {
+ .type = PSTORE_TYPE_CONSOLE,
+ .psi = psinfo,
+ };
unsigned long flags;
- u64 id;
if (c > psinfo->bufsize)
c = psinfo->bufsize;
@@ -589,8 +602,9 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c)
} else {
spin_lock_irqsave(&psinfo->buf_lock, flags);
}
- psinfo->write_buf(PSTORE_TYPE_CONSOLE, 0, &id, 0,
- s, 0, c, psinfo);
+ record.buf = (char *)s;
+ record.size = c;
+ psinfo->write(&record);
spin_unlock_irqrestore(&psinfo->buf_lock, flags);
s += c;
c = e - s;
@@ -618,48 +632,30 @@ static void pstore_register_console(void) {}
static void pstore_unregister_console(void) {}
#endif
-static int pstore_write_compat(enum pstore_type_id type,
- enum kmsg_dump_reason reason,
- u64 *id, unsigned int part, int count,
- bool compressed, size_t size,
- struct pstore_info *psi)
-{
- return psi->write_buf(type, reason, id, part, psinfo->buf, compressed,
- size, psi);
-}
-
-static int pstore_write_buf_user_compat(enum pstore_type_id type,
- enum kmsg_dump_reason reason,
- u64 *id, unsigned int part,
- const char __user *buf,
- bool compressed, size_t size,
- struct pstore_info *psi)
-{
- unsigned long flags = 0;
- size_t i, bufsize = size;
- long ret = 0;
-
- if (unlikely(!access_ok(VERIFY_READ, buf, size)))
- return -EFAULT;
- if (bufsize > psinfo->bufsize)
- bufsize = psinfo->bufsize;
- spin_lock_irqsave(&psinfo->buf_lock, flags);
- for (i = 0; i < size; ) {
- size_t c = min(size - i, bufsize);
-
- ret = __copy_from_user(psinfo->buf, buf + i, c);
- if (unlikely(ret != 0)) {
- ret = -EFAULT;
- break;
- }
- ret = psi->write_buf(type, reason, id, part, psinfo->buf,
- compressed, c, psi);
- if (unlikely(ret < 0))
- break;
- i += c;
+static int pstore_write_user_compat(struct pstore_record *record,
+ const char __user *buf)
+{
+ int ret = 0;
+
+ if (record->buf)
+ return -EINVAL;
+
+ record->buf = kmalloc(record->size, GFP_KERNEL);
+ if (!record->buf)
+ return -ENOMEM;
+
+ if (unlikely(copy_from_user(record->buf, buf, record->size))) {
+ ret = -EFAULT;
+ goto out;
}
- spin_unlock_irqrestore(&psinfo->buf_lock, flags);
- return unlikely(ret < 0) ? ret : size;
+
+ ret = record->psi->write(record);
+
+out:
+ kfree(record->buf);
+ record->buf = NULL;
+
+ return unlikely(ret < 0) ? ret : record->size;
}
/*
@@ -673,19 +669,35 @@ int pstore_register(struct pstore_info *psi)
{
struct module *owner = psi->owner;
- if (backend && strcmp(backend, psi->name))
+ if (backend && strcmp(backend, psi->name)) {
+ pr_warn("ignoring unexpected backend '%s'\n", psi->name);
return -EPERM;
+ }
+
+ /* Sanity check flags. */
+ if (!psi->flags) {
+ pr_warn("backend '%s' must support at least one frontend\n",
+ psi->name);
+ return -EINVAL;
+ }
+
+ /* Check for required functions. */
+ if (!psi->read || !psi->write) {
+ pr_warn("backend '%s' must implement read() and write()\n",
+ psi->name);
+ return -EINVAL;
+ }
spin_lock(&pstore_lock);
if (psinfo) {
+ pr_warn("backend '%s' already loaded: ignoring '%s'\n",
+ psinfo->name, psi->name);
spin_unlock(&pstore_lock);
return -EBUSY;
}
- if (!psi->write)
- psi->write = pstore_write_compat;
- if (!psi->write_buf_user)
- psi->write_buf_user = pstore_write_buf_user_compat;
+ if (!psi->write_user)
+ psi->write_user = pstore_write_user_compat;
psinfo = psi;
mutex_init(&psinfo->read_mutex);
spin_unlock(&pstore_lock);
@@ -709,6 +721,7 @@ int pstore_register(struct pstore_info *psi)
if (psi->flags & PSTORE_FLAGS_PMSG)
pstore_register_pmsg();
+ /* Start watching for new records, if desired. */
if (pstore_update_ms >= 0) {
pstore_timer.expires = jiffies +
msecs_to_jiffies(pstore_update_ms);
@@ -721,16 +734,21 @@ int pstore_register(struct pstore_info *psi)
*/
backend = psi->name;
- module_put(owner);
-
pr_info("Registered %s as persistent store backend\n", psi->name);
+ module_put(owner);
+
return 0;
}
EXPORT_SYMBOL_GPL(pstore_register);
void pstore_unregister(struct pstore_info *psi)
{
+ /* Stop timer and make sure all work has finished. */
+ pstore_update_ms = -1;
+ del_timer_sync(&pstore_timer);
+ flush_work(&pstore_work);
+
if (psi->flags & PSTORE_FLAGS_PMSG)
pstore_unregister_pmsg();
if (psi->flags & PSTORE_FLAGS_FTRACE)
@@ -747,66 +765,99 @@ void pstore_unregister(struct pstore_info *psi)
}
EXPORT_SYMBOL_GPL(pstore_unregister);
+static void decompress_record(struct pstore_record *record)
+{
+ int unzipped_len;
+ char *decompressed;
+
+ /* Only PSTORE_TYPE_DMESG support compression. */
+ if (!record->compressed || record->type != PSTORE_TYPE_DMESG) {
+ pr_warn("ignored compressed record type %d\n", record->type);
+ return;
+ }
+
+ /* No compression method has created the common buffer. */
+ if (!big_oops_buf) {
+ pr_warn("no decompression buffer allocated\n");
+ return;
+ }
+
+ unzipped_len = pstore_decompress(record->buf, big_oops_buf,
+ record->size, big_oops_buf_sz);
+ if (unzipped_len <= 0) {
+ pr_err("decompression failed: %d\n", unzipped_len);
+ return;
+ }
+
+ /* Build new buffer for decompressed contents. */
+ decompressed = kmalloc(unzipped_len + record->ecc_notice_size,
+ GFP_KERNEL);
+ if (!decompressed) {
+ pr_err("decompression ran out of memory\n");
+ return;
+ }
+ memcpy(decompressed, big_oops_buf, unzipped_len);
+
+ /* Append ECC notice to decompressed buffer. */
+ memcpy(decompressed + unzipped_len, record->buf + record->size,
+ record->ecc_notice_size);
+
+ /* Swap out compresed contents with decompressed contents. */
+ kfree(record->buf);
+ record->buf = decompressed;
+ record->size = unzipped_len;
+ record->compressed = false;
+}
+
/*
- * Read all the records from the persistent store. Create
+ * Read all the records from one persistent store backend. Create
* files in our filesystem. Don't warn about -EEXIST errors
* when we are re-scanning the backing store looking to add new
* error records.
*/
-void pstore_get_records(int quiet)
-{
- struct pstore_info *psi = psinfo;
- char *buf = NULL;
- ssize_t size;
- u64 id;
- int count;
- enum pstore_type_id type;
- struct timespec time;
- int failed = 0, rc;
- bool compressed;
- int unzipped_len = -1;
- ssize_t ecc_notice_size = 0;
-
- if (!psi)
+void pstore_get_backend_records(struct pstore_info *psi,
+ struct dentry *root, int quiet)
+{
+ int failed = 0;
+
+ if (!psi || !root)
return;
mutex_lock(&psi->read_mutex);
if (psi->open && psi->open(psi))
goto out;
- while ((size = psi->read(&id, &type, &count, &time, &buf, &compressed,
- &ecc_notice_size, psi)) > 0) {
- if (compressed && (type == PSTORE_TYPE_DMESG)) {
- if (big_oops_buf)
- unzipped_len = pstore_decompress(buf,
- big_oops_buf, size,
- big_oops_buf_sz);
-
- if (unzipped_len > 0) {
- if (ecc_notice_size)
- memcpy(big_oops_buf + unzipped_len,
- buf + size, ecc_notice_size);
- kfree(buf);
- buf = big_oops_buf;
- size = unzipped_len;
- compressed = false;
- } else {
- pr_err("decompression failed;returned %d\n",
- unzipped_len);
- compressed = true;
- }
+ /*
+ * Backend callback read() allocates record.buf. decompress_record()
+ * may reallocate record.buf. On success, pstore_mkfile() will keep
+ * the record.buf, so free it only on failure.
+ */
+ for (;;) {
+ struct pstore_record *record;
+ int rc;
+
+ record = kzalloc(sizeof(*record), GFP_KERNEL);
+ if (!record) {
+ pr_err("out of memory creating record\n");
+ break;
+ }
+ record->psi = psi;
+
+ record->size = psi->read(record);
+
+ /* No more records left in backend? */
+ if (record->size <= 0)
+ break;
+
+ decompress_record(record);
+ rc = pstore_mkfile(root, record);
+ if (rc) {
+ /* pstore_mkfile() did not take record, so free it. */
+ kfree(record->buf);
+ kfree(record);
+ if (rc != -EEXIST || !quiet)
+ failed++;
}
- rc = pstore_mkfile(type, psi->name, id, count, buf,
- compressed, size + ecc_notice_size,
- time, psi);
- if (unzipped_len < 0) {
- /* Free buffer other than big oops */
- kfree(buf);
- buf = NULL;
- } else
- unzipped_len = -1;
- if (rc && (rc != -EEXIST || !quiet))
- failed++;
}
if (psi->close)
psi->close(psi);
@@ -830,7 +881,9 @@ static void pstore_timefunc(unsigned long dummy)
schedule_work(&pstore_work);
}
- mod_timer(&pstore_timer, jiffies + msecs_to_jiffies(pstore_update_ms));
+ if (pstore_update_ms >= 0)
+ mod_timer(&pstore_timer,
+ jiffies + msecs_to_jiffies(pstore_update_ms));
}
module_param(backend, charp, 0444);
diff --git a/fs/pstore/pmsg.c b/fs/pstore/pmsg.c
index 78f6176c020f..209755e0d7c8 100644
--- a/fs/pstore/pmsg.c
+++ b/fs/pstore/pmsg.c
@@ -15,7 +15,6 @@
#include <linux/device.h>
#include <linux/fs.h>
#include <linux/uaccess.h>
-#include <linux/vmalloc.h>
#include "internal.h"
static DEFINE_MUTEX(pmsg_lock);
@@ -23,19 +22,22 @@ static DEFINE_MUTEX(pmsg_lock);
static ssize_t write_pmsg(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
- u64 id;
+ struct pstore_record record = {
+ .type = PSTORE_TYPE_PMSG,
+ .size = count,
+ .psi = psinfo,
+ };
int ret;
if (!count)
return 0;
- /* check outside lock, page in any data. write_buf_user also checks */
+ /* check outside lock, page in any data. write_user also checks */
if (!access_ok(VERIFY_READ, buf, count))
return -EFAULT;
mutex_lock(&pmsg_lock);
- ret = psinfo->write_buf_user(PSTORE_TYPE_PMSG, 0, &id, 0, buf, 0, count,
- psinfo);
+ ret = psinfo->write_user(&record, buf);
mutex_unlock(&pmsg_lock);
return ret ? ret : count;
}
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 11f918d34b1e..5523df7f17ef 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -235,35 +235,34 @@ static ssize_t ftrace_log_combine(struct persistent_ram_zone *dest,
return 0;
}
-static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
- int *count, struct timespec *time,
- char **buf, bool *compressed,
- ssize_t *ecc_notice_size,
- struct pstore_info *psi)
+static ssize_t ramoops_pstore_read(struct pstore_record *record)
{
ssize_t size = 0;
- struct ramoops_context *cxt = psi->data;
+ struct ramoops_context *cxt = record->psi->data;
struct persistent_ram_zone *prz = NULL;
int header_length = 0;
bool free_prz = false;
- /* Ramoops headers provide time stamps for PSTORE_TYPE_DMESG, but
+ /*
+ * Ramoops headers provide time stamps for PSTORE_TYPE_DMESG, but
* PSTORE_TYPE_CONSOLE and PSTORE_TYPE_FTRACE don't currently have
* valid time stamps, so it is initialized to zero.
*/
- time->tv_sec = 0;
- time->tv_nsec = 0;
- *compressed = false;
+ record->time.tv_sec = 0;
+ record->time.tv_nsec = 0;
+ record->compressed = false;
/* Find the next valid persistent_ram_zone for DMESG */
while (cxt->dump_read_cnt < cxt->max_dump_cnt && !prz) {
prz = ramoops_get_next_prz(cxt->dprzs, &cxt->dump_read_cnt,
- cxt->max_dump_cnt, id, type,
+ cxt->max_dump_cnt, &record->id,
+ &record->type,
PSTORE_TYPE_DMESG, 1);
if (!prz_ok(prz))
continue;
header_length = ramoops_read_kmsg_hdr(persistent_ram_old(prz),
- time, compressed);
+ &record->time,
+ &record->compressed);
/* Clear and skip this DMESG record if it has no valid header */
if (!header_length) {
persistent_ram_free_old(prz);
@@ -274,18 +273,20 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
if (!prz_ok(prz))
prz = ramoops_get_next_prz(&cxt->cprz, &cxt->console_read_cnt,
- 1, id, type, PSTORE_TYPE_CONSOLE, 0);
+ 1, &record->id, &record->type,
+ PSTORE_TYPE_CONSOLE, 0);
if (!prz_ok(prz))
prz = ramoops_get_next_prz(&cxt->mprz, &cxt->pmsg_read_cnt,
- 1, id, type, PSTORE_TYPE_PMSG, 0);
+ 1, &record->id, &record->type,
+ PSTORE_TYPE_PMSG, 0);
/* ftrace is last since it may want to dynamically allocate memory. */
if (!prz_ok(prz)) {
if (!(cxt->flags & RAMOOPS_FLAG_FTRACE_PER_CPU)) {
prz = ramoops_get_next_prz(cxt->fprzs,
- &cxt->ftrace_read_cnt, 1, id, type,
- PSTORE_TYPE_FTRACE, 0);
+ &cxt->ftrace_read_cnt, 1, &record->id,
+ &record->type, PSTORE_TYPE_FTRACE, 0);
} else {
/*
* Build a new dummy record which combines all the
@@ -302,8 +303,10 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
while (cxt->ftrace_read_cnt < cxt->max_ftrace_cnt) {
prz_next = ramoops_get_next_prz(cxt->fprzs,
&cxt->ftrace_read_cnt,
- cxt->max_ftrace_cnt, id,
- type, PSTORE_TYPE_FTRACE, 0);
+ cxt->max_ftrace_cnt,
+ &record->id,
+ &record->type,
+ PSTORE_TYPE_FTRACE, 0);
if (!prz_ok(prz_next))
continue;
@@ -316,7 +319,7 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
if (size)
goto out;
}
- *id = 0;
+ record->id = 0;
prz = tmp_prz;
}
}
@@ -329,17 +332,19 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
size = persistent_ram_old_size(prz) - header_length;
/* ECC correction notice */
- *ecc_notice_size = persistent_ram_ecc_string(prz, NULL, 0);
+ record->ecc_notice_size = persistent_ram_ecc_string(prz, NULL, 0);
- *buf = kmalloc(size + *ecc_notice_size + 1, GFP_KERNEL);
- if (*buf == NULL) {
+ record->buf = kmalloc(size + record->ecc_notice_size + 1, GFP_KERNEL);
+ if (record->buf == NULL) {
size = -ENOMEM;
goto out;
}
- memcpy(*buf, (char *)persistent_ram_old(prz) + header_length, size);
+ memcpy(record->buf, (char *)persistent_ram_old(prz) + header_length,
+ size);
- persistent_ram_ecc_string(prz, *buf + size, *ecc_notice_size + 1);
+ persistent_ram_ecc_string(prz, record->buf + size,
+ record->ecc_notice_size + 1);
out:
if (free_prz) {
@@ -373,23 +378,18 @@ static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz,
return len;
}
-static int notrace ramoops_pstore_write_buf(enum pstore_type_id type,
- enum kmsg_dump_reason reason,
- u64 *id, unsigned int part,
- const char *buf,
- bool compressed, size_t size,
- struct pstore_info *psi)
+static int notrace ramoops_pstore_write(struct pstore_record *record)
{
- struct ramoops_context *cxt = psi->data;
+ struct ramoops_context *cxt = record->psi->data;
struct persistent_ram_zone *prz;
- size_t hlen;
+ size_t size, hlen;
- if (type == PSTORE_TYPE_CONSOLE) {
+ if (record->type == PSTORE_TYPE_CONSOLE) {
if (!cxt->cprz)
return -ENOMEM;
- persistent_ram_write(cxt->cprz, buf, size);
+ persistent_ram_write(cxt->cprz, record->buf, record->size);
return 0;
- } else if (type == PSTORE_TYPE_FTRACE) {
+ } else if (record->type == PSTORE_TYPE_FTRACE) {
int zonenum;
if (!cxt->fprzs)
@@ -402,33 +402,36 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type,
else
zonenum = 0;
- persistent_ram_write(cxt->fprzs[zonenum], buf, size);
+ persistent_ram_write(cxt->fprzs[zonenum], record->buf,
+ record->size);
return 0;
- } else if (type == PSTORE_TYPE_PMSG) {
+ } else if (record->type == PSTORE_TYPE_PMSG) {
pr_warn_ratelimited("PMSG shouldn't call %s\n", __func__);
return -EINVAL;
}
- if (type != PSTORE_TYPE_DMESG)
+ if (record->type != PSTORE_TYPE_DMESG)
return -EINVAL;
- /* Out of the various dmesg dump types, ramoops is currently designed
+ /*
+ * Out of the various dmesg dump types, ramoops is currently designed
* to only store crash logs, rather than storing general kernel logs.
*/
- if (reason != KMSG_DUMP_OOPS &&
- reason != KMSG_DUMP_PANIC)
+ if (record->reason != KMSG_DUMP_OOPS &&
+ record->reason != KMSG_DUMP_PANIC)
return -EINVAL;
/* Skip Oopes when configured to do so. */
- if (reason == KMSG_DUMP_OOPS && !cxt->dump_oops)
+ if (record->reason == KMSG_DUMP_OOPS && !cxt->dump_oops)
return -EINVAL;
- /* Explicitly only take the first part of any new crash.
+ /*
+ * Explicitly only take the first part of any new crash.
* If our buffer is larger than kmsg_bytes, this can never happen,
* and if our buffer is smaller than kmsg_bytes, we don't want the
* report split across multiple records.
*/
- if (part != 1)
+ if (record->part != 1)
return -ENOSPC;
if (!cxt->dprzs)
@@ -436,53 +439,50 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type,
prz = cxt->dprzs[cxt->dump_write_cnt];
- hlen = ramoops_write_kmsg_hdr(prz, compressed);
+ /* Build header and append record contents. */
+ hlen = ramoops_write_kmsg_hdr(prz, record->compressed);
+ size = record->size;
if (size + hlen > prz->buffer_size)
size = prz->buffer_size - hlen;
- persistent_ram_write(prz, buf, size);
+ persistent_ram_write(prz, record->buf, size);
cxt->dump_write_cnt = (cxt->dump_write_cnt + 1) % cxt->max_dump_cnt;
return 0;
}
-static int notrace ramoops_pstore_write_buf_user(enum pstore_type_id type,
- enum kmsg_dump_reason reason,
- u64 *id, unsigned int part,
- const char __user *buf,
- bool compressed, size_t size,
- struct pstore_info *psi)
+static int notrace ramoops_pstore_write_user(struct pstore_record *record,
+ const char __user *buf)
{
- if (type == PSTORE_TYPE_PMSG) {
- struct ramoops_context *cxt = psi->data;
+ if (record->type == PSTORE_TYPE_PMSG) {
+ struct ramoops_context *cxt = record->psi->data;
if (!cxt->mprz)
return -ENOMEM;
- return persistent_ram_write_user(cxt->mprz, buf, size);
+ return persistent_ram_write_user(cxt->mprz, buf, record->size);
}
return -EINVAL;
}
-static int ramoops_pstore_erase(enum pstore_type_id type, u64 id, int count,
- struct timespec time, struct pstore_info *psi)
+static int ramoops_pstore_erase(struct pstore_record *record)
{
- struct ramoops_context *cxt = psi->data;
+ struct ramoops_context *cxt = record->psi->data;
struct persistent_ram_zone *prz;
- switch (type) {
+ switch (record->type) {
case PSTORE_TYPE_DMESG:
- if (id >= cxt->max_dump_cnt)
+ if (record->id >= cxt->max_dump_cnt)
return -EINVAL;
- prz = cxt->dprzs[id];
+ prz = cxt->dprzs[record->id];
break;
case PSTORE_TYPE_CONSOLE:
prz = cxt->cprz;
break;
case PSTORE_TYPE_FTRACE:
- if (id >= cxt->max_ftrace_cnt)
+ if (record->id >= cxt->max_ftrace_cnt)
return -EINVAL;
- prz = cxt->fprzs[id];
+ prz = cxt->fprzs[record->id];
break;
case PSTORE_TYPE_PMSG:
prz = cxt->mprz;
@@ -503,8 +503,8 @@ static struct ramoops_context oops_cxt = {
.name = "ramoops",
.open = ramoops_pstore_open,
.read = ramoops_pstore_read,
- .write_buf = ramoops_pstore_write_buf,
- .write_buf_user = ramoops_pstore_write_buf_user,
+ .write = ramoops_pstore_write,
+ .write_user = ramoops_pstore_write_user,
.erase = ramoops_pstore_erase,
},
};
diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c
index bc927e30bdcc..e11672aa4575 100644
--- a/fs/pstore/ram_core.c
+++ b/fs/pstore/ram_core.c
@@ -532,7 +532,7 @@ struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size,
}
/* Initialize general buffer state. */
- prz->buffer_lock = __RAW_SPIN_LOCK_UNLOCKED(buffer_lock);
+ raw_spin_lock_init(&prz->buffer_lock);
prz->flags = flags;
ret = persistent_ram_buffer_map(start, size, prz, memtype);
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 74b489e3714d..ebf80c7739e1 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -2188,8 +2188,7 @@ int dquot_disable(struct super_block *sb, int type, unsigned int flags)
/* This can happen when suspending quotas on remount-ro... */
if (toputinode[cnt] && !sb_has_quota_loaded(sb, cnt)) {
inode_lock(toputinode[cnt]);
- toputinode[cnt]->i_flags &= ~(S_IMMUTABLE |
- S_NOATIME | S_NOQUOTA);
+ toputinode[cnt]->i_flags &= ~S_NOQUOTA;
truncate_inode_pages(&toputinode[cnt]->i_data, 0);
inode_unlock(toputinode[cnt]);
mark_inode_dirty_sync(toputinode[cnt]);
@@ -2237,7 +2236,6 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
struct super_block *sb = inode->i_sb;
struct quota_info *dqopt = sb_dqopt(sb);
int error;
- int oldflags = -1;
if (!fmt)
return -ESRCH;
@@ -2285,9 +2283,7 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
* possible) Also nobody should write to the file - we use
* special IO operations which ignore the immutable bit. */
inode_lock(inode);
- oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE |
- S_NOQUOTA);
- inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE;
+ inode->i_flags |= S_NOQUOTA;
inode_unlock(inode);
/*
* When S_NOQUOTA is set, remove dquot references as no more
@@ -2329,14 +2325,9 @@ out_file_init:
dqopt->files[type] = NULL;
iput(inode);
out_file_flags:
- if (oldflags != -1) {
- inode_lock(inode);
- /* Set the flags back (in the case of accidental quotaon()
- * on a wrong file we don't want to mess up the flags) */
- inode->i_flags &= ~(S_NOATIME | S_NOQUOTA | S_IMMUTABLE);
- inode->i_flags |= oldflags;
- inode_unlock(inode);
- }
+ inode_lock(inode);
+ inode->i_flags &= ~S_NOQUOTA;
+ inode_unlock(inode);
out_fmt:
put_quota_format(fmt);
@@ -2780,18 +2771,6 @@ int dquot_set_dqinfo(struct super_block *sb, int type, struct qc_info *ii)
}
EXPORT_SYMBOL(dquot_set_dqinfo);
-const struct quotactl_ops dquot_quotactl_ops = {
- .quota_on = dquot_quota_on,
- .quota_off = dquot_quota_off,
- .quota_sync = dquot_quota_sync,
- .get_state = dquot_get_state,
- .set_info = dquot_set_dqinfo,
- .get_dqblk = dquot_get_dqblk,
- .get_nextdqblk = dquot_get_next_dqblk,
- .set_dqblk = dquot_set_dqblk
-};
-EXPORT_SYMBOL(dquot_quotactl_ops);
-
const struct quotactl_ops dquot_quotactl_sysfile_ops = {
.quota_enable = dquot_quota_enable,
.quota_disable = dquot_quota_disable,
diff --git a/fs/read_write.c b/fs/read_write.c
index c4f88afbc67f..47c1d4484df9 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -841,6 +841,81 @@ out:
return ret;
}
+#ifdef CONFIG_COMPAT
+ssize_t compat_rw_copy_check_uvector(int type,
+ const struct compat_iovec __user *uvector, unsigned long nr_segs,
+ unsigned long fast_segs, struct iovec *fast_pointer,
+ struct iovec **ret_pointer)
+{
+ compat_ssize_t tot_len;
+ struct iovec *iov = *ret_pointer = fast_pointer;
+ ssize_t ret = 0;
+ int seg;
+
+ /*
+ * SuS says "The readv() function *may* fail if the iovcnt argument
+ * was less than or equal to 0, or greater than {IOV_MAX}. Linux has
+ * traditionally returned zero for zero segments, so...
+ */
+ if (nr_segs == 0)
+ goto out;
+
+ ret = -EINVAL;
+ if (nr_segs > UIO_MAXIOV)
+ goto out;
+ if (nr_segs > fast_segs) {
+ ret = -ENOMEM;
+ iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
+ if (iov == NULL)
+ goto out;
+ }
+ *ret_pointer = iov;
+
+ ret = -EFAULT;
+ if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector)))
+ goto out;
+
+ /*
+ * Single unix specification:
+ * We should -EINVAL if an element length is not >= 0 and fitting an
+ * ssize_t.
+ *
+ * In Linux, the total length is limited to MAX_RW_COUNT, there is
+ * no overflow possibility.
+ */
+ tot_len = 0;
+ ret = -EINVAL;
+ for (seg = 0; seg < nr_segs; seg++) {
+ compat_uptr_t buf;
+ compat_ssize_t len;
+
+ if (__get_user(len, &uvector->iov_len) ||
+ __get_user(buf, &uvector->iov_base)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ if (len < 0) /* size_t not fitting in compat_ssize_t .. */
+ goto out;
+ if (type >= 0 &&
+ !access_ok(vrfy_dir(type), compat_ptr(buf), len)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ if (len > MAX_RW_COUNT - tot_len)
+ len = MAX_RW_COUNT - tot_len;
+ tot_len += len;
+ iov->iov_base = compat_ptr(buf);
+ iov->iov_len = (compat_size_t) len;
+ uvector++;
+ iov++;
+ }
+ ret = tot_len;
+
+out:
+ return ret;
+}
+#endif
+
static ssize_t __do_readv_writev(int type, struct file *file,
struct iov_iter *iter, loff_t *pos, int flags)
{
diff --git a/fs/readdir.c b/fs/readdir.c
index 0e8a7f355f7a..89659549c09d 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -18,6 +18,7 @@
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/unistd.h>
+#include <linux/compat.h>
#include <linux/uaccess.h>
@@ -324,3 +325,167 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
fdput_pos(f);
return error;
}
+
+#ifdef CONFIG_COMPAT
+struct compat_old_linux_dirent {
+ compat_ulong_t d_ino;
+ compat_ulong_t d_offset;
+ unsigned short d_namlen;
+ char d_name[1];
+};
+
+struct compat_readdir_callback {
+ struct dir_context ctx;
+ struct compat_old_linux_dirent __user *dirent;
+ int result;
+};
+
+static int compat_fillonedir(struct dir_context *ctx, const char *name,
+ int namlen, loff_t offset, u64 ino,
+ unsigned int d_type)
+{
+ struct compat_readdir_callback *buf =
+ container_of(ctx, struct compat_readdir_callback, ctx);
+ struct compat_old_linux_dirent __user *dirent;
+ compat_ulong_t d_ino;
+
+ if (buf->result)
+ return -EINVAL;
+ d_ino = ino;
+ if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
+ buf->result = -EOVERFLOW;
+ return -EOVERFLOW;
+ }
+ buf->result++;
+ dirent = buf->dirent;
+ if (!access_ok(VERIFY_WRITE, dirent,
+ (unsigned long)(dirent->d_name + namlen + 1) -
+ (unsigned long)dirent))
+ goto efault;
+ if ( __put_user(d_ino, &dirent->d_ino) ||
+ __put_user(offset, &dirent->d_offset) ||
+ __put_user(namlen, &dirent->d_namlen) ||
+ __copy_to_user(dirent->d_name, name, namlen) ||
+ __put_user(0, dirent->d_name + namlen))
+ goto efault;
+ return 0;
+efault:
+ buf->result = -EFAULT;
+ return -EFAULT;
+}
+
+COMPAT_SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
+ struct compat_old_linux_dirent __user *, dirent, unsigned int, count)
+{
+ int error;
+ struct fd f = fdget_pos(fd);
+ struct compat_readdir_callback buf = {
+ .ctx.actor = compat_fillonedir,
+ .dirent = dirent
+ };
+
+ if (!f.file)
+ return -EBADF;
+
+ error = iterate_dir(f.file, &buf.ctx);
+ if (buf.result)
+ error = buf.result;
+
+ fdput_pos(f);
+ return error;
+}
+
+struct compat_linux_dirent {
+ compat_ulong_t d_ino;
+ compat_ulong_t d_off;
+ unsigned short d_reclen;
+ char d_name[1];
+};
+
+struct compat_getdents_callback {
+ struct dir_context ctx;
+ struct compat_linux_dirent __user *current_dir;
+ struct compat_linux_dirent __user *previous;
+ int count;
+ int error;
+};
+
+static int compat_filldir(struct dir_context *ctx, const char *name, int namlen,
+ loff_t offset, u64 ino, unsigned int d_type)
+{
+ struct compat_linux_dirent __user * dirent;
+ struct compat_getdents_callback *buf =
+ container_of(ctx, struct compat_getdents_callback, ctx);
+ compat_ulong_t d_ino;
+ int reclen = ALIGN(offsetof(struct compat_linux_dirent, d_name) +
+ namlen + 2, sizeof(compat_long_t));
+
+ buf->error = -EINVAL; /* only used if we fail.. */
+ if (reclen > buf->count)
+ return -EINVAL;
+ d_ino = ino;
+ if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
+ buf->error = -EOVERFLOW;
+ return -EOVERFLOW;
+ }
+ dirent = buf->previous;
+ if (dirent) {
+ if (signal_pending(current))
+ return -EINTR;
+ if (__put_user(offset, &dirent->d_off))
+ goto efault;
+ }
+ dirent = buf->current_dir;
+ if (__put_user(d_ino, &dirent->d_ino))
+ goto efault;
+ if (__put_user(reclen, &dirent->d_reclen))
+ goto efault;
+ if (copy_to_user(dirent->d_name, name, namlen))
+ goto efault;
+ if (__put_user(0, dirent->d_name + namlen))
+ goto efault;
+ if (__put_user(d_type, (char __user *) dirent + reclen - 1))
+ goto efault;
+ buf->previous = dirent;
+ dirent = (void __user *)dirent + reclen;
+ buf->current_dir = dirent;
+ buf->count -= reclen;
+ return 0;
+efault:
+ buf->error = -EFAULT;
+ return -EFAULT;
+}
+
+COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd,
+ struct compat_linux_dirent __user *, dirent, unsigned int, count)
+{
+ struct fd f;
+ struct compat_linux_dirent __user * lastdirent;
+ struct compat_getdents_callback buf = {
+ .ctx.actor = compat_filldir,
+ .current_dir = dirent,
+ .count = count
+ };
+ int error;
+
+ if (!access_ok(VERIFY_WRITE, dirent, count))
+ return -EFAULT;
+
+ f = fdget_pos(fd);
+ if (!f.file)
+ return -EBADF;
+
+ error = iterate_dir(f.file, &buf.ctx);
+ if (error >= 0)
+ error = buf.error;
+ lastdirent = buf.previous;
+ if (lastdirent) {
+ if (put_user(buf.ctx.pos, &lastdirent->d_off))
+ error = -EFAULT;
+ else
+ error = count - buf.count;
+ }
+ fdput_pos(f);
+ return error;
+}
+#endif
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index a6ab9d64ea1b..873fc04e9403 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1375,7 +1375,6 @@ static void init_inode(struct inode *inode, struct treepath *path)
static void inode2sd(void *sd, struct inode *inode, loff_t size)
{
struct stat_data *sd_v2 = (struct stat_data *)sd;
- __u16 flags;
set_sd_v2_mode(sd_v2, inode->i_mode);
set_sd_v2_nlink(sd_v2, inode->i_nlink);
@@ -1390,9 +1389,7 @@ static void inode2sd(void *sd, struct inode *inode, loff_t size)
set_sd_v2_rdev(sd_v2, new_encode_dev(inode->i_rdev));
else
set_sd_v2_generation(sd_v2, inode->i_generation);
- flags = REISERFS_I(inode)->i_attrs;
- i_attrs_to_sd_attrs(inode, &flags);
- set_sd_v2_attrs(sd_v2, flags);
+ set_sd_v2_attrs(sd_v2, REISERFS_I(inode)->i_attrs);
}
/* used to copy inode's fields to old stat data */
@@ -2002,10 +1999,6 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
/* uid and gid must already be set by the caller for quota init */
- /* symlink cannot be immutable or append only, right? */
- if (S_ISLNK(inode->i_mode))
- inode->i_flags &= ~(S_IMMUTABLE | S_APPEND);
-
inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
inode->i_size = i_size;
inode->i_blocks = 0;
@@ -3095,28 +3088,6 @@ void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode)
}
}
-void i_attrs_to_sd_attrs(struct inode *inode, __u16 * sd_attrs)
-{
- if (reiserfs_attrs(inode->i_sb)) {
- if (inode->i_flags & S_IMMUTABLE)
- *sd_attrs |= REISERFS_IMMUTABLE_FL;
- else
- *sd_attrs &= ~REISERFS_IMMUTABLE_FL;
- if (inode->i_flags & S_SYNC)
- *sd_attrs |= REISERFS_SYNC_FL;
- else
- *sd_attrs &= ~REISERFS_SYNC_FL;
- if (inode->i_flags & S_NOATIME)
- *sd_attrs |= REISERFS_NOATIME_FL;
- else
- *sd_attrs &= ~REISERFS_NOATIME_FL;
- if (REISERFS_I(inode)->i_flags & i_nopack_mask)
- *sd_attrs |= REISERFS_NOTAIL_FL;
- else
- *sd_attrs &= ~REISERFS_NOTAIL_FL;
- }
-}
-
/*
* decide if this buffer needs to stay around for data logging or ordered
* write purposes
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index 1f4692a505a0..acbbaf7a0bb2 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -47,7 +47,6 @@ long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
}
flags = REISERFS_I(inode)->i_attrs;
- i_attrs_to_sd_attrs(inode, (__u16 *) & flags);
err = put_user(flags, (int __user *)arg);
break;
case REISERFS_IOC_SETFLAGS:{
diff --git a/fs/reiserfs/item_ops.c b/fs/reiserfs/item_ops.c
index aca73dd73906..e3c558d1b78c 100644
--- a/fs/reiserfs/item_ops.c
+++ b/fs/reiserfs/item_ops.c
@@ -724,18 +724,18 @@ static void errcatch_print_vi(struct virtual_item *vi)
}
static struct item_operations errcatch_ops = {
- errcatch_bytes_number,
- errcatch_decrement_key,
- errcatch_is_left_mergeable,
- errcatch_print_item,
- errcatch_check_item,
-
- errcatch_create_vi,
- errcatch_check_left,
- errcatch_check_right,
- errcatch_part_size,
- errcatch_unit_num,
- errcatch_print_vi
+ .bytes_number = errcatch_bytes_number,
+ .decrement_key = errcatch_decrement_key,
+ .is_left_mergeable = errcatch_is_left_mergeable,
+ .print_item = errcatch_print_item,
+ .check_item = errcatch_check_item,
+
+ .create_vi = errcatch_create_vi,
+ .check_left = errcatch_check_left,
+ .check_right = errcatch_check_right,
+ .part_size = errcatch_part_size,
+ .unit_num = errcatch_unit_num,
+ .print_vi = errcatch_print_vi
};
#if ! (TYPE_STAT_DATA == 0 && TYPE_INDIRECT == 1 && TYPE_DIRECT == 2 && TYPE_DIRENTRY == 3)
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index aa40c242f1db..da01f497180a 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -1961,7 +1961,7 @@ static int do_journal_release(struct reiserfs_transaction_handle *th,
* will be requeued because superblock is being shutdown and doesn't
* have MS_ACTIVE set.
*/
- cancel_delayed_work_sync(&REISERFS_SB(sb)->old_work);
+ reiserfs_cancel_old_flush(sb);
/* wait for all commits to finish */
cancel_delayed_work_sync(&SB_JOURNAL(sb)->j_work);
diff --git a/fs/reiserfs/lbalance.c b/fs/reiserfs/lbalance.c
index 249594a821e0..f5cebd70d903 100644
--- a/fs/reiserfs/lbalance.c
+++ b/fs/reiserfs/lbalance.c
@@ -475,7 +475,7 @@ static void leaf_item_bottle(struct buffer_info *dest_bi,
* 'cpy_bytes'; create new item header;
* n_ih = new item_header;
*/
- memcpy(&n_ih, ih, SHORT_KEY_SIZE);
+ memcpy(&n_ih.ih_key, &ih->ih_key, KEY_SIZE);
/* Endian safe, both le */
n_ih.ih_version = ih->ih_version;
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
index 2adcde137c3f..1d34377fef97 100644
--- a/fs/reiserfs/reiserfs.h
+++ b/fs/reiserfs/reiserfs.h
@@ -1326,7 +1326,6 @@ struct cpu_key {
#define KEY_NOT_FOUND 0
#define KEY_SIZE (sizeof(struct reiserfs_key))
-#define SHORT_KEY_SIZE (sizeof (__u32) + sizeof (__u32))
/* return values for search_by_key and clones */
#define ITEM_FOUND 1
@@ -2949,6 +2948,7 @@ int reiserfs_allocate_list_bitmaps(struct super_block *s,
struct reiserfs_list_bitmap *, unsigned int);
void reiserfs_schedule_old_flush(struct super_block *s);
+void reiserfs_cancel_old_flush(struct super_block *s);
void add_save_link(struct reiserfs_transaction_handle *th,
struct inode *inode, int truncate);
int remove_save_link(struct inode *inode, int truncate);
@@ -3099,7 +3099,6 @@ static inline void reiserfs_update_sd(struct reiserfs_transaction_handle *th,
}
void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode);
-void i_attrs_to_sd_attrs(struct inode *inode, __u16 * sd_attrs);
int reiserfs_setattr(struct dentry *dentry, struct iattr *attr);
int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len);
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index feabcde0290d..685f1e056998 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -89,11 +89,27 @@ static void flush_old_commits(struct work_struct *work)
sbi = container_of(work, struct reiserfs_sb_info, old_work.work);
s = sbi->s_journal->j_work_sb;
+ /*
+ * We need s_umount for protecting quota writeback. We have to use
+ * trylock as reiserfs_cancel_old_flush() may be waiting for this work
+ * to complete with s_umount held.
+ */
+ if (!down_read_trylock(&s->s_umount)) {
+ /* Requeue work if we are not cancelling it */
+ spin_lock(&sbi->old_work_lock);
+ if (sbi->work_queued == 1)
+ queue_delayed_work(system_long_wq, &sbi->old_work, HZ);
+ spin_unlock(&sbi->old_work_lock);
+ return;
+ }
spin_lock(&sbi->old_work_lock);
- sbi->work_queued = 0;
+ /* Avoid clobbering the cancel state... */
+ if (sbi->work_queued == 1)
+ sbi->work_queued = 0;
spin_unlock(&sbi->old_work_lock);
reiserfs_sync_fs(s, 1);
+ up_read(&s->s_umount);
}
void reiserfs_schedule_old_flush(struct super_block *s)
@@ -117,21 +133,22 @@ void reiserfs_schedule_old_flush(struct super_block *s)
spin_unlock(&sbi->old_work_lock);
}
-static void cancel_old_flush(struct super_block *s)
+void reiserfs_cancel_old_flush(struct super_block *s)
{
struct reiserfs_sb_info *sbi = REISERFS_SB(s);
- cancel_delayed_work_sync(&REISERFS_SB(s)->old_work);
spin_lock(&sbi->old_work_lock);
- sbi->work_queued = 0;
+ /* Make sure no new flushes will be queued */
+ sbi->work_queued = 2;
spin_unlock(&sbi->old_work_lock);
+ cancel_delayed_work_sync(&REISERFS_SB(s)->old_work);
}
static int reiserfs_freeze(struct super_block *s)
{
struct reiserfs_transaction_handle th;
- cancel_old_flush(s);
+ reiserfs_cancel_old_flush(s);
reiserfs_write_lock(s);
if (!(s->s_flags & MS_RDONLY)) {
@@ -152,7 +169,13 @@ static int reiserfs_freeze(struct super_block *s)
static int reiserfs_unfreeze(struct super_block *s)
{
+ struct reiserfs_sb_info *sbi = REISERFS_SB(s);
+
reiserfs_allow_writes(s);
+ spin_lock(&sbi->old_work_lock);
+ /* Allow old_work to run again */
+ sbi->work_queued = 0;
+ spin_unlock(&sbi->old_work_lock);
return 0;
}
@@ -547,12 +570,28 @@ static void reiserfs_kill_sb(struct super_block *s)
kill_block_super(s);
}
+#ifdef CONFIG_QUOTA
+static int reiserfs_quota_off(struct super_block *sb, int type);
+
+static void reiserfs_quota_off_umount(struct super_block *s)
+{
+ int type;
+
+ for (type = 0; type < REISERFS_MAXQUOTAS; type++)
+ reiserfs_quota_off(s, type);
+}
+#else
+static inline void reiserfs_quota_off_umount(struct super_block *s)
+{
+}
+#endif
+
static void reiserfs_put_super(struct super_block *s)
{
struct reiserfs_transaction_handle th;
th.t_trans_id = 0;
- dquot_disable(s, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
+ reiserfs_quota_off_umount(s);
reiserfs_write_lock(s);
@@ -817,7 +856,7 @@ static const struct dquot_operations reiserfs_quota_operations = {
static const struct quotactl_ops reiserfs_qctl_operations = {
.quota_on = reiserfs_quota_on,
- .quota_off = dquot_quota_off,
+ .quota_off = reiserfs_quota_off,
.quota_sync = dquot_quota_sync,
.get_state = dquot_get_state,
.set_info = dquot_set_dqinfo,
@@ -2194,7 +2233,7 @@ error_unlocked:
if (sbi->commit_wq)
destroy_workqueue(sbi->commit_wq);
- cancel_delayed_work_sync(&REISERFS_SB(s)->old_work);
+ reiserfs_cancel_old_flush(s);
reiserfs_free_bitmap_cache(s);
if (SB_BUFFER_WITH_SB(s))
@@ -2405,12 +2444,47 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
goto out;
}
reiserfs_write_unlock(sb);
- return dquot_quota_on(sb, type, format_id, path);
+ err = dquot_quota_on(sb, type, format_id, path);
+ if (!err) {
+ inode_lock(inode);
+ REISERFS_I(inode)->i_attrs |= REISERFS_IMMUTABLE_FL |
+ REISERFS_NOATIME_FL;
+ inode_set_flags(inode, S_IMMUTABLE | S_NOATIME,
+ S_IMMUTABLE | S_NOATIME);
+ inode_unlock(inode);
+ mark_inode_dirty(inode);
+ }
+ return err;
out:
reiserfs_write_unlock(sb);
return err;
}
+static int reiserfs_quota_off(struct super_block *sb, int type)
+{
+ int err;
+ struct inode *inode = sb_dqopt(sb)->files[type];
+
+ if (!inode || !igrab(inode))
+ goto out;
+
+ err = dquot_quota_off(sb, type);
+ if (err)
+ goto out_put;
+
+ inode_lock(inode);
+ REISERFS_I(inode)->i_attrs &= ~(REISERFS_IMMUTABLE_FL |
+ REISERFS_NOATIME_FL);
+ inode_set_flags(inode, 0, S_IMMUTABLE | S_NOATIME);
+ inode_unlock(inode);
+ mark_inode_dirty(inode);
+out_put:
+ iput(inode);
+ return err;
+out:
+ return dquot_quota_off(sb, type);
+}
+
/*
* Read data from quotafile - avoid pagecache and such because we cannot afford
* acquiring the locks... As quota files are never truncated and quota code
diff --git a/fs/select.c b/fs/select.c
index e2112270d75a..d6c652a31e99 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -338,6 +338,53 @@ sticky:
return ret;
}
+/*
+ * Scalable version of the fd_set.
+ */
+
+typedef struct {
+ unsigned long *in, *out, *ex;
+ unsigned long *res_in, *res_out, *res_ex;
+} fd_set_bits;
+
+/*
+ * How many longwords for "nr" bits?
+ */
+#define FDS_BITPERLONG (8*sizeof(long))
+#define FDS_LONGS(nr) (((nr)+FDS_BITPERLONG-1)/FDS_BITPERLONG)
+#define FDS_BYTES(nr) (FDS_LONGS(nr)*sizeof(long))
+
+/*
+ * We do a VERIFY_WRITE here even though we are only reading this time:
+ * we'll write to it eventually..
+ *
+ * Use "unsigned long" accesses to let user-mode fd_set's be long-aligned.
+ */
+static inline
+int get_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset)
+{
+ nr = FDS_BYTES(nr);
+ if (ufdset)
+ return copy_from_user(fdset, ufdset, nr) ? -EFAULT : 0;
+
+ memset(fdset, 0, nr);
+ return 0;
+}
+
+static inline unsigned long __must_check
+set_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset)
+{
+ if (ufdset)
+ return __copy_to_user(ufdset, fdset, FDS_BYTES(nr));
+ return 0;
+}
+
+static inline
+void zero_fd_set(unsigned long nr, unsigned long *fdset)
+{
+ memset(fdset, 0, FDS_BYTES(nr));
+}
+
#define FDS_IN(fds, n) (fds->in + n)
#define FDS_OUT(fds, n) (fds->out + n)
#define FDS_EX(fds, n) (fds->ex + n)
@@ -401,7 +448,7 @@ static inline void wait_key_set(poll_table *wait, unsigned long in,
wait->_key |= POLLOUT_SET;
}
-int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
+static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
{
ktime_t expire, *to = NULL;
struct poll_wqueues table;
@@ -409,7 +456,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
int retval, i, timed_out = 0;
u64 slack = 0;
unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
- unsigned long busy_end = 0;
+ unsigned long busy_start = 0;
rcu_read_lock();
retval = max_select_fd(n, fds);
@@ -512,11 +559,11 @@ int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
/* only if found POLL_BUSY_LOOP sockets && not out of time */
if (can_busy_loop && !need_resched()) {
- if (!busy_end) {
- busy_end = busy_loop_end_time();
+ if (!busy_start) {
+ busy_start = busy_loop_current_time();
continue;
}
- if (!busy_loop_timeout(busy_end))
+ if (!busy_loop_timeout(busy_start))
continue;
}
busy_flag = 0;
@@ -586,10 +633,7 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
goto out_nofds;
alloc_size = 6 * size;
- bits = kmalloc(alloc_size, GFP_KERNEL|__GFP_NOWARN);
- if (!bits && alloc_size > PAGE_SIZE)
- bits = vmalloc(alloc_size);
-
+ bits = kvmalloc(alloc_size, GFP_KERNEL);
if (!bits)
goto out_nofds;
}
@@ -800,7 +844,7 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
int timed_out = 0, count = 0;
u64 slack = 0;
unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
- unsigned long busy_end = 0;
+ unsigned long busy_start = 0;
/* Optimise the no-wait case */
if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
@@ -853,11 +897,11 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
/* only if found POLL_BUSY_LOOP sockets && not out of time */
if (can_busy_loop && !need_resched()) {
- if (!busy_end) {
- busy_end = busy_loop_end_time();
+ if (!busy_start) {
+ busy_start = busy_loop_current_time();
continue;
}
- if (!busy_loop_timeout(busy_end))
+ if (!busy_loop_timeout(busy_start))
continue;
}
busy_flag = 0;
@@ -881,7 +925,7 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
#define N_STACK_PPS ((sizeof(stack_pps) - sizeof(struct poll_list)) / \
sizeof(struct pollfd))
-int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
+static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
struct timespec64 *end_time)
{
struct poll_wqueues table;
@@ -1053,3 +1097,373 @@ SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds,
return ret;
}
+
+#ifdef CONFIG_COMPAT
+#define __COMPAT_NFDBITS (8 * sizeof(compat_ulong_t))
+
+static
+int compat_poll_select_copy_remaining(struct timespec *end_time, void __user *p,
+ int timeval, int ret)
+{
+ struct timespec ts;
+
+ if (!p)
+ return ret;
+
+ if (current->personality & STICKY_TIMEOUTS)
+ goto sticky;
+
+ /* No update for zero timeout */
+ if (!end_time->tv_sec && !end_time->tv_nsec)
+ return ret;
+
+ ktime_get_ts(&ts);
+ ts = timespec_sub(*end_time, ts);
+ if (ts.tv_sec < 0)
+ ts.tv_sec = ts.tv_nsec = 0;
+
+ if (timeval) {
+ struct compat_timeval rtv;
+
+ rtv.tv_sec = ts.tv_sec;
+ rtv.tv_usec = ts.tv_nsec / NSEC_PER_USEC;
+
+ if (!copy_to_user(p, &rtv, sizeof(rtv)))
+ return ret;
+ } else {
+ struct compat_timespec rts;
+
+ rts.tv_sec = ts.tv_sec;
+ rts.tv_nsec = ts.tv_nsec;
+
+ if (!copy_to_user(p, &rts, sizeof(rts)))
+ return ret;
+ }
+ /*
+ * If an application puts its timeval in read-only memory, we
+ * don't want the Linux-specific update to the timeval to
+ * cause a fault after the select has completed
+ * successfully. However, because we're not updating the
+ * timeval, we can't restart the system call.
+ */
+
+sticky:
+ if (ret == -ERESTARTNOHAND)
+ ret = -EINTR;
+ return ret;
+}
+
+/*
+ * Ooo, nasty. We need here to frob 32-bit unsigned longs to
+ * 64-bit unsigned longs.
+ */
+static
+int compat_get_fd_set(unsigned long nr, compat_ulong_t __user *ufdset,
+ unsigned long *fdset)
+{
+ nr = DIV_ROUND_UP(nr, __COMPAT_NFDBITS);
+ if (ufdset) {
+ unsigned long odd;
+
+ if (!access_ok(VERIFY_WRITE, ufdset, nr*sizeof(compat_ulong_t)))
+ return -EFAULT;
+
+ odd = nr & 1UL;
+ nr &= ~1UL;
+ while (nr) {
+ unsigned long h, l;
+ if (__get_user(l, ufdset) || __get_user(h, ufdset+1))
+ return -EFAULT;
+ ufdset += 2;
+ *fdset++ = h << 32 | l;
+ nr -= 2;
+ }
+ if (odd && __get_user(*fdset, ufdset))
+ return -EFAULT;
+ } else {
+ /* Tricky, must clear full unsigned long in the
+ * kernel fdset at the end, this makes sure that
+ * actually happens.
+ */
+ memset(fdset, 0, ((nr + 1) & ~1)*sizeof(compat_ulong_t));
+ }
+ return 0;
+}
+
+static
+int compat_set_fd_set(unsigned long nr, compat_ulong_t __user *ufdset,
+ unsigned long *fdset)
+{
+ unsigned long odd;
+ nr = DIV_ROUND_UP(nr, __COMPAT_NFDBITS);
+
+ if (!ufdset)
+ return 0;
+
+ odd = nr & 1UL;
+ nr &= ~1UL;
+ while (nr) {
+ unsigned long h, l;
+ l = *fdset++;
+ h = l >> 32;
+ if (__put_user(l, ufdset) || __put_user(h, ufdset+1))
+ return -EFAULT;
+ ufdset += 2;
+ nr -= 2;
+ }
+ if (odd && __put_user(*fdset, ufdset))
+ return -EFAULT;
+ return 0;
+}
+
+
+/*
+ * This is a virtual copy of sys_select from fs/select.c and probably
+ * should be compared to it from time to time
+ */
+
+/*
+ * We can actually return ERESTARTSYS instead of EINTR, but I'd
+ * like to be certain this leads to no problems. So I return
+ * EINTR just for safety.
+ *
+ * Update: ERESTARTSYS breaks at least the xview clock binary, so
+ * I'm trying ERESTARTNOHAND which restart only when you want to.
+ */
+static int compat_core_sys_select(int n, compat_ulong_t __user *inp,
+ compat_ulong_t __user *outp, compat_ulong_t __user *exp,
+ struct timespec *end_time)
+{
+ fd_set_bits fds;
+ void *bits;
+ int size, max_fds, ret = -EINVAL;
+ struct fdtable *fdt;
+ long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];
+
+ if (n < 0)
+ goto out_nofds;
+
+ /* max_fds can increase, so grab it once to avoid race */
+ rcu_read_lock();
+ fdt = files_fdtable(current->files);
+ max_fds = fdt->max_fds;
+ rcu_read_unlock();
+ if (n > max_fds)
+ n = max_fds;
+
+ /*
+ * We need 6 bitmaps (in/out/ex for both incoming and outgoing),
+ * since we used fdset we need to allocate memory in units of
+ * long-words.
+ */
+ size = FDS_BYTES(n);
+ bits = stack_fds;
+ if (size > sizeof(stack_fds) / 6) {
+ bits = kmalloc(6 * size, GFP_KERNEL);
+ ret = -ENOMEM;
+ if (!bits)
+ goto out_nofds;
+ }
+ fds.in = (unsigned long *) bits;
+ fds.out = (unsigned long *) (bits + size);
+ fds.ex = (unsigned long *) (bits + 2*size);
+ fds.res_in = (unsigned long *) (bits + 3*size);
+ fds.res_out = (unsigned long *) (bits + 4*size);
+ fds.res_ex = (unsigned long *) (bits + 5*size);
+
+ if ((ret = compat_get_fd_set(n, inp, fds.in)) ||
+ (ret = compat_get_fd_set(n, outp, fds.out)) ||
+ (ret = compat_get_fd_set(n, exp, fds.ex)))
+ goto out;
+ zero_fd_set(n, fds.res_in);
+ zero_fd_set(n, fds.res_out);
+ zero_fd_set(n, fds.res_ex);
+
+ ret = do_select(n, &fds, end_time);
+
+ if (ret < 0)
+ goto out;
+ if (!ret) {
+ ret = -ERESTARTNOHAND;
+ if (signal_pending(current))
+ goto out;
+ ret = 0;
+ }
+
+ if (compat_set_fd_set(n, inp, fds.res_in) ||
+ compat_set_fd_set(n, outp, fds.res_out) ||
+ compat_set_fd_set(n, exp, fds.res_ex))
+ ret = -EFAULT;
+out:
+ if (bits != stack_fds)
+ kfree(bits);
+out_nofds:
+ return ret;
+}
+
+COMPAT_SYSCALL_DEFINE5(select, int, n, compat_ulong_t __user *, inp,
+ compat_ulong_t __user *, outp, compat_ulong_t __user *, exp,
+ struct compat_timeval __user *, tvp)
+{
+ struct timespec end_time, *to = NULL;
+ struct compat_timeval tv;
+ int ret;
+
+ if (tvp) {
+ if (copy_from_user(&tv, tvp, sizeof(tv)))
+ return -EFAULT;
+
+ to = &end_time;
+ if (poll_select_set_timeout(to,
+ tv.tv_sec + (tv.tv_usec / USEC_PER_SEC),
+ (tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC))
+ return -EINVAL;
+ }
+
+ ret = compat_core_sys_select(n, inp, outp, exp, to);
+ ret = compat_poll_select_copy_remaining(&end_time, tvp, 1, ret);
+
+ return ret;
+}
+
+struct compat_sel_arg_struct {
+ compat_ulong_t n;
+ compat_uptr_t inp;
+ compat_uptr_t outp;
+ compat_uptr_t exp;
+ compat_uptr_t tvp;
+};
+
+COMPAT_SYSCALL_DEFINE1(old_select, struct compat_sel_arg_struct __user *, arg)
+{
+ struct compat_sel_arg_struct a;
+
+ if (copy_from_user(&a, arg, sizeof(a)))
+ return -EFAULT;
+ return compat_sys_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp),
+ compat_ptr(a.exp), compat_ptr(a.tvp));
+}
+
+static long do_compat_pselect(int n, compat_ulong_t __user *inp,
+ compat_ulong_t __user *outp, compat_ulong_t __user *exp,
+ struct compat_timespec __user *tsp, compat_sigset_t __user *sigmask,
+ compat_size_t sigsetsize)
+{
+ compat_sigset_t ss32;
+ sigset_t ksigmask, sigsaved;
+ struct compat_timespec ts;
+ struct timespec end_time, *to = NULL;
+ int ret;
+
+ if (tsp) {
+ if (copy_from_user(&ts, tsp, sizeof(ts)))
+ return -EFAULT;
+
+ to = &end_time;
+ if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
+ return -EINVAL;
+ }
+
+ if (sigmask) {
+ if (sigsetsize != sizeof(compat_sigset_t))
+ return -EINVAL;
+ if (copy_from_user(&ss32, sigmask, sizeof(ss32)))
+ return -EFAULT;
+ sigset_from_compat(&ksigmask, &ss32);
+
+ sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
+ sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
+ }
+
+ ret = compat_core_sys_select(n, inp, outp, exp, to);
+ ret = compat_poll_select_copy_remaining(&end_time, tsp, 0, ret);
+
+ if (ret == -ERESTARTNOHAND) {
+ /*
+ * Don't restore the signal mask yet. Let do_signal() deliver
+ * the signal on the way back to userspace, before the signal
+ * mask is restored.
+ */
+ if (sigmask) {
+ memcpy(&current->saved_sigmask, &sigsaved,
+ sizeof(sigsaved));
+ set_restore_sigmask();
+ }
+ } else if (sigmask)
+ sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+
+ return ret;
+}
+
+COMPAT_SYSCALL_DEFINE6(pselect6, int, n, compat_ulong_t __user *, inp,
+ compat_ulong_t __user *, outp, compat_ulong_t __user *, exp,
+ struct compat_timespec __user *, tsp, void __user *, sig)
+{
+ compat_size_t sigsetsize = 0;
+ compat_uptr_t up = 0;
+
+ if (sig) {
+ if (!access_ok(VERIFY_READ, sig,
+ sizeof(compat_uptr_t)+sizeof(compat_size_t)) ||
+ __get_user(up, (compat_uptr_t __user *)sig) ||
+ __get_user(sigsetsize,
+ (compat_size_t __user *)(sig+sizeof(up))))
+ return -EFAULT;
+ }
+ return do_compat_pselect(n, inp, outp, exp, tsp, compat_ptr(up),
+ sigsetsize);
+}
+
+COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds,
+ unsigned int, nfds, struct compat_timespec __user *, tsp,
+ const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize)
+{
+ compat_sigset_t ss32;
+ sigset_t ksigmask, sigsaved;
+ struct compat_timespec ts;
+ struct timespec end_time, *to = NULL;
+ int ret;
+
+ if (tsp) {
+ if (copy_from_user(&ts, tsp, sizeof(ts)))
+ return -EFAULT;
+
+ to = &end_time;
+ if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
+ return -EINVAL;
+ }
+
+ if (sigmask) {
+ if (sigsetsize != sizeof(compat_sigset_t))
+ return -EINVAL;
+ if (copy_from_user(&ss32, sigmask, sizeof(ss32)))
+ return -EFAULT;
+ sigset_from_compat(&ksigmask, &ss32);
+
+ sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
+ sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
+ }
+
+ ret = do_sys_poll(ufds, nfds, to);
+
+ /* We can restart this syscall, usually */
+ if (ret == -EINTR) {
+ /*
+ * Don't restore the signal mask yet. Let do_signal() deliver
+ * the signal on the way back to userspace, before the signal
+ * mask is restored.
+ */
+ if (sigmask) {
+ memcpy(&current->saved_sigmask, &sigsaved,
+ sizeof(sigsaved));
+ set_restore_sigmask();
+ }
+ ret = -ERESTARTNOHAND;
+ } else if (sigmask)
+ sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+
+ ret = compat_poll_select_copy_remaining(&end_time, tsp, 0, ret);
+
+ return ret;
+}
+#endif
diff --git a/fs/seq_file.c b/fs/seq_file.c
index ca69fb99e41a..dc7c2be963ed 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -25,21 +25,7 @@ static void seq_set_overflow(struct seq_file *m)
static void *seq_buf_alloc(unsigned long size)
{
- void *buf;
- gfp_t gfp = GFP_KERNEL;
-
- /*
- * For high order allocations, use __GFP_NORETRY to avoid oom-killing -
- * it's better to fall back to vmalloc() than to kill things. For small
- * allocations, just use GFP_KERNEL which will oom kill, thus no need
- * for vmalloc fallback.
- */
- if (size > PAGE_SIZE)
- gfp |= __GFP_NORETRY | __GFP_NOWARN;
- buf = kmalloc(size, gfp);
- if (!buf && size > PAGE_SIZE)
- buf = vmalloc(size);
- return buf;
+ return kvmalloc(size, GFP_KERNEL);
}
/**
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 270221fcef42..7e3d71109f51 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -38,7 +38,7 @@ void signalfd_cleanup(struct sighand_struct *sighand)
/*
* The lockless check can race with remove_wait_queue() in progress,
* but in this case its caller should run under rcu_read_lock() and
- * sighand_cachep is SLAB_DESTROY_BY_RCU, we can safely return.
+ * sighand_cachep is SLAB_TYPESAFE_BY_RCU, we can safely return.
*/
if (likely(!waitqueue_active(wqh)))
return;
diff --git a/fs/splice.c b/fs/splice.c
index 006ba50f4ece..540c4a44756c 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -247,11 +247,6 @@ ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
}
EXPORT_SYMBOL(add_to_pipe);
-void spd_release_page(struct splice_pipe_desc *spd, unsigned int i)
-{
- put_page(spd->pages[i]);
-}
-
/*
* Check if we need to grow the arrays holding pages and partial page
* descriptions.
@@ -393,7 +388,7 @@ static ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
struct iov_iter to;
struct page **pages;
unsigned int nr_pages;
- size_t offset, dummy, copied = 0;
+ size_t offset, base, copied = 0;
ssize_t res;
int i;
@@ -408,12 +403,11 @@ static ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
iov_iter_pipe(&to, ITER_PIPE | READ, pipe, len + offset);
- res = iov_iter_get_pages_alloc(&to, &pages, len + offset, &dummy);
+ res = iov_iter_get_pages_alloc(&to, &pages, len + offset, &base);
if (res <= 0)
return -ENOMEM;
- BUG_ON(dummy);
- nr_pages = DIV_ROUND_UP(res, PAGE_SIZE);
+ nr_pages = DIV_ROUND_UP(res + base, PAGE_SIZE);
vec = __vec;
if (nr_pages > PIPE_DEF_BUFFERS) {
@@ -1359,6 +1353,8 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov,
struct fd f;
long error;
+ if (unlikely(flags & ~SPLICE_F_ALL))
+ return -EINVAL;
if (unlikely(nr_segs > UIO_MAXIOV))
return -EINVAL;
else if (unlikely(!nr_segs))
@@ -1409,6 +1405,9 @@ SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
if (unlikely(!len))
return 0;
+ if (unlikely(flags & ~SPLICE_F_ALL))
+ return -EINVAL;
+
error = -EBADF;
in = fdget(fd_in);
if (in.file) {
@@ -1737,6 +1736,9 @@ SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags)
struct fd in;
int error;
+ if (unlikely(flags & ~SPLICE_F_ALL))
+ return -EINVAL;
+
if (unlikely(!len))
return 0;
diff --git a/fs/stat.c b/fs/stat.c
index c6c963b2546b..f494b182c7c7 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -15,6 +15,7 @@
#include <linux/cred.h>
#include <linux/syscalls.h>
#include <linux/pagemap.h>
+#include <linux/compat.h>
#include <linux/uaccess.h>
#include <asm/unistd.h>
@@ -547,13 +548,13 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer)
/**
* sys_statx - System call to get enhanced stats
* @dfd: Base directory to pathwalk from *or* fd to stat.
- * @filename: File to stat *or* NULL.
+ * @filename: File to stat or "" with AT_EMPTY_PATH
* @flags: AT_* flags to control pathwalk.
* @mask: Parts of statx struct actually required.
* @buffer: Result buffer.
*
- * Note that if filename is NULL, then it does the equivalent of fstat() using
- * dfd to indicate the file of interest.
+ * Note that fstat() can be emulated by setting dfd to the fd of interest,
+ * supplying "" as the filename and setting AT_EMPTY_PATH in the flags.
*/
SYSCALL_DEFINE5(statx,
int, dfd, const char __user *, filename, unsigned, flags,
@@ -568,16 +569,98 @@ SYSCALL_DEFINE5(statx,
if ((flags & AT_STATX_SYNC_TYPE) == AT_STATX_SYNC_TYPE)
return -EINVAL;
- if (filename)
- error = vfs_statx(dfd, filename, flags, &stat, mask);
- else
- error = vfs_statx_fd(dfd, &stat, mask, flags);
+ error = vfs_statx(dfd, filename, flags, &stat, mask);
if (error)
return error;
return cp_statx(&stat, buffer);
}
+#ifdef CONFIG_COMPAT
+static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf)
+{
+ struct compat_stat tmp;
+
+ if (!old_valid_dev(stat->dev) || !old_valid_dev(stat->rdev))
+ return -EOVERFLOW;
+
+ memset(&tmp, 0, sizeof(tmp));
+ tmp.st_dev = old_encode_dev(stat->dev);
+ tmp.st_ino = stat->ino;
+ if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino)
+ return -EOVERFLOW;
+ tmp.st_mode = stat->mode;
+ tmp.st_nlink = stat->nlink;
+ if (tmp.st_nlink != stat->nlink)
+ return -EOVERFLOW;
+ SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid));
+ SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid));
+ tmp.st_rdev = old_encode_dev(stat->rdev);
+ if ((u64) stat->size > MAX_NON_LFS)
+ return -EOVERFLOW;
+ tmp.st_size = stat->size;
+ tmp.st_atime = stat->atime.tv_sec;
+ tmp.st_atime_nsec = stat->atime.tv_nsec;
+ tmp.st_mtime = stat->mtime.tv_sec;
+ tmp.st_mtime_nsec = stat->mtime.tv_nsec;
+ tmp.st_ctime = stat->ctime.tv_sec;
+ tmp.st_ctime_nsec = stat->ctime.tv_nsec;
+ tmp.st_blocks = stat->blocks;
+ tmp.st_blksize = stat->blksize;
+ return copy_to_user(ubuf, &tmp, sizeof(tmp)) ? -EFAULT : 0;
+}
+
+COMPAT_SYSCALL_DEFINE2(newstat, const char __user *, filename,
+ struct compat_stat __user *, statbuf)
+{
+ struct kstat stat;
+ int error;
+
+ error = vfs_stat(filename, &stat);
+ if (error)
+ return error;
+ return cp_compat_stat(&stat, statbuf);
+}
+
+COMPAT_SYSCALL_DEFINE2(newlstat, const char __user *, filename,
+ struct compat_stat __user *, statbuf)
+{
+ struct kstat stat;
+ int error;
+
+ error = vfs_lstat(filename, &stat);
+ if (error)
+ return error;
+ return cp_compat_stat(&stat, statbuf);
+}
+
+#ifndef __ARCH_WANT_STAT64
+COMPAT_SYSCALL_DEFINE4(newfstatat, unsigned int, dfd,
+ const char __user *, filename,
+ struct compat_stat __user *, statbuf, int, flag)
+{
+ struct kstat stat;
+ int error;
+
+ error = vfs_fstatat(dfd, filename, &stat, flag);
+ if (error)
+ return error;
+ return cp_compat_stat(&stat, statbuf);
+}
+#endif
+
+COMPAT_SYSCALL_DEFINE2(newfstat, unsigned int, fd,
+ struct compat_stat __user *, statbuf)
+{
+ struct kstat stat;
+ int error = vfs_fstat(fd, &stat);
+
+ if (!error)
+ error = cp_compat_stat(&stat, statbuf);
+ return error;
+}
+#endif
+
/* Caller is here responsible for sufficient locking (ie. inode->i_lock) */
void __inode_add_bytes(struct inode *inode, loff_t bytes)
{
diff --git a/fs/statfs.c b/fs/statfs.c
index 13ae259d4879..4e4623c7a126 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -7,6 +7,7 @@
#include <linux/statfs.h>
#include <linux/security.h>
#include <linux/uaccess.h>
+#include <linux/compat.h>
#include "internal.h"
static int flags_by_mnt(int mnt_flags)
@@ -239,3 +240,142 @@ SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf)
return copy_to_user(ubuf, &tmp, sizeof(struct ustat)) ? -EFAULT : 0;
}
+
+#ifdef CONFIG_COMPAT
+static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs *kbuf)
+{
+ if (sizeof ubuf->f_blocks == 4) {
+ if ((kbuf->f_blocks | kbuf->f_bfree | kbuf->f_bavail |
+ kbuf->f_bsize | kbuf->f_frsize) & 0xffffffff00000000ULL)
+ return -EOVERFLOW;
+ /* f_files and f_ffree may be -1; it's okay
+ * to stuff that into 32 bits */
+ if (kbuf->f_files != 0xffffffffffffffffULL
+ && (kbuf->f_files & 0xffffffff00000000ULL))
+ return -EOVERFLOW;
+ if (kbuf->f_ffree != 0xffffffffffffffffULL
+ && (kbuf->f_ffree & 0xffffffff00000000ULL))
+ return -EOVERFLOW;
+ }
+ if (!access_ok(VERIFY_WRITE, ubuf, sizeof(*ubuf)) ||
+ __put_user(kbuf->f_type, &ubuf->f_type) ||
+ __put_user(kbuf->f_bsize, &ubuf->f_bsize) ||
+ __put_user(kbuf->f_blocks, &ubuf->f_blocks) ||
+ __put_user(kbuf->f_bfree, &ubuf->f_bfree) ||
+ __put_user(kbuf->f_bavail, &ubuf->f_bavail) ||
+ __put_user(kbuf->f_files, &ubuf->f_files) ||
+ __put_user(kbuf->f_ffree, &ubuf->f_ffree) ||
+ __put_user(kbuf->f_namelen, &ubuf->f_namelen) ||
+ __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) ||
+ __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) ||
+ __put_user(kbuf->f_frsize, &ubuf->f_frsize) ||
+ __put_user(kbuf->f_flags, &ubuf->f_flags) ||
+ __clear_user(ubuf->f_spare, sizeof(ubuf->f_spare)))
+ return -EFAULT;
+ return 0;
+}
+
+/*
+ * The following statfs calls are copies of code from fs/statfs.c and
+ * should be checked against those from time to time
+ */
+COMPAT_SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct compat_statfs __user *, buf)
+{
+ struct kstatfs tmp;
+ int error = user_statfs(pathname, &tmp);
+ if (!error)
+ error = put_compat_statfs(buf, &tmp);
+ return error;
+}
+
+COMPAT_SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct compat_statfs __user *, buf)
+{
+ struct kstatfs tmp;
+ int error = fd_statfs(fd, &tmp);
+ if (!error)
+ error = put_compat_statfs(buf, &tmp);
+ return error;
+}
+
+static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstatfs *kbuf)
+{
+ if (sizeof(ubuf->f_bsize) == 4) {
+ if ((kbuf->f_type | kbuf->f_bsize | kbuf->f_namelen |
+ kbuf->f_frsize | kbuf->f_flags) & 0xffffffff00000000ULL)
+ return -EOVERFLOW;
+ /* f_files and f_ffree may be -1; it's okay
+ * to stuff that into 32 bits */
+ if (kbuf->f_files != 0xffffffffffffffffULL
+ && (kbuf->f_files & 0xffffffff00000000ULL))
+ return -EOVERFLOW;
+ if (kbuf->f_ffree != 0xffffffffffffffffULL
+ && (kbuf->f_ffree & 0xffffffff00000000ULL))
+ return -EOVERFLOW;
+ }
+ if (!access_ok(VERIFY_WRITE, ubuf, sizeof(*ubuf)) ||
+ __put_user(kbuf->f_type, &ubuf->f_type) ||
+ __put_user(kbuf->f_bsize, &ubuf->f_bsize) ||
+ __put_user(kbuf->f_blocks, &ubuf->f_blocks) ||
+ __put_user(kbuf->f_bfree, &ubuf->f_bfree) ||
+ __put_user(kbuf->f_bavail, &ubuf->f_bavail) ||
+ __put_user(kbuf->f_files, &ubuf->f_files) ||
+ __put_user(kbuf->f_ffree, &ubuf->f_ffree) ||
+ __put_user(kbuf->f_namelen, &ubuf->f_namelen) ||
+ __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) ||
+ __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) ||
+ __put_user(kbuf->f_frsize, &ubuf->f_frsize) ||
+ __put_user(kbuf->f_flags, &ubuf->f_flags) ||
+ __clear_user(ubuf->f_spare, sizeof(ubuf->f_spare)))
+ return -EFAULT;
+ return 0;
+}
+
+COMPAT_SYSCALL_DEFINE3(statfs64, const char __user *, pathname, compat_size_t, sz, struct compat_statfs64 __user *, buf)
+{
+ struct kstatfs tmp;
+ int error;
+
+ if (sz != sizeof(*buf))
+ return -EINVAL;
+
+ error = user_statfs(pathname, &tmp);
+ if (!error)
+ error = put_compat_statfs64(buf, &tmp);
+ return error;
+}
+
+COMPAT_SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, compat_size_t, sz, struct compat_statfs64 __user *, buf)
+{
+ struct kstatfs tmp;
+ int error;
+
+ if (sz != sizeof(*buf))
+ return -EINVAL;
+
+ error = fd_statfs(fd, &tmp);
+ if (!error)
+ error = put_compat_statfs64(buf, &tmp);
+ return error;
+}
+
+/*
+ * This is a copy of sys_ustat, just dealing with a structure layout.
+ * Given how simple this syscall is that apporach is more maintainable
+ * than the various conversion hacks.
+ */
+COMPAT_SYSCALL_DEFINE2(ustat, unsigned, dev, struct compat_ustat __user *, u)
+{
+ struct compat_ustat tmp;
+ struct kstatfs sbuf;
+ int err = vfs_ustat(new_decode_dev(dev), &sbuf);
+ if (err)
+ return err;
+
+ memset(&tmp, 0, sizeof(struct compat_ustat));
+ tmp.f_tfree = sbuf.f_bfree;
+ tmp.f_tinode = sbuf.f_ffree;
+ if (copy_to_user(u, &tmp, sizeof(struct compat_ustat)))
+ return -EFAULT;
+ return 0;
+}
+#endif
diff --git a/fs/super.c b/fs/super.c
index b8b6a086c03b..adb0c0de428c 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -446,6 +446,10 @@ void generic_shutdown_super(struct super_block *sb)
hlist_del_init(&sb->s_instances);
spin_unlock(&sb_lock);
up_write(&sb->s_umount);
+ if (sb->s_bdi != &noop_backing_dev_info) {
+ bdi_put(sb->s_bdi);
+ sb->s_bdi = &noop_backing_dev_info;
+ }
}
EXPORT_SYMBOL(generic_shutdown_super);
@@ -1049,12 +1053,8 @@ static int set_bdev_super(struct super_block *s, void *data)
{
s->s_bdev = data;
s->s_dev = s->s_bdev->bd_dev;
+ s->s_bdi = bdi_get(s->s_bdev->bd_bdi);
- /*
- * We set the bdi here to the queue backing, file systems can
- * overwrite this in ->fill_super()
- */
- s->s_bdi = bdev_get_queue(s->s_bdev)->backing_dev_info;
return 0;
}
@@ -1256,6 +1256,49 @@ out:
}
/*
+ * Setup private BDI for given superblock. It gets automatically cleaned up
+ * in generic_shutdown_super().
+ */
+int super_setup_bdi_name(struct super_block *sb, char *fmt, ...)
+{
+ struct backing_dev_info *bdi;
+ int err;
+ va_list args;
+
+ bdi = bdi_alloc(GFP_KERNEL);
+ if (!bdi)
+ return -ENOMEM;
+
+ bdi->name = sb->s_type->name;
+
+ va_start(args, fmt);
+ err = bdi_register_va(bdi, fmt, args);
+ va_end(args);
+ if (err) {
+ bdi_put(bdi);
+ return err;
+ }
+ WARN_ON(sb->s_bdi != &noop_backing_dev_info);
+ sb->s_bdi = bdi;
+
+ return 0;
+}
+EXPORT_SYMBOL(super_setup_bdi_name);
+
+/*
+ * Setup private BDI for given superblock. I gets automatically cleaned up
+ * in generic_shutdown_super().
+ */
+int super_setup_bdi(struct super_block *sb)
+{
+ static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
+
+ return super_setup_bdi_name(sb, "%.28s-%ld", sb->s_type->name,
+ atomic_long_inc_return(&bdi_seq));
+}
+EXPORT_SYMBOL(super_setup_bdi);
+
+/*
* This is an internal function, please use sb_end_{write,pagefault,intwrite}
* instead.
*/
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index 21d36d284735..328e89c2cf83 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -266,7 +266,7 @@ static const struct super_operations tracefs_super_operations = {
static int trace_fill_super(struct super_block *sb, void *data, int silent)
{
- static struct tree_descr trace_files[] = {{""}};
+ static const struct tree_descr trace_files[] = {{""}};
struct tracefs_fs_info *fsi;
int err;
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 1e712a364680..718b749fa11a 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -32,6 +32,7 @@
#include <linux/math64.h>
#include <linux/uaccess.h>
#include <linux/random.h>
+#include <linux/ctype.h>
#include "ubifs.h"
static DEFINE_SPINLOCK(dbg_lock);
@@ -286,8 +287,10 @@ void ubifs_dump_inode(struct ubifs_info *c, const struct inode *inode)
break;
}
- pr_err("\t%d: %s (%s)\n",
- count++, dent->name, get_dent_type(dent->type));
+ pr_err("\t%d: inode %llu, type %s, len %d\n",
+ count++, (unsigned long long) le64_to_cpu(dent->inum),
+ get_dent_type(dent->type),
+ le16_to_cpu(dent->nlen));
fname_name(&nm) = dent->name;
fname_len(&nm) = le16_to_cpu(dent->nlen);
@@ -464,7 +467,8 @@ void ubifs_dump_node(const struct ubifs_info *c, const void *node)
pr_err("(bad name length, not printing, bad or corrupted node)");
else {
for (i = 0; i < nlen && dent->name[i]; i++)
- pr_cont("%c", dent->name[i]);
+ pr_cont("%c", isprint(dent->name[i]) ?
+ dent->name[i] : '?');
}
pr_cont("\n");
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 30825d882aa9..566079d9b402 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -121,7 +121,7 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir,
inode_init_owner(inode, dir, mode);
inode->i_mtime = inode->i_atime = inode->i_ctime =
- ubifs_current_time(inode);
+ current_time(inode);
inode->i_mapping->nrpages = 0;
switch (mode & S_IFMT) {
@@ -285,6 +285,15 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry,
goto out_dent;
}
+ if (ubifs_crypt_is_encrypted(dir) &&
+ (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) &&
+ !fscrypt_has_permitted_context(dir, inode)) {
+ ubifs_warn(c, "Inconsistent encryption contexts: %lu/%lu",
+ dir->i_ino, inode->i_ino);
+ err = -EPERM;
+ goto out_inode;
+ }
+
done:
kfree(dent);
fscrypt_free_filename(&nm);
@@ -295,6 +304,8 @@ done:
d_add(dentry, inode);
return NULL;
+out_inode:
+ iput(inode);
out_dent:
kfree(dent);
out_fname:
@@ -606,8 +617,8 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx)
}
while (1) {
- dbg_gen("feed '%s', ino %llu, new f_pos %#x",
- dent->name, (unsigned long long)le64_to_cpu(dent->inum),
+ dbg_gen("ino %llu, new f_pos %#x",
+ (unsigned long long)le64_to_cpu(dent->inum),
key_hash_flash(c, &dent->key));
ubifs_assert(le64_to_cpu(dent->ch.sqnum) >
ubifs_inode(dir)->creat_sqnum);
@@ -748,9 +759,14 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
goto out_fname;
lock_2_inodes(dir, inode);
+
+ /* Handle O_TMPFILE corner case, it is allowed to link a O_TMPFILE. */
+ if (inode->i_nlink == 0)
+ ubifs_delete_orphan(c, inode->i_ino);
+
inc_nlink(inode);
ihold(inode);
- inode->i_ctime = ubifs_current_time(inode);
+ inode->i_ctime = current_time(inode);
dir->i_size += sz_change;
dir_ui->ui_size = dir->i_size;
dir->i_mtime = dir->i_ctime = inode->i_ctime;
@@ -768,6 +784,8 @@ out_cancel:
dir->i_size -= sz_change;
dir_ui->ui_size = dir->i_size;
drop_nlink(inode);
+ if (inode->i_nlink == 0)
+ ubifs_add_orphan(c, inode->i_ino);
unlock_2_inodes(dir, inode);
ubifs_release_budget(c, &req);
iput(inode);
@@ -823,7 +841,7 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry)
}
lock_2_inodes(dir, inode);
- inode->i_ctime = ubifs_current_time(dir);
+ inode->i_ctime = current_time(dir);
drop_nlink(inode);
dir->i_size -= sz_change;
dir_ui->ui_size = dir->i_size;
@@ -927,7 +945,7 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry)
}
lock_2_inodes(dir, inode);
- inode->i_ctime = ubifs_current_time(dir);
+ inode->i_ctime = current_time(dir);
clear_nlink(inode);
drop_nlink(dir);
dir->i_size -= sz_change;
@@ -1068,8 +1086,10 @@ static int ubifs_mknod(struct inode *dir, struct dentry *dentry,
}
err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm);
- if (err)
+ if (err) {
+ kfree(dev);
goto out_budg;
+ }
sz_change = CALC_DENT_SIZE(fname_len(&nm));
@@ -1316,9 +1336,6 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry,
unsigned int uninitialized_var(saved_nlink);
struct fscrypt_name old_nm, new_nm;
- if (flags & ~RENAME_NOREPLACE)
- return -EINVAL;
-
/*
* Budget request settings: deletion direntry, new direntry, removing
* the old inode, and changing old and new parent directory inodes.
@@ -1405,7 +1422,7 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry,
* Like most other Unix systems, set the @i_ctime for inodes on a
* rename.
*/
- time = ubifs_current_time(old_dir);
+ time = current_time(old_dir);
old_inode->i_ctime = time;
/* We must adjust parent link count when renaming directories */
@@ -1578,7 +1595,7 @@ static int ubifs_xrename(struct inode *old_dir, struct dentry *old_dentry,
lock_4_inodes(old_dir, new_dir, NULL, NULL);
- time = ubifs_current_time(old_dir);
+ time = current_time(old_dir);
fst_inode->i_ctime = time;
snd_inode->i_ctime = time;
old_dir->i_mtime = old_dir->i_ctime = time;
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index d9ae86f96df7..2cda3d67e2d0 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1196,7 +1196,7 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
mutex_lock(&ui->ui_mutex);
ui->ui_size = inode->i_size;
/* Truncation changes inode [mc]time */
- inode->i_mtime = inode->i_ctime = ubifs_current_time(inode);
+ inode->i_mtime = inode->i_ctime = current_time(inode);
/* Other attributes may be changed at the same time as well */
do_attr_changes(inode, attr);
err = ubifs_jnl_truncate(c, inode, old_size, new_size);
@@ -1243,7 +1243,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode,
mutex_lock(&ui->ui_mutex);
if (attr->ia_valid & ATTR_SIZE) {
/* Truncation changes inode [mc]time */
- inode->i_mtime = inode->i_ctime = ubifs_current_time(inode);
+ inode->i_mtime = inode->i_ctime = current_time(inode);
/* 'truncate_setsize()' changed @i_size, update @ui_size */
ui->ui_size = inode->i_size;
}
@@ -1420,7 +1420,7 @@ int ubifs_update_time(struct inode *inode, struct timespec *time,
*/
static int update_mctime(struct inode *inode)
{
- struct timespec now = ubifs_current_time(inode);
+ struct timespec now = current_time(inode);
struct ubifs_inode *ui = ubifs_inode(inode);
struct ubifs_info *c = inode->i_sb->s_fs_info;
@@ -1434,7 +1434,7 @@ static int update_mctime(struct inode *inode)
return err;
mutex_lock(&ui->ui_mutex);
- inode->i_mtime = inode->i_ctime = ubifs_current_time(inode);
+ inode->i_mtime = inode->i_ctime = current_time(inode);
release = ui->dirty;
mark_inode_dirty_sync(inode);
mutex_unlock(&ui->ui_mutex);
@@ -1511,7 +1511,7 @@ static int ubifs_vm_page_mkwrite(struct vm_fault *vmf)
struct page *page = vmf->page;
struct inode *inode = file_inode(vmf->vma->vm_file);
struct ubifs_info *c = inode->i_sb->s_fs_info;
- struct timespec now = ubifs_current_time(inode);
+ struct timespec now = current_time(inode);
struct ubifs_budget_req req = { .new_page = 1 };
int err, update_time;
@@ -1579,7 +1579,7 @@ static int ubifs_vm_page_mkwrite(struct vm_fault *vmf)
struct ubifs_inode *ui = ubifs_inode(inode);
mutex_lock(&ui->ui_mutex);
- inode->i_mtime = inode->i_ctime = ubifs_current_time(inode);
+ inode->i_mtime = inode->i_ctime = current_time(inode);
release = ui->dirty;
mark_inode_dirty_sync(inode);
mutex_unlock(&ui->ui_mutex);
diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c
index da519ba205f6..12b9eb5005ff 100644
--- a/fs/ubifs/ioctl.c
+++ b/fs/ubifs/ioctl.c
@@ -126,7 +126,7 @@ static int setflags(struct inode *inode, int flags)
ui->flags = ioctl2ubifs(flags);
ubifs_set_inode_flags(inode);
- inode->i_ctime = ubifs_current_time(inode);
+ inode->i_ctime = current_time(inode);
release = ui->dirty;
mark_inode_dirty_sync(inode);
mutex_unlock(&ui->ui_mutex);
diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h
index 8ece6ca58c0b..caf83d68fb38 100644
--- a/fs/ubifs/misc.h
+++ b/fs/ubifs/misc.h
@@ -225,16 +225,6 @@ static inline void *ubifs_idx_key(const struct ubifs_info *c,
}
/**
- * ubifs_current_time - round current time to time granularity.
- * @inode: inode
- */
-static inline struct timespec ubifs_current_time(struct inode *inode)
-{
- return (inode->i_sb->s_time_gran < NSEC_PER_SEC) ?
- current_fs_time(inode->i_sb) : CURRENT_TIME_SEC;
-}
-
-/**
* ubifs_tnc_lookup - look up a file-system node.
* @c: UBIFS file-system description object
* @key: node key to lookup
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index 7f1ead29e727..8c25081a5109 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -84,6 +84,8 @@ static int create_default_filesystem(struct ubifs_info *c)
int min_leb_cnt = UBIFS_MIN_LEB_CNT;
long long tmp64, main_bytes;
__le64 tmp_le64;
+ __le32 tmp_le32;
+ struct timespec ts;
/* Some functions called from here depend on the @c->key_len filed */
c->key_len = UBIFS_SK_LEN;
@@ -298,13 +300,17 @@ static int create_default_filesystem(struct ubifs_info *c)
ino->ch.node_type = UBIFS_INO_NODE;
ino->creat_sqnum = cpu_to_le64(++c->max_sqnum);
ino->nlink = cpu_to_le32(2);
- tmp_le64 = cpu_to_le64(CURRENT_TIME_SEC.tv_sec);
+
+ ktime_get_real_ts(&ts);
+ ts = timespec_trunc(ts, DEFAULT_TIME_GRAN);
+ tmp_le64 = cpu_to_le64(ts.tv_sec);
ino->atime_sec = tmp_le64;
ino->ctime_sec = tmp_le64;
ino->mtime_sec = tmp_le64;
- ino->atime_nsec = 0;
- ino->ctime_nsec = 0;
- ino->mtime_nsec = 0;
+ tmp_le32 = cpu_to_le32(ts.tv_nsec);
+ ino->atime_nsec = tmp_le32;
+ ino->ctime_nsec = tmp_le32;
+ ino->mtime_nsec = tmp_le32;
ino->mode = cpu_to_le32(S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO);
ino->size = cpu_to_le64(UBIFS_INO_NODE_SZ);
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index b73811bd7676..cf4cc99b75b5 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1827,7 +1827,6 @@ static void ubifs_put_super(struct super_block *sb)
}
ubifs_umount(c);
- bdi_destroy(&c->bdi);
ubi_close_volume(c->ubi);
mutex_unlock(&c->umount_mutex);
}
@@ -2019,29 +2018,25 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
goto out;
}
+ err = ubifs_parse_options(c, data, 0);
+ if (err)
+ goto out_close;
+
/*
* UBIFS provides 'backing_dev_info' in order to disable read-ahead. For
* UBIFS, I/O is not deferred, it is done immediately in readpage,
* which means the user would have to wait not just for their own I/O
* but the read-ahead I/O as well i.e. completely pointless.
*
- * Read-ahead will be disabled because @c->bdi.ra_pages is 0.
+ * Read-ahead will be disabled because @sb->s_bdi->ra_pages is 0. Also
+ * @sb->s_bdi->capabilities are initialized to 0 so there won't be any
+ * writeback happening.
*/
- c->bdi.name = "ubifs",
- c->bdi.capabilities = 0;
- err = bdi_init(&c->bdi);
+ err = super_setup_bdi_name(sb, "ubifs_%d_%d", c->vi.ubi_num,
+ c->vi.vol_id);
if (err)
goto out_close;
- err = bdi_register(&c->bdi, NULL, "ubifs_%d_%d",
- c->vi.ubi_num, c->vi.vol_id);
- if (err)
- goto out_bdi;
-
- err = ubifs_parse_options(c, data, 0);
- if (err)
- goto out_bdi;
- sb->s_bdi = &c->bdi;
sb->s_fs_info = c;
sb->s_magic = UBIFS_SUPER_MAGIC;
sb->s_blocksize = UBIFS_BLOCK_SIZE;
@@ -2080,8 +2075,6 @@ out_umount:
ubifs_umount(c);
out_unlock:
mutex_unlock(&c->umount_mutex);
-out_bdi:
- bdi_destroy(&c->bdi);
out_close:
ubi_close_volume(c->ubi);
out:
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 4d57e488038e..4da10a6d702a 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -972,7 +972,6 @@ struct ubifs_debug_info;
* struct ubifs_info - UBIFS file-system description data structure
* (per-superblock).
* @vfs_sb: VFS @struct super_block object
- * @bdi: backing device info object to make VFS happy and disable read-ahead
*
* @highest_inum: highest used inode number
* @max_sqnum: current global sequence number
@@ -1220,7 +1219,6 @@ struct ubifs_debug_info;
*/
struct ubifs_info {
struct super_block *vfs_sb;
- struct backing_dev_info bdi;
ino_t highest_inum;
unsigned long long max_sqnum;
@@ -1461,7 +1459,6 @@ extern const struct inode_operations ubifs_file_inode_operations;
extern const struct file_operations ubifs_dir_operations;
extern const struct inode_operations ubifs_dir_inode_operations;
extern const struct inode_operations ubifs_symlink_inode_operations;
-extern struct backing_dev_info ubifs_backing_dev_info;
extern struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT];
/* io.c */
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index efe00fcb8b75..3e53fdbf7997 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -152,7 +152,7 @@ static int create_xattr(struct ubifs_info *c, struct inode *host,
ui->data_len = size;
mutex_lock(&host_ui->ui_mutex);
- host->i_ctime = ubifs_current_time(host);
+ host->i_ctime = current_time(host);
host_ui->xattr_cnt += 1;
host_ui->xattr_size += CALC_DENT_SIZE(fname_len(nm));
host_ui->xattr_size += CALC_XATTR_BYTES(size);
@@ -234,7 +234,7 @@ static int change_xattr(struct ubifs_info *c, struct inode *host,
mutex_unlock(&ui->ui_mutex);
mutex_lock(&host_ui->ui_mutex);
- host->i_ctime = ubifs_current_time(host);
+ host->i_ctime = current_time(host);
host_ui->xattr_size -= CALC_XATTR_BYTES(old_size);
host_ui->xattr_size += CALC_XATTR_BYTES(size);
@@ -488,7 +488,7 @@ static int remove_xattr(struct ubifs_info *c, struct inode *host,
return err;
mutex_lock(&host_ui->ui_mutex);
- host->i_ctime = ubifs_current_time(host);
+ host->i_ctime = current_time(host);
host_ui->xattr_cnt -= 1;
host_ui->xattr_size -= CALC_DENT_SIZE(fname_len(nm));
host_ui->xattr_size -= CALC_XATTR_BYTES(ui->data_len);
diff --git a/fs/udf/file.c b/fs/udf/file.c
index e04cc0cdca9d..f5eb2d5b3bac 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -44,12 +44,12 @@ static void __udf_adinicb_readpage(struct page *page)
char *kaddr;
struct udf_inode_info *iinfo = UDF_I(inode);
- kaddr = kmap(page);
+ kaddr = kmap_atomic(page);
memcpy(kaddr, iinfo->i_ext.i_data + iinfo->i_lenEAttr, inode->i_size);
memset(kaddr + inode->i_size, 0, PAGE_SIZE - inode->i_size);
flush_dcache_page(page);
SetPageUptodate(page);
- kunmap(page);
+ kunmap_atomic(kaddr);
}
static int udf_adinicb_readpage(struct file *file, struct page *page)
@@ -70,11 +70,11 @@ static int udf_adinicb_writepage(struct page *page,
BUG_ON(!PageLocked(page));
- kaddr = kmap(page);
+ kaddr = kmap_atomic(page);
memcpy(iinfo->i_ext.i_data + iinfo->i_lenEAttr, kaddr, inode->i_size);
- mark_inode_dirty(inode);
SetPageUptodate(page);
- kunmap(page);
+ kunmap_atomic(kaddr);
+ mark_inode_dirty(inode);
unlock_page(page);
return 0;
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index a8d8f71ef8bd..98c510e17203 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -276,14 +276,14 @@ int udf_expand_file_adinicb(struct inode *inode)
return -ENOMEM;
if (!PageUptodate(page)) {
- kaddr = kmap(page);
+ kaddr = kmap_atomic(page);
memset(kaddr + iinfo->i_lenAlloc, 0x00,
PAGE_SIZE - iinfo->i_lenAlloc);
memcpy(kaddr, iinfo->i_ext.i_data + iinfo->i_lenEAttr,
iinfo->i_lenAlloc);
flush_dcache_page(page);
SetPageUptodate(page);
- kunmap(page);
+ kunmap_atomic(kaddr);
}
down_write(&iinfo->i_data_sem);
memset(iinfo->i_ext.i_data + iinfo->i_lenEAttr, 0x00,
@@ -300,11 +300,11 @@ int udf_expand_file_adinicb(struct inode *inode)
if (err) {
/* Restore everything back so that we don't lose data... */
lock_page(page);
- kaddr = kmap(page);
down_write(&iinfo->i_data_sem);
+ kaddr = kmap_atomic(page);
memcpy(iinfo->i_ext.i_data + iinfo->i_lenEAttr, kaddr,
inode->i_size);
- kunmap(page);
+ kunmap_atomic(kaddr);
unlock_page(page);
iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB;
inode->i_data.a_ops = &udf_adinicb_aops;
@@ -1535,7 +1535,7 @@ reread:
inode->i_data.a_ops = &udf_symlink_aops;
inode->i_op = &udf_symlink_inode_operations;
inode_nohighmem(inode);
- inode->i_mode = S_IFLNK | S_IRWXUGO;
+ inode->i_mode = S_IFLNK | 0777;
break;
case ICBTAG_FILE_TYPE_MAIN:
udf_debug("METADATA FILE-----\n");
@@ -1591,9 +1591,9 @@ static umode_t udf_convert_permissions(struct fileEntry *fe)
permissions = le32_to_cpu(fe->permissions);
flags = le16_to_cpu(fe->icbTag.flags);
- mode = ((permissions) & S_IRWXO) |
- ((permissions >> 2) & S_IRWXG) |
- ((permissions >> 4) & S_IRWXU) |
+ mode = ((permissions) & 0007) |
+ ((permissions >> 2) & 0070) |
+ ((permissions >> 4) & 0700) |
((flags & ICBTAG_FLAG_SETUID) ? S_ISUID : 0) |
((flags & ICBTAG_FLAG_SETGID) ? S_ISGID : 0) |
((flags & ICBTAG_FLAG_STICKY) ? S_ISVTX : 0);
@@ -1669,9 +1669,9 @@ static int udf_update_inode(struct inode *inode, int do_sync)
else
fe->gid = cpu_to_le32(i_gid_read(inode));
- udfperms = ((inode->i_mode & S_IRWXO)) |
- ((inode->i_mode & S_IRWXG) << 2) |
- ((inode->i_mode & S_IRWXU) << 4);
+ udfperms = ((inode->i_mode & 0007)) |
+ ((inode->i_mode & 0070) << 2) |
+ ((inode->i_mode & 0700) << 4);
udfperms |= (le32_to_cpu(fe->permissions) &
(FE_PERM_O_DELETE | FE_PERM_O_CHATTR |
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index babf48d0e553..385ee89d5824 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -906,7 +906,7 @@ out:
static int udf_symlink(struct inode *dir, struct dentry *dentry,
const char *symname)
{
- struct inode *inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO);
+ struct inode *inode = udf_new_inode(dir, S_IFLNK | 0777);
struct pathComponent *pc;
const char *compstart;
struct extent_position epos = {};
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index 9774555b3721..d1dd8cc33179 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -176,6 +176,7 @@ struct inode *ufs_new_inode(struct inode *dir, umode_t mode)
struct ufs_cg_private_info * ucpi;
struct ufs_cylinder_group * ucg;
struct inode * inode;
+ struct timespec64 ts;
unsigned cg, bit, i, j, start;
struct ufs_inode_info *ufsi;
int err = -ENOSPC;
@@ -323,8 +324,9 @@ cg_found:
lock_buffer(bh);
ufs2_inode = (struct ufs2_inode *)bh->b_data;
ufs2_inode += ufs_inotofsbo(inode->i_ino);
- ufs2_inode->ui_birthtime = cpu_to_fs64(sb, CURRENT_TIME.tv_sec);
- ufs2_inode->ui_birthnsec = cpu_to_fs32(sb, CURRENT_TIME.tv_nsec);
+ ktime_get_real_ts64(&ts);
+ ufs2_inode->ui_birthtime = cpu_to_fs64(sb, ts.tv_sec);
+ ufs2_inode->ui_birthnsec = cpu_to_fs32(sb, ts.tv_nsec);
mark_buffer_dirty(bh);
unlock_buffer(bh);
if (sb->s_flags & MS_SYNCHRONOUS)
diff --git a/fs/utimes.c b/fs/utimes.c
index 32b15b3f6629..6571d8c848a0 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -1,14 +1,10 @@
-#include <linux/compiler.h>
#include <linux/file.h>
-#include <linux/fs.h>
-#include <linux/linkage.h>
#include <linux/mount.h>
#include <linux/namei.h>
-#include <linux/sched.h>
-#include <linux/stat.h>
#include <linux/utime.h>
#include <linux/syscalls.h>
#include <linux/uaccess.h>
+#include <linux/compat.h>
#include <asm/unistd.h>
#ifdef __ARCH_WANT_SYS_UTIME
@@ -219,3 +215,63 @@ SYSCALL_DEFINE2(utimes, char __user *, filename,
{
return sys_futimesat(AT_FDCWD, filename, utimes);
}
+
+#ifdef CONFIG_COMPAT
+/*
+ * Not all architectures have sys_utime, so implement this in terms
+ * of sys_utimes.
+ */
+COMPAT_SYSCALL_DEFINE2(utime, const char __user *, filename,
+ struct compat_utimbuf __user *, t)
+{
+ struct timespec tv[2];
+
+ if (t) {
+ if (get_user(tv[0].tv_sec, &t->actime) ||
+ get_user(tv[1].tv_sec, &t->modtime))
+ return -EFAULT;
+ tv[0].tv_nsec = 0;
+ tv[1].tv_nsec = 0;
+ }
+ return do_utimes(AT_FDCWD, filename, t ? tv : NULL, 0);
+}
+
+COMPAT_SYSCALL_DEFINE4(utimensat, unsigned int, dfd, const char __user *, filename, struct compat_timespec __user *, t, int, flags)
+{
+ struct timespec tv[2];
+
+ if (t) {
+ if (compat_get_timespec(&tv[0], &t[0]) ||
+ compat_get_timespec(&tv[1], &t[1]))
+ return -EFAULT;
+
+ if (tv[0].tv_nsec == UTIME_OMIT && tv[1].tv_nsec == UTIME_OMIT)
+ return 0;
+ }
+ return do_utimes(dfd, filename, t ? tv : NULL, flags);
+}
+
+COMPAT_SYSCALL_DEFINE3(futimesat, unsigned int, dfd, const char __user *, filename, struct compat_timeval __user *, t)
+{
+ struct timespec tv[2];
+
+ if (t) {
+ if (get_user(tv[0].tv_sec, &t[0].tv_sec) ||
+ get_user(tv[0].tv_nsec, &t[0].tv_usec) ||
+ get_user(tv[1].tv_sec, &t[1].tv_sec) ||
+ get_user(tv[1].tv_nsec, &t[1].tv_usec))
+ return -EFAULT;
+ if (tv[0].tv_nsec >= 1000000 || tv[0].tv_nsec < 0 ||
+ tv[1].tv_nsec >= 1000000 || tv[1].tv_nsec < 0)
+ return -EINVAL;
+ tv[0].tv_nsec *= 1000;
+ tv[1].tv_nsec *= 1000;
+ }
+ return do_utimes(dfd, filename, t ? tv : NULL, 0);
+}
+
+COMPAT_SYSCALL_DEFINE2(utimes, const char __user *, filename, struct compat_timeval __user *, t)
+{
+ return compat_sys_futimesat(AT_FDCWD, filename, t);
+}
+#endif
diff --git a/fs/xattr.c b/fs/xattr.c
index 7e3317cf4045..464c94bf65f9 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -431,12 +431,9 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value,
if (size) {
if (size > XATTR_SIZE_MAX)
return -E2BIG;
- kvalue = kmalloc(size, GFP_KERNEL | __GFP_NOWARN);
- if (!kvalue) {
- kvalue = vmalloc(size);
- if (!kvalue)
- return -ENOMEM;
- }
+ kvalue = kvmalloc(size, GFP_KERNEL);
+ if (!kvalue)
+ return -ENOMEM;
if (copy_from_user(kvalue, value, size)) {
error = -EFAULT;
goto out;
@@ -528,12 +525,9 @@ getxattr(struct dentry *d, const char __user *name, void __user *value,
if (size) {
if (size > XATTR_SIZE_MAX)
size = XATTR_SIZE_MAX;
- kvalue = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
- if (!kvalue) {
- kvalue = vmalloc(size);
- if (!kvalue)
- return -ENOMEM;
- }
+ kvalue = kvzalloc(size, GFP_KERNEL);
+ if (!kvalue)
+ return -ENOMEM;
}
error = vfs_getxattr(d, kname, kvalue, size);
@@ -611,12 +605,9 @@ listxattr(struct dentry *d, char __user *list, size_t size)
if (size) {
if (size > XATTR_LIST_MAX)
size = XATTR_LIST_MAX;
- klist = kmalloc(size, __GFP_NOWARN | GFP_KERNEL);
- if (!klist) {
- klist = vmalloc(size);
- if (!klist)
- return -ENOMEM;
- }
+ klist = kvmalloc(size, GFP_KERNEL);
+ if (!klist)
+ return -ENOMEM;
}
error = vfs_listxattr(d, klist, size);
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 26ef1958b65b..5c90f82b8f6b 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -79,6 +79,7 @@ xfs-y += xfs_aops.o \
xfs_extent_busy.o \
xfs_file.o \
xfs_filestream.o \
+ xfs_fsmap.o \
xfs_fsops.o \
xfs_globals.o \
xfs_icache.o \
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index 70a5b55e0870..393b6849aeb3 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -48,7 +48,7 @@ kmem_alloc(size_t size, xfs_km_flags_t flags)
void *
kmem_zalloc_large(size_t size, xfs_km_flags_t flags)
{
- unsigned noio_flag = 0;
+ unsigned nofs_flag = 0;
void *ptr;
gfp_t lflags;
@@ -60,17 +60,17 @@ kmem_zalloc_large(size_t size, xfs_km_flags_t flags)
* __vmalloc() will allocate data pages and auxillary structures (e.g.
* pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context
* here. Hence we need to tell memory reclaim that we are in such a
- * context via PF_MEMALLOC_NOIO to prevent memory reclaim re-entering
+ * context via PF_MEMALLOC_NOFS to prevent memory reclaim re-entering
* the filesystem here and potentially deadlocking.
*/
- if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
- noio_flag = memalloc_noio_save();
+ if (flags & KM_NOFS)
+ nofs_flag = memalloc_nofs_save();
lflags = kmem_flags_convert(flags);
- ptr = __vmalloc(size, lflags | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL);
+ ptr = __vmalloc(size, lflags | __GFP_ZERO, PAGE_KERNEL);
- if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
- memalloc_noio_restore(noio_flag);
+ if (flags & KM_NOFS)
+ memalloc_nofs_restore(nofs_flag);
return ptr;
}
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
index f0fc84fcaac2..d6ea520162b2 100644
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -50,7 +50,7 @@ kmem_flags_convert(xfs_km_flags_t flags)
lflags = GFP_ATOMIC | __GFP_NOWARN;
} else {
lflags = GFP_KERNEL | __GFP_NOWARN;
- if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
+ if (flags & KM_NOFS)
lflags &= ~__GFP_FS;
}
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 369adcc18c02..7486401ccbd3 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -2868,3 +2868,60 @@ err:
xfs_trans_brelse(tp, agbp);
return error;
}
+
+struct xfs_alloc_query_range_info {
+ xfs_alloc_query_range_fn fn;
+ void *priv;
+};
+
+/* Format btree record and pass to our callback. */
+STATIC int
+xfs_alloc_query_range_helper(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec,
+ void *priv)
+{
+ struct xfs_alloc_query_range_info *query = priv;
+ struct xfs_alloc_rec_incore irec;
+
+ irec.ar_startblock = be32_to_cpu(rec->alloc.ar_startblock);
+ irec.ar_blockcount = be32_to_cpu(rec->alloc.ar_blockcount);
+ return query->fn(cur, &irec, query->priv);
+}
+
+/* Find all free space within a given range of blocks. */
+int
+xfs_alloc_query_range(
+ struct xfs_btree_cur *cur,
+ struct xfs_alloc_rec_incore *low_rec,
+ struct xfs_alloc_rec_incore *high_rec,
+ xfs_alloc_query_range_fn fn,
+ void *priv)
+{
+ union xfs_btree_irec low_brec;
+ union xfs_btree_irec high_brec;
+ struct xfs_alloc_query_range_info query;
+
+ ASSERT(cur->bc_btnum == XFS_BTNUM_BNO);
+ low_brec.a = *low_rec;
+ high_brec.a = *high_rec;
+ query.priv = priv;
+ query.fn = fn;
+ return xfs_btree_query_range(cur, &low_brec, &high_brec,
+ xfs_alloc_query_range_helper, &query);
+}
+
+/* Find all free space records. */
+int
+xfs_alloc_query_all(
+ struct xfs_btree_cur *cur,
+ xfs_alloc_query_range_fn fn,
+ void *priv)
+{
+ struct xfs_alloc_query_range_info query;
+
+ ASSERT(cur->bc_btnum == XFS_BTNUM_BNO);
+ query.priv = priv;
+ query.fn = fn;
+ return xfs_btree_query_all(cur, xfs_alloc_query_range_helper, &query);
+}
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index 2a8d0fa6fbbe..77d9c27330ab 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -219,4 +219,16 @@ int xfs_free_extent_fix_freelist(struct xfs_trans *tp, xfs_agnumber_t agno,
xfs_extlen_t xfs_prealloc_blocks(struct xfs_mount *mp);
+typedef int (*xfs_alloc_query_range_fn)(
+ struct xfs_btree_cur *cur,
+ struct xfs_alloc_rec_incore *rec,
+ void *priv);
+
+int xfs_alloc_query_range(struct xfs_btree_cur *cur,
+ struct xfs_alloc_rec_incore *low_rec,
+ struct xfs_alloc_rec_incore *high_rec,
+ xfs_alloc_query_range_fn fn, void *priv);
+int xfs_alloc_query_all(struct xfs_btree_cur *cur, xfs_alloc_query_range_fn fn,
+ void *priv);
+
#endif /* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index efb467b10a71..e1fcfe7f0a9a 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -205,19 +205,37 @@ xfs_allocbt_init_key_from_rec(
union xfs_btree_key *key,
union xfs_btree_rec *rec)
{
- ASSERT(rec->alloc.ar_startblock != 0);
-
key->alloc.ar_startblock = rec->alloc.ar_startblock;
key->alloc.ar_blockcount = rec->alloc.ar_blockcount;
}
STATIC void
+xfs_bnobt_init_high_key_from_rec(
+ union xfs_btree_key *key,
+ union xfs_btree_rec *rec)
+{
+ __u32 x;
+
+ x = be32_to_cpu(rec->alloc.ar_startblock);
+ x += be32_to_cpu(rec->alloc.ar_blockcount) - 1;
+ key->alloc.ar_startblock = cpu_to_be32(x);
+ key->alloc.ar_blockcount = 0;
+}
+
+STATIC void
+xfs_cntbt_init_high_key_from_rec(
+ union xfs_btree_key *key,
+ union xfs_btree_rec *rec)
+{
+ key->alloc.ar_blockcount = rec->alloc.ar_blockcount;
+ key->alloc.ar_startblock = 0;
+}
+
+STATIC void
xfs_allocbt_init_rec_from_cur(
struct xfs_btree_cur *cur,
union xfs_btree_rec *rec)
{
- ASSERT(cur->bc_rec.a.ar_startblock != 0);
-
rec->alloc.ar_startblock = cpu_to_be32(cur->bc_rec.a.ar_startblock);
rec->alloc.ar_blockcount = cpu_to_be32(cur->bc_rec.a.ar_blockcount);
}
@@ -236,18 +254,24 @@ xfs_allocbt_init_ptr_from_cur(
}
STATIC __int64_t
-xfs_allocbt_key_diff(
+xfs_bnobt_key_diff(
struct xfs_btree_cur *cur,
union xfs_btree_key *key)
{
xfs_alloc_rec_incore_t *rec = &cur->bc_rec.a;
xfs_alloc_key_t *kp = &key->alloc;
- __int64_t diff;
- if (cur->bc_btnum == XFS_BTNUM_BNO) {
- return (__int64_t)be32_to_cpu(kp->ar_startblock) -
- rec->ar_startblock;
- }
+ return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock;
+}
+
+STATIC __int64_t
+xfs_cntbt_key_diff(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *key)
+{
+ xfs_alloc_rec_incore_t *rec = &cur->bc_rec.a;
+ xfs_alloc_key_t *kp = &key->alloc;
+ __int64_t diff;
diff = (__int64_t)be32_to_cpu(kp->ar_blockcount) - rec->ar_blockcount;
if (diff)
@@ -256,6 +280,33 @@ xfs_allocbt_key_diff(
return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock;
}
+STATIC __int64_t
+xfs_bnobt_diff_two_keys(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *k1,
+ union xfs_btree_key *k2)
+{
+ return (__int64_t)be32_to_cpu(k1->alloc.ar_startblock) -
+ be32_to_cpu(k2->alloc.ar_startblock);
+}
+
+STATIC __int64_t
+xfs_cntbt_diff_two_keys(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *k1,
+ union xfs_btree_key *k2)
+{
+ __int64_t diff;
+
+ diff = be32_to_cpu(k1->alloc.ar_blockcount) -
+ be32_to_cpu(k2->alloc.ar_blockcount);
+ if (diff)
+ return diff;
+
+ return be32_to_cpu(k1->alloc.ar_startblock) -
+ be32_to_cpu(k2->alloc.ar_startblock);
+}
+
static bool
xfs_allocbt_verify(
struct xfs_buf *bp)
@@ -346,44 +397,54 @@ const struct xfs_buf_ops xfs_allocbt_buf_ops = {
#if defined(DEBUG) || defined(XFS_WARN)
STATIC int
-xfs_allocbt_keys_inorder(
+xfs_bnobt_keys_inorder(
struct xfs_btree_cur *cur,
union xfs_btree_key *k1,
union xfs_btree_key *k2)
{
- if (cur->bc_btnum == XFS_BTNUM_BNO) {
- return be32_to_cpu(k1->alloc.ar_startblock) <
- be32_to_cpu(k2->alloc.ar_startblock);
- } else {
- return be32_to_cpu(k1->alloc.ar_blockcount) <
- be32_to_cpu(k2->alloc.ar_blockcount) ||
- (k1->alloc.ar_blockcount == k2->alloc.ar_blockcount &&
- be32_to_cpu(k1->alloc.ar_startblock) <
- be32_to_cpu(k2->alloc.ar_startblock));
- }
+ return be32_to_cpu(k1->alloc.ar_startblock) <
+ be32_to_cpu(k2->alloc.ar_startblock);
}
STATIC int
-xfs_allocbt_recs_inorder(
+xfs_bnobt_recs_inorder(
struct xfs_btree_cur *cur,
union xfs_btree_rec *r1,
union xfs_btree_rec *r2)
{
- if (cur->bc_btnum == XFS_BTNUM_BNO) {
- return be32_to_cpu(r1->alloc.ar_startblock) +
- be32_to_cpu(r1->alloc.ar_blockcount) <=
- be32_to_cpu(r2->alloc.ar_startblock);
- } else {
- return be32_to_cpu(r1->alloc.ar_blockcount) <
- be32_to_cpu(r2->alloc.ar_blockcount) ||
- (r1->alloc.ar_blockcount == r2->alloc.ar_blockcount &&
- be32_to_cpu(r1->alloc.ar_startblock) <
- be32_to_cpu(r2->alloc.ar_startblock));
- }
+ return be32_to_cpu(r1->alloc.ar_startblock) +
+ be32_to_cpu(r1->alloc.ar_blockcount) <=
+ be32_to_cpu(r2->alloc.ar_startblock);
+}
+
+STATIC int
+xfs_cntbt_keys_inorder(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *k1,
+ union xfs_btree_key *k2)
+{
+ return be32_to_cpu(k1->alloc.ar_blockcount) <
+ be32_to_cpu(k2->alloc.ar_blockcount) ||
+ (k1->alloc.ar_blockcount == k2->alloc.ar_blockcount &&
+ be32_to_cpu(k1->alloc.ar_startblock) <
+ be32_to_cpu(k2->alloc.ar_startblock));
}
-#endif /* DEBUG */
-static const struct xfs_btree_ops xfs_allocbt_ops = {
+STATIC int
+xfs_cntbt_recs_inorder(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *r1,
+ union xfs_btree_rec *r2)
+{
+ return be32_to_cpu(r1->alloc.ar_blockcount) <
+ be32_to_cpu(r2->alloc.ar_blockcount) ||
+ (r1->alloc.ar_blockcount == r2->alloc.ar_blockcount &&
+ be32_to_cpu(r1->alloc.ar_startblock) <
+ be32_to_cpu(r2->alloc.ar_startblock));
+}
+#endif /* DEBUG */
+
+static const struct xfs_btree_ops xfs_bnobt_ops = {
.rec_len = sizeof(xfs_alloc_rec_t),
.key_len = sizeof(xfs_alloc_key_t),
@@ -395,13 +456,39 @@ static const struct xfs_btree_ops xfs_allocbt_ops = {
.get_minrecs = xfs_allocbt_get_minrecs,
.get_maxrecs = xfs_allocbt_get_maxrecs,
.init_key_from_rec = xfs_allocbt_init_key_from_rec,
+ .init_high_key_from_rec = xfs_bnobt_init_high_key_from_rec,
.init_rec_from_cur = xfs_allocbt_init_rec_from_cur,
.init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur,
- .key_diff = xfs_allocbt_key_diff,
+ .key_diff = xfs_bnobt_key_diff,
.buf_ops = &xfs_allocbt_buf_ops,
+ .diff_two_keys = xfs_bnobt_diff_two_keys,
#if defined(DEBUG) || defined(XFS_WARN)
- .keys_inorder = xfs_allocbt_keys_inorder,
- .recs_inorder = xfs_allocbt_recs_inorder,
+ .keys_inorder = xfs_bnobt_keys_inorder,
+ .recs_inorder = xfs_bnobt_recs_inorder,
+#endif
+};
+
+static const struct xfs_btree_ops xfs_cntbt_ops = {
+ .rec_len = sizeof(xfs_alloc_rec_t),
+ .key_len = sizeof(xfs_alloc_key_t),
+
+ .dup_cursor = xfs_allocbt_dup_cursor,
+ .set_root = xfs_allocbt_set_root,
+ .alloc_block = xfs_allocbt_alloc_block,
+ .free_block = xfs_allocbt_free_block,
+ .update_lastrec = xfs_allocbt_update_lastrec,
+ .get_minrecs = xfs_allocbt_get_minrecs,
+ .get_maxrecs = xfs_allocbt_get_maxrecs,
+ .init_key_from_rec = xfs_allocbt_init_key_from_rec,
+ .init_high_key_from_rec = xfs_cntbt_init_high_key_from_rec,
+ .init_rec_from_cur = xfs_allocbt_init_rec_from_cur,
+ .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur,
+ .key_diff = xfs_cntbt_key_diff,
+ .buf_ops = &xfs_allocbt_buf_ops,
+ .diff_two_keys = xfs_cntbt_diff_two_keys,
+#if defined(DEBUG) || defined(XFS_WARN)
+ .keys_inorder = xfs_cntbt_keys_inorder,
+ .recs_inorder = xfs_cntbt_recs_inorder,
#endif
};
@@ -427,16 +514,15 @@ xfs_allocbt_init_cursor(
cur->bc_mp = mp;
cur->bc_btnum = btnum;
cur->bc_blocklog = mp->m_sb.sb_blocklog;
- cur->bc_ops = &xfs_allocbt_ops;
- if (btnum == XFS_BTNUM_BNO)
- cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtb_2);
- else
- cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtc_2);
if (btnum == XFS_BTNUM_CNT) {
+ cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtc_2);
+ cur->bc_ops = &xfs_cntbt_ops;
cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]);
cur->bc_flags = XFS_BTREE_LASTREC_UPDATE;
} else {
+ cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtb_2);
+ cur->bc_ops = &xfs_bnobt_ops;
cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]);
}
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 9bd104f32908..f02eb7673392 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -764,7 +764,6 @@ xfs_bmap_extents_to_btree(
args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino);
} else if (dfops->dop_low) {
args.type = XFS_ALLOCTYPE_START_BNO;
-try_another_ag:
args.fsbno = *firstblock;
} else {
args.type = XFS_ALLOCTYPE_NEAR_BNO;
@@ -779,20 +778,6 @@ try_another_ag:
return error;
}
- /*
- * During a CoW operation, the allocation and bmbt updates occur in
- * different transactions. The mapping code tries to put new bmbt
- * blocks near extents being mapped, but the only way to guarantee this
- * is if the alloc and the mapping happen in a single transaction that
- * has a block reservation. That isn't the case here, so if we run out
- * of space we'll try again with another AG.
- */
- if (xfs_sb_version_hasreflink(&cur->bc_mp->m_sb) &&
- args.fsbno == NULLFSBLOCK &&
- args.type == XFS_ALLOCTYPE_NEAR_BNO) {
- args.type = XFS_ALLOCTYPE_FIRST_AG;
- goto try_another_ag;
- }
if (WARN_ON_ONCE(args.fsbno == NULLFSBLOCK)) {
xfs_iroot_realloc(ip, -1, whichfork);
xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
@@ -925,7 +910,6 @@ xfs_bmap_local_to_extents(
* file currently fits in an inode.
*/
if (*firstblock == NULLFSBLOCK) {
-try_another_ag:
args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino);
args.type = XFS_ALLOCTYPE_START_BNO;
} else {
@@ -938,19 +922,6 @@ try_another_ag:
if (error)
goto done;
- /*
- * During a CoW operation, the allocation and bmbt updates occur in
- * different transactions. The mapping code tries to put new bmbt
- * blocks near extents being mapped, but the only way to guarantee this
- * is if the alloc and the mapping happen in a single transaction that
- * has a block reservation. That isn't the case here, so if we run out
- * of space we'll try again with another AG.
- */
- if (xfs_sb_version_hasreflink(&ip->i_mount->m_sb) &&
- args.fsbno == NULLFSBLOCK &&
- args.type == XFS_ALLOCTYPE_NEAR_BNO) {
- goto try_another_ag;
- }
/* Can't fail, the space was reserved. */
ASSERT(args.fsbno != NULLFSBLOCK);
ASSERT(args.len == 1);
@@ -1260,7 +1231,6 @@ xfs_bmap_read_extents(
xfs_fsblock_t bno; /* block # of "block" */
xfs_buf_t *bp; /* buffer for "block" */
int error; /* error return value */
- xfs_exntfmt_t exntf; /* XFS_EXTFMT_NOSTATE, if checking */
xfs_extnum_t i, j; /* index into the extents list */
xfs_ifork_t *ifp; /* fork structure */
int level; /* btree level, for checking */
@@ -1271,8 +1241,6 @@ xfs_bmap_read_extents(
mp = ip->i_mount;
ifp = XFS_IFORK_PTR(ip, whichfork);
- exntf = (whichfork != XFS_DATA_FORK) ? XFS_EXTFMT_NOSTATE :
- XFS_EXTFMT_INODE(ip);
block = ifp->if_broot;
/*
* Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
@@ -1340,18 +1308,9 @@ xfs_bmap_read_extents(
xfs_bmbt_rec_host_t *trp = xfs_iext_get_ext(ifp, i);
trp->l0 = be64_to_cpu(frp->l0);
trp->l1 = be64_to_cpu(frp->l1);
- }
- if (exntf == XFS_EXTFMT_NOSTATE) {
- /*
- * Check all attribute bmap btree records and
- * any "older" data bmap btree records for a
- * set bit in the "extent flag" position.
- */
- if (unlikely(xfs_check_nostate_extents(ifp,
- start, num_recs))) {
+ if (!xfs_bmbt_validate_extent(mp, whichfork, trp)) {
XFS_ERROR_REPORT("xfs_bmap_read_extents(2)",
- XFS_ERRLEVEL_LOW,
- ip->i_mount);
+ XFS_ERRLEVEL_LOW, mp);
goto error0;
}
}
@@ -2879,27 +2838,30 @@ xfs_bmap_add_extent_hole_delay(
*/
STATIC int /* error */
xfs_bmap_add_extent_hole_real(
- struct xfs_bmalloca *bma,
- int whichfork)
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ int whichfork,
+ xfs_extnum_t *idx,
+ struct xfs_btree_cur **curp,
+ struct xfs_bmbt_irec *new,
+ xfs_fsblock_t *first,
+ struct xfs_defer_ops *dfops,
+ int *logflagsp)
{
- struct xfs_bmbt_irec *new = &bma->got;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_btree_cur *cur = *curp;
int error; /* error return value */
int i; /* temp state */
- xfs_ifork_t *ifp; /* inode fork pointer */
xfs_bmbt_irec_t left; /* left neighbor extent entry */
xfs_bmbt_irec_t right; /* right neighbor extent entry */
int rval=0; /* return value (logging flags) */
int state; /* state bits, accessed thru macros */
- struct xfs_mount *mp;
- mp = bma->ip->i_mount;
- ifp = XFS_IFORK_PTR(bma->ip, whichfork);
-
- ASSERT(bma->idx >= 0);
- ASSERT(bma->idx <= xfs_iext_count(ifp));
+ ASSERT(*idx >= 0);
+ ASSERT(*idx <= xfs_iext_count(ifp));
ASSERT(!isnullstartblock(new->br_startblock));
- ASSERT(!bma->cur ||
- !(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
+ ASSERT(!cur || !(cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
XFS_STATS_INC(mp, xs_add_exlist);
@@ -2912,9 +2874,9 @@ xfs_bmap_add_extent_hole_real(
/*
* Check and set flags if this segment has a left neighbor.
*/
- if (bma->idx > 0) {
+ if (*idx > 0) {
state |= BMAP_LEFT_VALID;
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), &left);
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left);
if (isnullstartblock(left.br_startblock))
state |= BMAP_LEFT_DELAY;
}
@@ -2923,9 +2885,9 @@ xfs_bmap_add_extent_hole_real(
* Check and set flags if this segment has a current value.
* Not true if we're inserting into the "hole" at eof.
*/
- if (bma->idx < xfs_iext_count(ifp)) {
+ if (*idx < xfs_iext_count(ifp)) {
state |= BMAP_RIGHT_VALID;
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &right);
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right);
if (isnullstartblock(right.br_startblock))
state |= BMAP_RIGHT_DELAY;
}
@@ -2962,36 +2924,36 @@ xfs_bmap_add_extent_hole_real(
* left and on the right.
* Merge all three into a single extent record.
*/
- --bma->idx;
- trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx),
+ --*idx;
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
left.br_blockcount + new->br_blockcount +
right.br_blockcount);
- trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
- xfs_iext_remove(bma->ip, bma->idx + 1, 1, state);
+ xfs_iext_remove(ip, *idx + 1, 1, state);
- XFS_IFORK_NEXT_SET(bma->ip, whichfork,
- XFS_IFORK_NEXTENTS(bma->ip, whichfork) - 1);
- if (bma->cur == NULL) {
+ XFS_IFORK_NEXT_SET(ip, whichfork,
+ XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
+ if (cur == NULL) {
rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
} else {
rval = XFS_ILOG_CORE;
- error = xfs_bmbt_lookup_eq(bma->cur, right.br_startoff,
+ error = xfs_bmbt_lookup_eq(cur, right.br_startoff,
right.br_startblock, right.br_blockcount,
&i);
if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
- error = xfs_btree_delete(bma->cur, &i);
+ error = xfs_btree_delete(cur, &i);
if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
- error = xfs_btree_decrement(bma->cur, 0, &i);
+ error = xfs_btree_decrement(cur, 0, &i);
if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
- error = xfs_bmbt_update(bma->cur, left.br_startoff,
+ error = xfs_bmbt_update(cur, left.br_startoff,
left.br_startblock,
left.br_blockcount +
new->br_blockcount +
@@ -3008,23 +2970,23 @@ xfs_bmap_add_extent_hole_real(
* on the left.
* Merge the new allocation with the left neighbor.
*/
- --bma->idx;
- trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx),
+ --*idx;
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
left.br_blockcount + new->br_blockcount);
- trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
- if (bma->cur == NULL) {
+ if (cur == NULL) {
rval = xfs_ilog_fext(whichfork);
} else {
rval = 0;
- error = xfs_bmbt_lookup_eq(bma->cur, left.br_startoff,
+ error = xfs_bmbt_lookup_eq(cur, left.br_startoff,
left.br_startblock, left.br_blockcount,
&i);
if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
- error = xfs_bmbt_update(bma->cur, left.br_startoff,
+ error = xfs_bmbt_update(cur, left.br_startoff,
left.br_startblock,
left.br_blockcount +
new->br_blockcount,
@@ -3040,25 +3002,25 @@ xfs_bmap_add_extent_hole_real(
* on the right.
* Merge the new allocation with the right neighbor.
*/
- trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
- xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, bma->idx),
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx),
new->br_startoff, new->br_startblock,
new->br_blockcount + right.br_blockcount,
right.br_state);
- trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
- if (bma->cur == NULL) {
+ if (cur == NULL) {
rval = xfs_ilog_fext(whichfork);
} else {
rval = 0;
- error = xfs_bmbt_lookup_eq(bma->cur,
+ error = xfs_bmbt_lookup_eq(cur,
right.br_startoff,
right.br_startblock,
right.br_blockcount, &i);
if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
- error = xfs_bmbt_update(bma->cur, new->br_startoff,
+ error = xfs_bmbt_update(cur, new->br_startoff,
new->br_startblock,
new->br_blockcount +
right.br_blockcount,
@@ -3074,22 +3036,22 @@ xfs_bmap_add_extent_hole_real(
* real allocation.
* Insert a new entry.
*/
- xfs_iext_insert(bma->ip, bma->idx, 1, new, state);
- XFS_IFORK_NEXT_SET(bma->ip, whichfork,
- XFS_IFORK_NEXTENTS(bma->ip, whichfork) + 1);
- if (bma->cur == NULL) {
+ xfs_iext_insert(ip, *idx, 1, new, state);
+ XFS_IFORK_NEXT_SET(ip, whichfork,
+ XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
+ if (cur == NULL) {
rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
} else {
rval = XFS_ILOG_CORE;
- error = xfs_bmbt_lookup_eq(bma->cur,
+ error = xfs_bmbt_lookup_eq(cur,
new->br_startoff,
new->br_startblock,
new->br_blockcount, &i);
if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
- bma->cur->bc_rec.b.br_state = new->br_state;
- error = xfs_btree_insert(bma->cur, &i);
+ cur->bc_rec.b.br_state = new->br_state;
+ error = xfs_btree_insert(cur, &i);
if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
@@ -3098,30 +3060,30 @@ xfs_bmap_add_extent_hole_real(
}
/* add reverse mapping */
- error = xfs_rmap_map_extent(mp, bma->dfops, bma->ip, whichfork, new);
+ error = xfs_rmap_map_extent(mp, dfops, ip, whichfork, new);
if (error)
goto done;
/* convert to a btree if necessary */
- if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
+ if (xfs_bmap_needs_btree(ip, whichfork)) {
int tmp_logflags; /* partial log flag return val */
- ASSERT(bma->cur == NULL);
- error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
- bma->firstblock, bma->dfops, &bma->cur,
+ ASSERT(cur == NULL);
+ error = xfs_bmap_extents_to_btree(tp, ip, first, dfops, curp,
0, &tmp_logflags, whichfork);
- bma->logflags |= tmp_logflags;
+ *logflagsp |= tmp_logflags;
+ cur = *curp;
if (error)
goto done;
}
/* clear out the allocated field, done with it now in any case. */
- if (bma->cur)
- bma->cur->bc_private.b.allocated = 0;
+ if (cur)
+ cur->bc_private.b.allocated = 0;
- xfs_bmap_check_leaf_extents(bma->cur, bma->ip, whichfork);
+ xfs_bmap_check_leaf_extents(cur, ip, whichfork);
done:
- bma->logflags |= rval;
+ *logflagsp |= rval;
return error;
}
@@ -3853,60 +3815,6 @@ xfs_bmap_btalloc(
}
/*
- * For a remap operation, just "allocate" an extent at the address that the
- * caller passed in, and ensure that the AGFL is the right size. The caller
- * will then map the "allocated" extent into the file somewhere.
- */
-STATIC int
-xfs_bmap_remap_alloc(
- struct xfs_bmalloca *ap)
-{
- struct xfs_trans *tp = ap->tp;
- struct xfs_mount *mp = tp->t_mountp;
- xfs_agblock_t bno;
- struct xfs_alloc_arg args;
- int error;
-
- /*
- * validate that the block number is legal - the enables us to detect
- * and handle a silent filesystem corruption rather than crashing.
- */
- memset(&args, 0, sizeof(struct xfs_alloc_arg));
- args.tp = ap->tp;
- args.mp = ap->tp->t_mountp;
- bno = *ap->firstblock;
- args.agno = XFS_FSB_TO_AGNO(mp, bno);
- args.agbno = XFS_FSB_TO_AGBNO(mp, bno);
- if (args.agno >= mp->m_sb.sb_agcount ||
- args.agbno >= mp->m_sb.sb_agblocks)
- return -EFSCORRUPTED;
-
- /* "Allocate" the extent from the range we passed in. */
- trace_xfs_bmap_remap_alloc(ap->ip, *ap->firstblock, ap->length);
- ap->blkno = bno;
- ap->ip->i_d.di_nblocks += ap->length;
- xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
-
- /* Fix the freelist, like a real allocator does. */
- args.datatype = ap->datatype;
- args.pag = xfs_perag_get(args.mp, args.agno);
- ASSERT(args.pag);
-
- /*
- * The freelist fixing code will decline the allocation if
- * the size and shape of the free space doesn't allow for
- * allocating the extent and updating all the metadata that
- * happens during an allocation. We're remapping, not
- * allocating, so skip that check by pretending to be freeing.
- */
- error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING);
- xfs_perag_put(args.pag);
- if (error)
- trace_xfs_bmap_remap_alloc_error(ap->ip, error, _RET_IP_);
- return error;
-}
-
-/*
* xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file.
* It figures out where to ask the underlying allocator to put the new extent.
*/
@@ -3914,8 +3822,6 @@ STATIC int
xfs_bmap_alloc(
struct xfs_bmalloca *ap) /* bmap alloc argument struct */
{
- if (ap->flags & XFS_BMAPI_REMAP)
- return xfs_bmap_remap_alloc(ap);
if (XFS_IS_REALTIME_INODE(ap->ip) &&
xfs_alloc_is_userdata(ap->datatype))
return xfs_bmap_rtalloc(ap);
@@ -4386,7 +4292,9 @@ xfs_bmapi_allocate(
if (bma->wasdel)
error = xfs_bmap_add_extent_delay_real(bma, whichfork);
else
- error = xfs_bmap_add_extent_hole_real(bma, whichfork);
+ error = xfs_bmap_add_extent_hole_real(bma->tp, bma->ip,
+ whichfork, &bma->idx, &bma->cur, &bma->got,
+ bma->firstblock, bma->dfops, &bma->logflags);
bma->logflags |= tmp_logflags;
if (error)
@@ -4549,9 +4457,7 @@ xfs_bmapi_write(
ASSERT(len > 0);
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- ASSERT(!(flags & XFS_BMAPI_REMAP) || whichfork == XFS_DATA_FORK);
- ASSERT(!(flags & XFS_BMAPI_PREALLOC) || !(flags & XFS_BMAPI_REMAP));
- ASSERT(!(flags & XFS_BMAPI_CONVERT) || !(flags & XFS_BMAPI_REMAP));
+ ASSERT(!(flags & XFS_BMAPI_REMAP));
/* zeroing is for currently only for data extents, not metadata */
ASSERT((flags & (XFS_BMAPI_METADATA | XFS_BMAPI_ZERO)) !=
@@ -4635,13 +4541,8 @@ xfs_bmapi_write(
} else {
need_alloc = true;
}
- } else {
- /*
- * Make sure we only reflink into a hole.
- */
- ASSERT(!(flags & XFS_BMAPI_REMAP));
- if (isnullstartblock(bma.got.br_startblock))
- wasdelay = true;
+ } else if (isnullstartblock(bma.got.br_startblock)) {
+ wasdelay = true;
}
/*
@@ -4770,6 +4671,93 @@ error0:
return error;
}
+static int
+xfs_bmapi_remap(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ xfs_fileoff_t bno,
+ xfs_filblks_t len,
+ xfs_fsblock_t startblock,
+ struct xfs_defer_ops *dfops)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+ struct xfs_btree_cur *cur = NULL;
+ xfs_fsblock_t firstblock = NULLFSBLOCK;
+ struct xfs_bmbt_irec got;
+ xfs_extnum_t idx;
+ int logflags = 0, error;
+
+ ASSERT(len > 0);
+ ASSERT(len <= (xfs_filblks_t)MAXEXTLEN);
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+ if (unlikely(XFS_TEST_ERROR(
+ (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS &&
+ XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE),
+ mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
+ XFS_ERROR_REPORT("xfs_bmapi_remap", XFS_ERRLEVEL_LOW, mp);
+ return -EFSCORRUPTED;
+ }
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -EIO;
+
+ if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+ error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
+ if (error)
+ return error;
+ }
+
+ if (xfs_iext_lookup_extent(ip, ifp, bno, &idx, &got)) {
+ /* make sure we only reflink into a hole. */
+ ASSERT(got.br_startoff > bno);
+ ASSERT(got.br_startoff - bno >= len);
+ }
+
+ ip->i_d.di_nblocks += len;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ if (ifp->if_flags & XFS_IFBROOT) {
+ cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK);
+ cur->bc_private.b.firstblock = firstblock;
+ cur->bc_private.b.dfops = dfops;
+ cur->bc_private.b.flags = 0;
+ }
+
+ got.br_startoff = bno;
+ got.br_startblock = startblock;
+ got.br_blockcount = len;
+ got.br_state = XFS_EXT_NORM;
+
+ error = xfs_bmap_add_extent_hole_real(tp, ip, XFS_DATA_FORK, &idx, &cur,
+ &got, &firstblock, dfops, &logflags);
+ if (error)
+ goto error0;
+
+ if (xfs_bmap_wants_extents(ip, XFS_DATA_FORK)) {
+ int tmp_logflags = 0;
+
+ error = xfs_bmap_btree_to_extents(tp, ip, cur,
+ &tmp_logflags, XFS_DATA_FORK);
+ logflags |= tmp_logflags;
+ }
+
+error0:
+ if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS)
+ logflags &= ~XFS_ILOG_DEXT;
+ else if (ip->i_d.di_format != XFS_DINODE_FMT_BTREE)
+ logflags &= ~XFS_ILOG_DBROOT;
+
+ if (logflags)
+ xfs_trans_log_inode(tp, ip, logflags);
+ if (cur) {
+ xfs_btree_del_cursor(cur,
+ error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+ }
+ return error;
+}
+
/*
* When a delalloc extent is split (e.g., due to a hole punch), the original
* indlen reservation must be shared across the two new extents that are left
@@ -4887,7 +4875,7 @@ xfs_bmap_del_extent_delay(
ASSERT(got_endoff >= del_endoff);
if (isrt) {
- int64_t rtexts = XFS_FSB_TO_B(mp, del->br_blockcount);
+ uint64_t rtexts = XFS_FSB_TO_B(mp, del->br_blockcount);
do_div(rtexts, mp->m_sb.sb_rextsize);
xfs_mod_frextents(mp, rtexts);
@@ -6488,27 +6476,15 @@ xfs_bmap_finish_one(
xfs_filblks_t blockcount,
xfs_exntst_t state)
{
- struct xfs_bmbt_irec bmap;
- int nimaps = 1;
- xfs_fsblock_t firstfsb;
- int flags = XFS_BMAPI_REMAP;
- int done;
- int error = 0;
-
- bmap.br_startblock = startblock;
- bmap.br_startoff = startoff;
- bmap.br_blockcount = blockcount;
- bmap.br_state = state;
+ int error = 0, done;
trace_xfs_bmap_deferred(tp->t_mountp,
XFS_FSB_TO_AGNO(tp->t_mountp, startblock), type,
XFS_FSB_TO_AGBNO(tp->t_mountp, startblock),
ip->i_ino, whichfork, startoff, blockcount, state);
- if (whichfork != XFS_DATA_FORK && whichfork != XFS_ATTR_FORK)
+ if (WARN_ON_ONCE(whichfork != XFS_DATA_FORK))
return -EFSCORRUPTED;
- if (whichfork == XFS_ATTR_FORK)
- flags |= XFS_BMAPI_ATTRFORK;
if (XFS_TEST_ERROR(false, tp->t_mountp,
XFS_ERRTAG_BMAP_FINISH_ONE,
@@ -6517,16 +6493,12 @@ xfs_bmap_finish_one(
switch (type) {
case XFS_BMAP_MAP:
- firstfsb = bmap.br_startblock;
- error = xfs_bmapi_write(tp, ip, bmap.br_startoff,
- bmap.br_blockcount, flags, &firstfsb,
- bmap.br_blockcount, &bmap, &nimaps,
- dfops);
+ error = xfs_bmapi_remap(tp, ip, startoff, blockcount,
+ startblock, dfops);
break;
case XFS_BMAP_UNMAP:
- error = xfs_bunmapi(tp, ip, bmap.br_startoff,
- bmap.br_blockcount, flags, 1, &firstfsb,
- dfops, &done);
+ error = xfs_bunmapi(tp, ip, startoff, blockcount,
+ XFS_BMAPI_REMAP, 1, &startblock, dfops, &done);
ASSERT(done);
break;
default:
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index cdef87db5262..c35a14fa1527 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -172,6 +172,18 @@ static inline int xfs_bmapi_whichfork(int bmapi_flags)
/*
+ * Return true if the extent is a real, allocated extent, or false if it is a
+ * delayed allocation, and unwritten extent or a hole.
+ */
+static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec)
+{
+ return irec->br_state != XFS_EXT_UNWRITTEN &&
+ irec->br_startblock != HOLESTARTBLOCK &&
+ irec->br_startblock != DELAYSTARTBLOCK &&
+ !isnullstartblock(irec->br_startblock);
+}
+
+/*
* This macro is used to determine how many extents will be shifted
* in one write transaction. We could require two splits,
* an extent move on the first and an extent merge on the second,
@@ -232,8 +244,6 @@ int xfs_bmap_del_extent_delay(struct xfs_inode *ip, int whichfork,
struct xfs_bmbt_irec *del);
void xfs_bmap_del_extent_cow(struct xfs_inode *ip, xfs_extnum_t *idx,
struct xfs_bmbt_irec *got, struct xfs_bmbt_irec *del);
-int xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx,
- xfs_extnum_t num);
uint xfs_default_attroffset(struct xfs_inode *ip);
int xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb,
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index fd55db479385..6cba69aff077 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -366,32 +366,6 @@ xfs_bmbt_to_bmdr(
memcpy(tpp, fpp, sizeof(*fpp) * dmxr);
}
-/*
- * Check extent records, which have just been read, for
- * any bit in the extent flag field. ASSERT on debug
- * kernels, as this condition should not occur.
- * Return an error condition (1) if any flags found,
- * otherwise return 0.
- */
-
-int
-xfs_check_nostate_extents(
- xfs_ifork_t *ifp,
- xfs_extnum_t idx,
- xfs_extnum_t num)
-{
- for (; num > 0; num--, idx++) {
- xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, idx);
- if ((ep->l0 >>
- (64 - BMBT_EXNTFLAG_BITLEN)) != 0) {
- ASSERT(0);
- return 1;
- }
- }
- return 0;
-}
-
-
STATIC struct xfs_btree_cur *
xfs_bmbt_dup_cursor(
struct xfs_btree_cur *cur)
@@ -448,7 +422,6 @@ xfs_bmbt_alloc_block(
if (args.fsbno == NULLFSBLOCK) {
args.fsbno = be64_to_cpu(start->l);
args.type = XFS_ALLOCTYPE_START_BNO;
-try_another_ag:
/*
* Make sure there is sufficient room left in the AG to
* complete a full tree split for an extent insert. If
@@ -477,22 +450,6 @@ try_another_ag:
if (error)
goto error0;
- /*
- * During a CoW operation, the allocation and bmbt updates occur in
- * different transactions. The mapping code tries to put new bmbt
- * blocks near extents being mapped, but the only way to guarantee this
- * is if the alloc and the mapping happen in a single transaction that
- * has a block reservation. That isn't the case here, so if we run out
- * of space we'll try again with another AG.
- */
- if (xfs_sb_version_hasreflink(&cur->bc_mp->m_sb) &&
- args.fsbno == NULLFSBLOCK &&
- args.type == XFS_ALLOCTYPE_NEAR_BNO) {
- args.fsbno = cur->bc_private.b.firstblock;
- args.type = XFS_ALLOCTYPE_FIRST_AG;
- goto try_another_ag;
- }
-
if (args.fsbno == NULLFSBLOCK && args.minleft) {
/*
* Could not find an AG with enough free space to satisfy
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h
index 819a8a4dee95..9da5a8d4f184 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.h
+++ b/fs/xfs/libxfs/xfs_bmap_btree.h
@@ -25,14 +25,6 @@ struct xfs_inode;
struct xfs_trans;
/*
- * Extent state and extent format macros.
- */
-#define XFS_EXTFMT_INODE(x) \
- (xfs_sb_version_hasextflgbit(&((x)->i_mount->m_sb)) ? \
- XFS_EXTFMT_HASSTATE : XFS_EXTFMT_NOSTATE)
-#define ISUNWRITTEN(x) ((x)->br_state == XFS_EXT_UNWRITTEN)
-
-/*
* Btree block header size depends on a superblock flag.
*/
#define XFS_BMBT_BLOCK_LEN(mp) \
@@ -140,4 +132,18 @@ extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip,
extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *,
struct xfs_trans *, struct xfs_inode *, int);
+/*
+ * Check that the extent does not contain an invalid unwritten extent flag.
+ */
+static inline bool xfs_bmbt_validate_extent(struct xfs_mount *mp, int whichfork,
+ struct xfs_bmbt_rec_host *ep)
+{
+ if (ep->l0 >> (64 - BMBT_EXNTFLAG_BITLEN) == 0)
+ return true;
+ if (whichfork == XFS_DATA_FORK &&
+ xfs_sb_version_hasextflgbit(&mp->m_sb))
+ return true;
+ return false;
+}
+
#endif /* __XFS_BMAP_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index c3decedc9455..5392674bf893 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -2886,7 +2886,7 @@ xfs_btree_split_worker(
struct xfs_btree_split_args *args = container_of(work,
struct xfs_btree_split_args, work);
unsigned long pflags;
- unsigned long new_pflags = PF_FSTRANS;
+ unsigned long new_pflags = PF_MEMALLOC_NOFS;
/*
* we are in a transaction context here, but may also be doing work
@@ -4842,6 +4842,21 @@ xfs_btree_query_range(
fn, priv);
}
+/* Query a btree for all records. */
+int
+xfs_btree_query_all(
+ struct xfs_btree_cur *cur,
+ xfs_btree_query_range_fn fn,
+ void *priv)
+{
+ union xfs_btree_irec low_rec;
+ union xfs_btree_irec high_rec;
+
+ memset(&low_rec, 0, sizeof(low_rec));
+ memset(&high_rec, 0xFF, sizeof(high_rec));
+ return xfs_btree_query_range(cur, &low_rec, &high_rec, fn, priv);
+}
+
/*
* Calculate the number of blocks needed to store a given number of records
* in a short-format (per-AG metadata) btree.
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index 4bb62580a7fd..27bed08261c5 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -496,6 +496,8 @@ typedef int (*xfs_btree_query_range_fn)(struct xfs_btree_cur *cur,
int xfs_btree_query_range(struct xfs_btree_cur *cur,
union xfs_btree_irec *low_rec, union xfs_btree_irec *high_rec,
xfs_btree_query_range_fn fn, void *priv);
+int xfs_btree_query_all(struct xfs_btree_cur *cur, xfs_btree_query_range_fn fn,
+ void *priv);
typedef int (*xfs_btree_visit_blocks_fn)(struct xfs_btree_cur *cur, int level,
void *data);
diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c
index ac9a003dd29a..747085b4ef44 100644
--- a/fs/xfs/libxfs/xfs_dquot_buf.c
+++ b/fs/xfs/libxfs/xfs_dquot_buf.c
@@ -35,13 +35,8 @@ int
xfs_calc_dquots_per_chunk(
unsigned int nbblks) /* basic block units */
{
- unsigned int ndquots;
-
ASSERT(nbblks > 0);
- ndquots = BBTOB(nbblks);
- do_div(ndquots, sizeof(xfs_dqblk_t));
-
- return ndquots;
+ return BBTOB(nbblks) / sizeof(xfs_dqblk_t);
}
/*
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 6b7579e7b60a..a1dccd8d96bc 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -930,10 +930,8 @@ static inline uint xfs_dinode_size(int version)
/*
* The 32 bit link count in the inode theoretically maxes out at UINT_MAX.
* Since the pathconf interface is signed, we use 2^31 - 1 instead.
- * The old inode format had a 16 bit link count, so its maximum is USHRT_MAX.
*/
#define XFS_MAXLINK ((1U << 31) - 1U)
-#define XFS_MAXLINK_1 65535U
/*
* Values for di_format
@@ -1578,19 +1576,10 @@ static inline xfs_filblks_t startblockval(xfs_fsblock_t x)
}
/*
- * Possible extent formats.
- */
-typedef enum {
- XFS_EXTFMT_NOSTATE = 0,
- XFS_EXTFMT_HASSTATE
-} xfs_exntfmt_t;
-
-/*
* Possible extent states.
*/
typedef enum {
XFS_EXT_NORM, XFS_EXT_UNWRITTEN,
- XFS_EXT_DMAPI_OFFLINE, XFS_EXT_INVALID
} xfs_exntst_t;
/*
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index b72dc821d78b..095bdf049a3f 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -92,6 +92,18 @@ struct getbmapx {
#define BMV_OF_LAST 0x4 /* segment is the last in the file */
#define BMV_OF_SHARED 0x8 /* segment shared with another file */
+/* fmr_owner special values for FS_IOC_GETFSMAP */
+#define XFS_FMR_OWN_FREE FMR_OWN_FREE /* free space */
+#define XFS_FMR_OWN_UNKNOWN FMR_OWN_UNKNOWN /* unknown owner */
+#define XFS_FMR_OWN_FS FMR_OWNER('X', 1) /* static fs metadata */
+#define XFS_FMR_OWN_LOG FMR_OWNER('X', 2) /* journalling log */
+#define XFS_FMR_OWN_AG FMR_OWNER('X', 3) /* per-AG metadata */
+#define XFS_FMR_OWN_INOBT FMR_OWNER('X', 4) /* inode btree blocks */
+#define XFS_FMR_OWN_INODES FMR_OWNER('X', 5) /* inodes */
+#define XFS_FMR_OWN_REFC FMR_OWNER('X', 6) /* refcount tree */
+#define XFS_FMR_OWN_COW FMR_OWNER('X', 7) /* cow staging */
+#define XFS_FMR_OWN_DEFECTIVE FMR_OWNER('X', 8) /* bad blocks */
+
/*
* Structure for XFS_IOC_FSSETDM.
* For use by backup and restore programs to set the XFS on-disk inode
@@ -502,6 +514,7 @@ typedef struct xfs_swapext
#define XFS_IOC_GETBMAPX _IOWR('X', 56, struct getbmap)
#define XFS_IOC_ZERO_RANGE _IOW ('X', 57, struct xfs_flock64)
#define XFS_IOC_FREE_EOFBLOCKS _IOR ('X', 58, struct xfs_fs_eofblocks)
+/* XFS_IOC_GETFSMAP ------ hoisted 59 */
/*
* ioctl commands that replace IRIX syssgi()'s
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index d93f9d918cfc..09c3d1aecef2 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -508,7 +508,7 @@ xfs_iread(
/* even unallocated inodes are verified */
if (!xfs_dinode_verify(mp, ip->i_ino, dip)) {
- xfs_alert(mp, "%s: validation failed for inode %lld failed",
+ xfs_alert(mp, "%s: validation failed for inode %lld",
__func__, ip->i_ino);
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, dip);
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 8a37efe04de3..0e80f34fe97c 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -42,35 +42,6 @@ STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int);
STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int);
STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int);
-#ifdef DEBUG
-/*
- * Make sure that the extents in the given memory buffer
- * are valid.
- */
-void
-xfs_validate_extents(
- xfs_ifork_t *ifp,
- int nrecs,
- xfs_exntfmt_t fmt)
-{
- xfs_bmbt_irec_t irec;
- xfs_bmbt_rec_host_t rec;
- int i;
-
- for (i = 0; i < nrecs; i++) {
- xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
- rec.l0 = get_unaligned(&ep->l0);
- rec.l1 = get_unaligned(&ep->l1);
- xfs_bmbt_get_all(&rec, &irec);
- if (fmt == XFS_EXTFMT_NOSTATE)
- ASSERT(irec.br_state == XFS_EXT_NORM);
- }
-}
-#else /* DEBUG */
-#define xfs_validate_extents(ifp, nrecs, fmt)
-#endif /* DEBUG */
-
-
/*
* Move inode type and inode format specific information from the
* on-disk inode to the in-core inode. For fifos, devs, and sockets
@@ -352,40 +323,33 @@ xfs_iformat_local(
}
/*
- * The file consists of a set of extents all
- * of which fit into the on-disk inode.
- * If there are few enough extents to fit into
- * the if_inline_ext, then copy them there.
- * Otherwise allocate a buffer for them and copy
- * them into it. Either way, set if_extents
- * to point at the extents.
+ * The file consists of a set of extents all of which fit into the on-disk
+ * inode. If there are few enough extents to fit into the if_inline_ext, then
+ * copy them there. Otherwise allocate a buffer for them and copy them into it.
+ * Either way, set if_extents to point at the extents.
*/
STATIC int
xfs_iformat_extents(
- xfs_inode_t *ip,
- xfs_dinode_t *dip,
- int whichfork)
+ struct xfs_inode *ip,
+ struct xfs_dinode *dip,
+ int whichfork)
{
- xfs_bmbt_rec_t *dp;
- xfs_ifork_t *ifp;
- int nex;
- int size;
- int i;
-
- ifp = XFS_IFORK_PTR(ip, whichfork);
- nex = XFS_DFORK_NEXTENTS(dip, whichfork);
- size = nex * (uint)sizeof(xfs_bmbt_rec_t);
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ int nex = XFS_DFORK_NEXTENTS(dip, whichfork);
+ int size = nex * sizeof(xfs_bmbt_rec_t);
+ struct xfs_bmbt_rec *dp;
+ int i;
/*
- * If the number of extents is unreasonable, then something
- * is wrong and we just bail out rather than crash in
- * kmem_alloc() or memcpy() below.
+ * If the number of extents is unreasonable, then something is wrong and
+ * we just bail out rather than crash in kmem_alloc() or memcpy() below.
*/
- if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
+ if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, mp, whichfork))) {
xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).",
(unsigned long long) ip->i_ino, nex);
XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW,
- ip->i_mount, dip);
+ mp, dip);
return -EFSCORRUPTED;
}
@@ -400,22 +364,17 @@ xfs_iformat_extents(
ifp->if_bytes = size;
if (size) {
dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork);
- xfs_validate_extents(ifp, nex, XFS_EXTFMT_INODE(ip));
for (i = 0; i < nex; i++, dp++) {
xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
ep->l0 = get_unaligned_be64(&dp->l0);
ep->l1 = get_unaligned_be64(&dp->l1);
+ if (!xfs_bmbt_validate_extent(mp, whichfork, ep)) {
+ XFS_ERROR_REPORT("xfs_iformat_extents(2)",
+ XFS_ERRLEVEL_LOW, mp);
+ return -EFSCORRUPTED;
+ }
}
XFS_BMAP_TRACE_EXLIST(ip, nex, whichfork);
- if (whichfork != XFS_DATA_FORK ||
- XFS_EXTFMT_INODE(ip) == XFS_EXTFMT_NOSTATE)
- if (unlikely(xfs_check_nostate_extents(
- ifp, 0, nex))) {
- XFS_ERROR_REPORT("xfs_iformat_extents(2)",
- XFS_ERRLEVEL_LOW,
- ip->i_mount);
- return -EFSCORRUPTED;
- }
}
ifp->if_flags |= XFS_IFEXTENTS;
return 0;
@@ -518,7 +477,6 @@ xfs_iread_extents(
xfs_iext_destroy(ifp);
return error;
}
- xfs_validate_extents(ifp, nextents, XFS_EXTFMT_INODE(ip));
ifp->if_flags |= XFS_IFEXTENTS;
return 0;
}
@@ -837,6 +795,9 @@ xfs_iextents_copy(
copied = 0;
for (i = 0; i < nrecs; i++) {
xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
+
+ ASSERT(xfs_bmbt_validate_extent(ip->i_mount, whichfork, ep));
+
start_block = xfs_bmbt_get_startblock(ep);
if (isnullstartblock(start_block)) {
/*
@@ -852,7 +813,6 @@ xfs_iextents_copy(
copied++;
}
ASSERT(copied != 0);
- xfs_validate_extents(ifp, copied, XFS_EXTFMT_INODE(ip));
return (copied * (uint)sizeof(xfs_bmbt_rec_t));
}
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index 3a8cc7139912..06cfb93c2ef9 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -2001,14 +2001,14 @@ xfs_rmap_query_range_helper(
/* Find all rmaps between two keys. */
int
xfs_rmap_query_range(
- struct xfs_btree_cur *cur,
- struct xfs_rmap_irec *low_rec,
- struct xfs_rmap_irec *high_rec,
- xfs_rmap_query_range_fn fn,
- void *priv)
+ struct xfs_btree_cur *cur,
+ struct xfs_rmap_irec *low_rec,
+ struct xfs_rmap_irec *high_rec,
+ xfs_rmap_query_range_fn fn,
+ void *priv)
{
- union xfs_btree_irec low_brec;
- union xfs_btree_irec high_brec;
+ union xfs_btree_irec low_brec;
+ union xfs_btree_irec high_brec;
struct xfs_rmap_query_range_info query;
low_brec.r = *low_rec;
@@ -2019,6 +2019,20 @@ xfs_rmap_query_range(
xfs_rmap_query_range_helper, &query);
}
+/* Find all rmaps. */
+int
+xfs_rmap_query_all(
+ struct xfs_btree_cur *cur,
+ xfs_rmap_query_range_fn fn,
+ void *priv)
+{
+ struct xfs_rmap_query_range_info query;
+
+ query.priv = priv;
+ query.fn = fn;
+ return xfs_btree_query_all(cur, xfs_rmap_query_range_helper, &query);
+}
+
/* Clean up after calling xfs_rmap_finish_one. */
void
xfs_rmap_finish_one_cleanup(
@@ -2291,3 +2305,31 @@ xfs_rmap_free_extent(
return __xfs_rmap_add(mp, dfops, XFS_RMAP_FREE, owner,
XFS_DATA_FORK, &bmap);
}
+
+/* Compare rmap records. Returns -1 if a < b, 1 if a > b, and 0 if equal. */
+int
+xfs_rmap_compare(
+ const struct xfs_rmap_irec *a,
+ const struct xfs_rmap_irec *b)
+{
+ __u64 oa;
+ __u64 ob;
+
+ oa = xfs_rmap_irec_offset_pack(a);
+ ob = xfs_rmap_irec_offset_pack(b);
+
+ if (a->rm_startblock < b->rm_startblock)
+ return -1;
+ else if (a->rm_startblock > b->rm_startblock)
+ return 1;
+ else if (a->rm_owner < b->rm_owner)
+ return -1;
+ else if (a->rm_owner > b->rm_owner)
+ return 1;
+ else if (oa < ob)
+ return -1;
+ else if (oa > ob)
+ return 1;
+ else
+ return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h
index 789930599339..98f908fea103 100644
--- a/fs/xfs/libxfs/xfs_rmap.h
+++ b/fs/xfs/libxfs/xfs_rmap.h
@@ -162,6 +162,8 @@ typedef int (*xfs_rmap_query_range_fn)(
int xfs_rmap_query_range(struct xfs_btree_cur *cur,
struct xfs_rmap_irec *low_rec, struct xfs_rmap_irec *high_rec,
xfs_rmap_query_range_fn fn, void *priv);
+int xfs_rmap_query_all(struct xfs_btree_cur *cur, xfs_rmap_query_range_fn fn,
+ void *priv);
enum xfs_rmap_intent_type {
XFS_RMAP_MAP,
@@ -212,5 +214,7 @@ int xfs_rmap_find_left_neighbor(struct xfs_btree_cur *cur, xfs_agblock_t bno,
int xfs_rmap_lookup_le_range(struct xfs_btree_cur *cur, xfs_agblock_t bno,
uint64_t owner, uint64_t offset, unsigned int flags,
struct xfs_rmap_irec *irec, int *stat);
+int xfs_rmap_compare(const struct xfs_rmap_irec *a,
+ const struct xfs_rmap_irec *b);
#endif /* __XFS_RMAP_H__ */
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index ea45584a9913..e47b99e59f60 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -1016,3 +1016,73 @@ xfs_rtfree_extent(
}
return 0;
}
+
+/* Find all the free records within a given range. */
+int
+xfs_rtalloc_query_range(
+ struct xfs_trans *tp,
+ struct xfs_rtalloc_rec *low_rec,
+ struct xfs_rtalloc_rec *high_rec,
+ xfs_rtalloc_query_range_fn fn,
+ void *priv)
+{
+ struct xfs_rtalloc_rec rec;
+ struct xfs_mount *mp = tp->t_mountp;
+ xfs_rtblock_t rtstart;
+ xfs_rtblock_t rtend;
+ xfs_rtblock_t rem;
+ int is_free;
+ int error = 0;
+
+ if (low_rec->ar_startblock > high_rec->ar_startblock)
+ return -EINVAL;
+ else if (low_rec->ar_startblock == high_rec->ar_startblock)
+ return 0;
+
+ /* Iterate the bitmap, looking for discrepancies. */
+ rtstart = low_rec->ar_startblock;
+ rem = high_rec->ar_startblock - rtstart;
+ while (rem) {
+ /* Is the first block free? */
+ error = xfs_rtcheck_range(mp, tp, rtstart, 1, 1, &rtend,
+ &is_free);
+ if (error)
+ break;
+
+ /* How long does the extent go for? */
+ error = xfs_rtfind_forw(mp, tp, rtstart,
+ high_rec->ar_startblock - 1, &rtend);
+ if (error)
+ break;
+
+ if (is_free) {
+ rec.ar_startblock = rtstart;
+ rec.ar_blockcount = rtend - rtstart + 1;
+
+ error = fn(tp, &rec, priv);
+ if (error)
+ break;
+ }
+
+ rem -= rtend - rtstart + 1;
+ rtstart = rtend + 1;
+ }
+
+ return error;
+}
+
+/* Find all the free records. */
+int
+xfs_rtalloc_query_all(
+ struct xfs_trans *tp,
+ xfs_rtalloc_query_range_fn fn,
+ void *priv)
+{
+ struct xfs_rtalloc_rec keys[2];
+
+ keys[0].ar_startblock = 0;
+ keys[1].ar_startblock = tp->t_mountp->m_sb.sb_rblocks;
+ keys[0].ar_blockcount = keys[1].ar_blockcount = 0;
+
+ return xfs_rtalloc_query_range(tp, &keys[0], &keys[1], fn, priv);
+}
diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h
index 7917f6e44286..d787c677d2a3 100644
--- a/fs/xfs/libxfs/xfs_trans_space.h
+++ b/fs/xfs/libxfs/xfs_trans_space.h
@@ -21,8 +21,20 @@
/*
* Components of space reservations.
*/
+
+/* Worst case number of rmaps that can be held in a block. */
#define XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) \
(((mp)->m_rmap_mxr[0]) - ((mp)->m_rmap_mnr[0]))
+
+/* Adding one rmap could split every level up to the top of the tree. */
+#define XFS_RMAPADD_SPACE_RES(mp) ((mp)->m_rmap_maxlevels)
+
+/* Blocks we might need to add "b" rmaps to a tree. */
+#define XFS_NRMAPADD_SPACE_RES(mp, b)\
+ (((b + XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) - 1) / \
+ XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp)) * \
+ XFS_RMAPADD_SPACE_RES(mp))
+
#define XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) \
(((mp)->m_alloc_mxr[0]) - ((mp)->m_alloc_mnr[0]))
#define XFS_EXTENTADD_SPACE_RES(mp,w) (XFS_BM_MAXLEVELS(mp,w) - 1)
@@ -30,13 +42,12 @@
(((b + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) - 1) / \
XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)) * \
XFS_EXTENTADD_SPACE_RES(mp,w))
+
+/* Blocks we might need to add "b" mappings & rmappings to a file. */
#define XFS_SWAP_RMAP_SPACE_RES(mp,b,w)\
- (((b + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) - 1) / \
- XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)) * \
- XFS_EXTENTADD_SPACE_RES(mp,w) + \
- ((b + XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) - 1) / \
- XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp)) * \
- (mp)->m_rmap_maxlevels)
+ (XFS_NEXTENTADD_SPACE_RES((mp), (b), (w)) + \
+ XFS_NRMAPADD_SPACE_RES((mp), (b)))
+
#define XFS_DAENTER_1B(mp,w) \
((w) == XFS_DATA_FORK ? (mp)->m_dir_geo->fsbcount : 1)
#define XFS_DAENTER_DBS(mp,w) \
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 61494295d92f..09af0f7cd55e 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -111,11 +111,11 @@ xfs_finish_page_writeback(
bsize = bh->b_size;
do {
+ if (off > end)
+ break;
next = bh->b_this_page;
if (off < bvec->bv_offset)
goto next_bh;
- if (off > end)
- break;
bh->b_end_io(bh, !error);
next_bh:
off += bsize;
@@ -189,7 +189,7 @@ xfs_setfilesize_trans_alloc(
* We hand off the transaction to the completion thread now, so
* clear the flag here.
*/
- current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+ current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
return 0;
}
@@ -252,7 +252,7 @@ xfs_setfilesize_ioend(
* thus we need to mark ourselves as being in a transaction manually.
* Similarly for freeze protection.
*/
- current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
+ current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
__sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS);
/* we abort the update if there was an IO error */
@@ -1016,7 +1016,7 @@ xfs_do_writepage(
* Given that we do not allow direct reclaim to call us, we should
* never be called while in a filesystem transaction.
*/
- if (WARN_ON_ONCE(current->flags & PF_FSTRANS))
+ if (WARN_ON_ONCE(current->flags & PF_MEMALLOC_NOFS))
goto redirty;
/*
@@ -1261,8 +1261,8 @@ xfs_get_blocks(
if (nimaps) {
trace_xfs_get_blocks_found(ip, offset, size,
- ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
- : XFS_IO_OVERWRITE, &imap);
+ imap.br_state == XFS_EXT_UNWRITTEN ?
+ XFS_IO_UNWRITTEN : XFS_IO_OVERWRITE, &imap);
xfs_iunlock(ip, lockmode);
} else {
trace_xfs_get_blocks_notfound(ip, offset, size);
@@ -1276,9 +1276,7 @@ xfs_get_blocks(
* For unwritten extents do not report a disk address in the buffered
* read case (treat as if we're reading into a hole).
*/
- if (imap.br_startblock != HOLESTARTBLOCK &&
- imap.br_startblock != DELAYSTARTBLOCK &&
- !ISUNWRITTEN(&imap))
+ if (xfs_bmap_is_real_extent(&imap))
xfs_map_buffer(inode, bh_result, &imap, offset);
/*
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 9bf57c76623b..d419d23fa214 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -34,6 +34,8 @@
#include "xfs_bmap.h"
#include "xfs_icache.h"
#include "xfs_trace.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_trans_space.h"
kmem_zone_t *xfs_bui_zone;
@@ -215,6 +217,7 @@ void
xfs_bui_release(
struct xfs_bui_log_item *buip)
{
+ ASSERT(atomic_read(&buip->bui_refcount) > 0);
if (atomic_dec_and_test(&buip->bui_refcount)) {
xfs_trans_ail_remove(&buip->bui_item, SHUTDOWN_LOG_IO_ERROR);
xfs_bui_item_free(buip);
@@ -446,7 +449,8 @@ xfs_bui_recover(
return -EIO;
}
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate,
+ XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), 0, 0, &tp);
if (error)
return error;
budp = xfs_trans_get_bud(tp, buip);
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 828532ce0adc..2b954308a1d6 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -81,7 +81,7 @@ xfs_zero_extent(
return blkdev_issue_zeroout(xfs_find_bdev_for_inode(VFS_I(ip)),
block << (mp->m_super->s_blocksize_bits - 9),
count_fsb << (mp->m_super->s_blocksize_bits - 9),
- GFP_NOFS, true);
+ GFP_NOFS, 0);
}
int
@@ -448,10 +448,9 @@ xfs_getbmap_adjust_shared(
next_map->br_blockcount = 0;
/* Only written data blocks can be shared. */
- if (!xfs_is_reflink_inode(ip) || whichfork != XFS_DATA_FORK ||
- map->br_startblock == DELAYSTARTBLOCK ||
- map->br_startblock == HOLESTARTBLOCK ||
- ISUNWRITTEN(map))
+ if (!xfs_is_reflink_inode(ip) ||
+ whichfork != XFS_DATA_FORK ||
+ !xfs_bmap_is_real_extent(map))
return 0;
agno = XFS_FSB_TO_AGNO(mp, map->br_startblock);
@@ -904,9 +903,9 @@ xfs_can_free_eofblocks(struct xfs_inode *ip, bool force)
}
/*
- * This is called by xfs_inactive to free any blocks beyond eof
- * when the link count isn't zero and by xfs_dm_punch_hole() when
- * punching a hole to EOF.
+ * This is called to free any blocks beyond eof. The caller must hold
+ * IOLOCK_EXCL unless we are in the inode reclaim path and have the only
+ * reference to the inode.
*/
int
xfs_free_eofblocks(
@@ -921,8 +920,6 @@ xfs_free_eofblocks(
struct xfs_bmbt_irec imap;
struct xfs_mount *mp = ip->i_mount;
- ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
-
/*
* Figure out if there are any blocks beyond the end
* of the file. If not, then there is nothing to do.
@@ -1209,11 +1206,8 @@ xfs_adjust_extent_unmap_boundaries(
return error;
if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
- xfs_daddr_t block;
-
ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
- block = imap.br_startblock;
- mod = do_div(block, mp->m_sb.sb_rextsize);
+ mod = do_mod(imap.br_startblock, mp->m_sb.sb_rextsize);
if (mod)
*startoffset_fsb += mp->m_sb.sb_rextsize - mod;
}
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index b6208728ba39..62fa39276a24 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -443,17 +443,17 @@ _xfs_buf_map_pages(
bp->b_addr = NULL;
} else {
int retried = 0;
- unsigned noio_flag;
+ unsigned nofs_flag;
/*
* vm_map_ram() will allocate auxillary structures (e.g.
* pagetables) with GFP_KERNEL, yet we are likely to be under
* GFP_NOFS context here. Hence we need to tell memory reclaim
- * that we are in such a context via PF_MEMALLOC_NOIO to prevent
+ * that we are in such a context via PF_MEMALLOC_NOFS to prevent
* memory reclaim re-entering the filesystem here and
* potentially deadlocking.
*/
- noio_flag = memalloc_noio_save();
+ nofs_flag = memalloc_nofs_save();
do {
bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
-1, PAGE_KERNEL);
@@ -461,7 +461,7 @@ _xfs_buf_map_pages(
break;
vm_unmap_aliases();
} while (retried++ <= 1);
- memalloc_noio_restore(noio_flag);
+ memalloc_nofs_restore(nofs_flag);
if (!bp->b_addr)
return -ENOMEM;
@@ -1079,6 +1079,8 @@ void
xfs_buf_unlock(
struct xfs_buf *bp)
{
+ ASSERT(xfs_buf_islocked(bp));
+
XB_CLEAR_OWNER(bp);
up(&bp->b_sema);
@@ -1815,6 +1817,28 @@ error:
}
/*
+ * Cancel a delayed write list.
+ *
+ * Remove each buffer from the list, clear the delwri queue flag and drop the
+ * associated buffer reference.
+ */
+void
+xfs_buf_delwri_cancel(
+ struct list_head *list)
+{
+ struct xfs_buf *bp;
+
+ while (!list_empty(list)) {
+ bp = list_first_entry(list, struct xfs_buf, b_list);
+
+ xfs_buf_lock(bp);
+ bp->b_flags &= ~_XBF_DELWRI_Q;
+ list_del_init(&bp->b_list);
+ xfs_buf_relse(bp);
+ }
+}
+
+/*
* Add a buffer to the delayed write list.
*
* This queues a buffer for writeout if it hasn't already been. Note that
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 3c867e5a63e1..8d1d44f87ce9 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -291,7 +291,6 @@ xfs_buf_readahead(
return xfs_buf_readahead_map(target, &map, 1, ops);
}
-struct xfs_buf *xfs_buf_get_empty(struct xfs_buftarg *target, size_t numblks);
void xfs_buf_set_empty(struct xfs_buf *bp, size_t numblks);
int xfs_buf_associate_memory(struct xfs_buf *bp, void *mem, size_t length);
@@ -330,6 +329,7 @@ extern void *xfs_buf_offset(struct xfs_buf *, size_t);
extern void xfs_buf_stale(struct xfs_buf *bp);
/* Delayed Write Buffer Routines */
+extern void xfs_buf_delwri_cancel(struct list_head *);
extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *);
extern int xfs_buf_delwri_submit(struct list_head *);
extern int xfs_buf_delwri_submit_nowait(struct list_head *);
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index ad9396e516f6..20b7a5c6eb2f 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -394,6 +394,7 @@ xfs_dir2_leaf_readbuf(
/*
* Do we need more readahead?
+ * Each loop tries to process 1 full dir blk; last may be partial.
*/
blk_start_plug(&plug);
for (mip->ra_index = mip->ra_offset = i = 0;
@@ -404,7 +405,8 @@ xfs_dir2_leaf_readbuf(
* Read-ahead a contiguous directory block.
*/
if (i > mip->ra_current &&
- map[mip->ra_index].br_blockcount >= geo->fsbcount) {
+ (map[mip->ra_index].br_blockcount - mip->ra_offset) >=
+ geo->fsbcount) {
xfs_dir3_data_readahead(dp,
map[mip->ra_index].br_startoff + mip->ra_offset,
XFS_FSB_TO_DADDR(dp->i_mount,
@@ -425,14 +427,19 @@ xfs_dir2_leaf_readbuf(
}
/*
- * Advance offset through the mapping table.
+ * Advance offset through the mapping table, processing a full
+ * dir block even if it is fragmented into several extents.
+ * But stop if we have consumed all valid mappings, even if
+ * it's not yet a full directory block.
*/
- for (j = 0; j < geo->fsbcount; j += length ) {
+ for (j = 0;
+ j < geo->fsbcount && mip->ra_index < mip->map_valid;
+ j += length ) {
/*
* The rest of this extent but not more than a dir
* block.
*/
- length = min_t(int, geo->fsbcount,
+ length = min_t(int, geo->fsbcount - j,
map[mip->ra_index].br_blockcount -
mip->ra_offset);
mip->ra_offset += length;
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index d796ffac7296..6a05d278da64 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -132,6 +132,11 @@ next_extent:
error = xfs_btree_decrement(cur, 0, &i);
if (error)
goto out_del_cursor;
+
+ if (fatal_signal_pending(current)) {
+ error = -ERESTARTSYS;
+ goto out_del_cursor;
+ }
}
out_del_cursor:
@@ -196,8 +201,11 @@ xfs_ioc_trim(
for (agno = start_agno; agno <= end_agno; agno++) {
error = xfs_trim_extents(mp, agno, start, end, minlen,
&blocks_trimmed);
- if (error)
+ if (error) {
last_error = error;
+ if (error == -ERESTARTSYS)
+ break;
+ }
}
if (last_error)
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index d7bc14906af8..44f8c5451210 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -290,6 +290,7 @@ void
xfs_efi_release(
struct xfs_efi_log_item *efip)
{
+ ASSERT(atomic_read(&efip->efi_refcount) > 0);
if (atomic_dec_and_test(&efip->efi_refcount)) {
xfs_trans_ail_remove(&efip->efi_item, SHUTDOWN_LOG_IO_ERROR);
xfs_efi_item_free(efip);
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
new file mode 100644
index 000000000000..3683819887a5
--- /dev/null
+++ b/fs/xfs/xfs_fsmap.c
@@ -0,0 +1,940 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_error.h"
+#include "xfs_btree.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_trace.h"
+#include "xfs_log.h"
+#include "xfs_rmap.h"
+#include "xfs_alloc.h"
+#include "xfs_bit.h"
+#include <linux/fsmap.h>
+#include "xfs_fsmap.h"
+#include "xfs_refcount.h"
+#include "xfs_refcount_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_rtalloc.h"
+
+/* Convert an xfs_fsmap to an fsmap. */
+void
+xfs_fsmap_from_internal(
+ struct fsmap *dest,
+ struct xfs_fsmap *src)
+{
+ dest->fmr_device = src->fmr_device;
+ dest->fmr_flags = src->fmr_flags;
+ dest->fmr_physical = BBTOB(src->fmr_physical);
+ dest->fmr_owner = src->fmr_owner;
+ dest->fmr_offset = BBTOB(src->fmr_offset);
+ dest->fmr_length = BBTOB(src->fmr_length);
+ dest->fmr_reserved[0] = 0;
+ dest->fmr_reserved[1] = 0;
+ dest->fmr_reserved[2] = 0;
+}
+
+/* Convert an fsmap to an xfs_fsmap. */
+void
+xfs_fsmap_to_internal(
+ struct xfs_fsmap *dest,
+ struct fsmap *src)
+{
+ dest->fmr_device = src->fmr_device;
+ dest->fmr_flags = src->fmr_flags;
+ dest->fmr_physical = BTOBBT(src->fmr_physical);
+ dest->fmr_owner = src->fmr_owner;
+ dest->fmr_offset = BTOBBT(src->fmr_offset);
+ dest->fmr_length = BTOBBT(src->fmr_length);
+}
+
+/* Convert an fsmap owner into an rmapbt owner. */
+static int
+xfs_fsmap_owner_to_rmap(
+ struct xfs_rmap_irec *dest,
+ struct xfs_fsmap *src)
+{
+ if (!(src->fmr_flags & FMR_OF_SPECIAL_OWNER)) {
+ dest->rm_owner = src->fmr_owner;
+ return 0;
+ }
+
+ switch (src->fmr_owner) {
+ case 0: /* "lowest owner id possible" */
+ case -1ULL: /* "highest owner id possible" */
+ dest->rm_owner = 0;
+ break;
+ case XFS_FMR_OWN_FREE:
+ dest->rm_owner = XFS_RMAP_OWN_NULL;
+ break;
+ case XFS_FMR_OWN_UNKNOWN:
+ dest->rm_owner = XFS_RMAP_OWN_UNKNOWN;
+ break;
+ case XFS_FMR_OWN_FS:
+ dest->rm_owner = XFS_RMAP_OWN_FS;
+ break;
+ case XFS_FMR_OWN_LOG:
+ dest->rm_owner = XFS_RMAP_OWN_LOG;
+ break;
+ case XFS_FMR_OWN_AG:
+ dest->rm_owner = XFS_RMAP_OWN_AG;
+ break;
+ case XFS_FMR_OWN_INOBT:
+ dest->rm_owner = XFS_RMAP_OWN_INOBT;
+ break;
+ case XFS_FMR_OWN_INODES:
+ dest->rm_owner = XFS_RMAP_OWN_INODES;
+ break;
+ case XFS_FMR_OWN_REFC:
+ dest->rm_owner = XFS_RMAP_OWN_REFC;
+ break;
+ case XFS_FMR_OWN_COW:
+ dest->rm_owner = XFS_RMAP_OWN_COW;
+ break;
+ case XFS_FMR_OWN_DEFECTIVE: /* not implemented */
+ /* fall through */
+ default:
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/* Convert an rmapbt owner into an fsmap owner. */
+static int
+xfs_fsmap_owner_from_rmap(
+ struct xfs_fsmap *dest,
+ struct xfs_rmap_irec *src)
+{
+ dest->fmr_flags = 0;
+ if (!XFS_RMAP_NON_INODE_OWNER(src->rm_owner)) {
+ dest->fmr_owner = src->rm_owner;
+ return 0;
+ }
+ dest->fmr_flags |= FMR_OF_SPECIAL_OWNER;
+
+ switch (src->rm_owner) {
+ case XFS_RMAP_OWN_FS:
+ dest->fmr_owner = XFS_FMR_OWN_FS;
+ break;
+ case XFS_RMAP_OWN_LOG:
+ dest->fmr_owner = XFS_FMR_OWN_LOG;
+ break;
+ case XFS_RMAP_OWN_AG:
+ dest->fmr_owner = XFS_FMR_OWN_AG;
+ break;
+ case XFS_RMAP_OWN_INOBT:
+ dest->fmr_owner = XFS_FMR_OWN_INOBT;
+ break;
+ case XFS_RMAP_OWN_INODES:
+ dest->fmr_owner = XFS_FMR_OWN_INODES;
+ break;
+ case XFS_RMAP_OWN_REFC:
+ dest->fmr_owner = XFS_FMR_OWN_REFC;
+ break;
+ case XFS_RMAP_OWN_COW:
+ dest->fmr_owner = XFS_FMR_OWN_COW;
+ break;
+ case XFS_RMAP_OWN_NULL: /* "free" */
+ dest->fmr_owner = XFS_FMR_OWN_FREE;
+ break;
+ default:
+ return -EFSCORRUPTED;
+ }
+ return 0;
+}
+
+/* getfsmap query state */
+struct xfs_getfsmap_info {
+ struct xfs_fsmap_head *head;
+ xfs_fsmap_format_t formatter; /* formatting fn */
+ void *format_arg; /* format buffer */
+ struct xfs_buf *agf_bp; /* AGF, for refcount queries */
+ xfs_daddr_t next_daddr; /* next daddr we expect */
+ u64 missing_owner; /* owner of holes */
+ u32 dev; /* device id */
+ xfs_agnumber_t agno; /* AG number, if applicable */
+ struct xfs_rmap_irec low; /* low rmap key */
+ struct xfs_rmap_irec high; /* high rmap key */
+ bool last; /* last extent? */
+};
+
+/* Associate a device with a getfsmap handler. */
+struct xfs_getfsmap_dev {
+ u32 dev;
+ int (*fn)(struct xfs_trans *tp,
+ struct xfs_fsmap *keys,
+ struct xfs_getfsmap_info *info);
+};
+
+/* Compare two getfsmap device handlers. */
+static int
+xfs_getfsmap_dev_compare(
+ const void *p1,
+ const void *p2)
+{
+ const struct xfs_getfsmap_dev *d1 = p1;
+ const struct xfs_getfsmap_dev *d2 = p2;
+
+ return d1->dev - d2->dev;
+}
+
+/* Decide if this mapping is shared. */
+STATIC int
+xfs_getfsmap_is_shared(
+ struct xfs_trans *tp,
+ struct xfs_getfsmap_info *info,
+ struct xfs_rmap_irec *rec,
+ bool *stat)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_btree_cur *cur;
+ xfs_agblock_t fbno;
+ xfs_extlen_t flen;
+ int error;
+
+ *stat = false;
+ if (!xfs_sb_version_hasreflink(&mp->m_sb))
+ return 0;
+ /* rt files will have agno set to NULLAGNUMBER */
+ if (info->agno == NULLAGNUMBER)
+ return 0;
+
+ /* Are there any shared blocks here? */
+ flen = 0;
+ cur = xfs_refcountbt_init_cursor(mp, tp, info->agf_bp,
+ info->agno, NULL);
+
+ error = xfs_refcount_find_shared(cur, rec->rm_startblock,
+ rec->rm_blockcount, &fbno, &flen, false);
+
+ xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+ if (error)
+ return error;
+
+ *stat = flen > 0;
+ return 0;
+}
+
+/*
+ * Format a reverse mapping for getfsmap, having translated rm_startblock
+ * into the appropriate daddr units.
+ */
+STATIC int
+xfs_getfsmap_helper(
+ struct xfs_trans *tp,
+ struct xfs_getfsmap_info *info,
+ struct xfs_rmap_irec *rec,
+ xfs_daddr_t rec_daddr)
+{
+ struct xfs_fsmap fmr;
+ struct xfs_mount *mp = tp->t_mountp;
+ bool shared;
+ int error;
+
+ if (fatal_signal_pending(current))
+ return -EINTR;
+
+ /*
+ * Filter out records that start before our startpoint, if the
+ * caller requested that.
+ */
+ if (xfs_rmap_compare(rec, &info->low) < 0) {
+ rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount);
+ if (info->next_daddr < rec_daddr)
+ info->next_daddr = rec_daddr;
+ return XFS_BTREE_QUERY_RANGE_CONTINUE;
+ }
+
+ /* Are we just counting mappings? */
+ if (info->head->fmh_count == 0) {
+ if (rec_daddr > info->next_daddr)
+ info->head->fmh_entries++;
+
+ if (info->last)
+ return XFS_BTREE_QUERY_RANGE_CONTINUE;
+
+ info->head->fmh_entries++;
+
+ rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount);
+ if (info->next_daddr < rec_daddr)
+ info->next_daddr = rec_daddr;
+ return XFS_BTREE_QUERY_RANGE_CONTINUE;
+ }
+
+ /*
+ * If the record starts past the last physical block we saw,
+ * then we've found a gap. Report the gap as being owned by
+ * whatever the caller specified is the missing owner.
+ */
+ if (rec_daddr > info->next_daddr) {
+ if (info->head->fmh_entries >= info->head->fmh_count)
+ return XFS_BTREE_QUERY_RANGE_ABORT;
+
+ fmr.fmr_device = info->dev;
+ fmr.fmr_physical = info->next_daddr;
+ fmr.fmr_owner = info->missing_owner;
+ fmr.fmr_offset = 0;
+ fmr.fmr_length = rec_daddr - info->next_daddr;
+ fmr.fmr_flags = FMR_OF_SPECIAL_OWNER;
+ error = info->formatter(&fmr, info->format_arg);
+ if (error)
+ return error;
+ info->head->fmh_entries++;
+ }
+
+ if (info->last)
+ goto out;
+
+ /* Fill out the extent we found */
+ if (info->head->fmh_entries >= info->head->fmh_count)
+ return XFS_BTREE_QUERY_RANGE_ABORT;
+
+ trace_xfs_fsmap_mapping(mp, info->dev, info->agno, rec);
+
+ fmr.fmr_device = info->dev;
+ fmr.fmr_physical = rec_daddr;
+ error = xfs_fsmap_owner_from_rmap(&fmr, rec);
+ if (error)
+ return error;
+ fmr.fmr_offset = XFS_FSB_TO_BB(mp, rec->rm_offset);
+ fmr.fmr_length = XFS_FSB_TO_BB(mp, rec->rm_blockcount);
+ if (rec->rm_flags & XFS_RMAP_UNWRITTEN)
+ fmr.fmr_flags |= FMR_OF_PREALLOC;
+ if (rec->rm_flags & XFS_RMAP_ATTR_FORK)
+ fmr.fmr_flags |= FMR_OF_ATTR_FORK;
+ if (rec->rm_flags & XFS_RMAP_BMBT_BLOCK)
+ fmr.fmr_flags |= FMR_OF_EXTENT_MAP;
+ if (fmr.fmr_flags == 0) {
+ error = xfs_getfsmap_is_shared(tp, info, rec, &shared);
+ if (error)
+ return error;
+ if (shared)
+ fmr.fmr_flags |= FMR_OF_SHARED;
+ }
+ error = info->formatter(&fmr, info->format_arg);
+ if (error)
+ return error;
+ info->head->fmh_entries++;
+
+out:
+ rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount);
+ if (info->next_daddr < rec_daddr)
+ info->next_daddr = rec_daddr;
+ return XFS_BTREE_QUERY_RANGE_CONTINUE;
+}
+
+/* Transform a rmapbt irec into a fsmap */
+STATIC int
+xfs_getfsmap_datadev_helper(
+ struct xfs_btree_cur *cur,
+ struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_getfsmap_info *info = priv;
+ xfs_fsblock_t fsb;
+ xfs_daddr_t rec_daddr;
+
+ fsb = XFS_AGB_TO_FSB(mp, cur->bc_private.a.agno, rec->rm_startblock);
+ rec_daddr = XFS_FSB_TO_DADDR(mp, fsb);
+
+ return xfs_getfsmap_helper(cur->bc_tp, info, rec, rec_daddr);
+}
+
+/* Transform a rtbitmap "record" into a fsmap */
+STATIC int
+xfs_getfsmap_rtdev_rtbitmap_helper(
+ struct xfs_trans *tp,
+ struct xfs_rtalloc_rec *rec,
+ void *priv)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_getfsmap_info *info = priv;
+ struct xfs_rmap_irec irec;
+ xfs_daddr_t rec_daddr;
+
+ rec_daddr = XFS_FSB_TO_BB(mp, rec->ar_startblock);
+
+ irec.rm_startblock = rec->ar_startblock;
+ irec.rm_blockcount = rec->ar_blockcount;
+ irec.rm_owner = XFS_RMAP_OWN_NULL; /* "free" */
+ irec.rm_offset = 0;
+ irec.rm_flags = 0;
+
+ return xfs_getfsmap_helper(tp, info, &irec, rec_daddr);
+}
+
+/* Transform a bnobt irec into a fsmap */
+STATIC int
+xfs_getfsmap_datadev_bnobt_helper(
+ struct xfs_btree_cur *cur,
+ struct xfs_alloc_rec_incore *rec,
+ void *priv)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_getfsmap_info *info = priv;
+ struct xfs_rmap_irec irec;
+ xfs_daddr_t rec_daddr;
+
+ rec_daddr = XFS_AGB_TO_DADDR(mp, cur->bc_private.a.agno,
+ rec->ar_startblock);
+
+ irec.rm_startblock = rec->ar_startblock;
+ irec.rm_blockcount = rec->ar_blockcount;
+ irec.rm_owner = XFS_RMAP_OWN_NULL; /* "free" */
+ irec.rm_offset = 0;
+ irec.rm_flags = 0;
+
+ return xfs_getfsmap_helper(cur->bc_tp, info, &irec, rec_daddr);
+}
+
+/* Set rmap flags based on the getfsmap flags */
+static void
+xfs_getfsmap_set_irec_flags(
+ struct xfs_rmap_irec *irec,
+ struct xfs_fsmap *fmr)
+{
+ irec->rm_flags = 0;
+ if (fmr->fmr_flags & FMR_OF_ATTR_FORK)
+ irec->rm_flags |= XFS_RMAP_ATTR_FORK;
+ if (fmr->fmr_flags & FMR_OF_EXTENT_MAP)
+ irec->rm_flags |= XFS_RMAP_BMBT_BLOCK;
+ if (fmr->fmr_flags & FMR_OF_PREALLOC)
+ irec->rm_flags |= XFS_RMAP_UNWRITTEN;
+}
+
+/* Execute a getfsmap query against the log device. */
+STATIC int
+xfs_getfsmap_logdev(
+ struct xfs_trans *tp,
+ struct xfs_fsmap *keys,
+ struct xfs_getfsmap_info *info)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_rmap_irec rmap;
+ int error;
+
+ /* Set up search keys */
+ info->low.rm_startblock = XFS_BB_TO_FSBT(mp, keys[0].fmr_physical);
+ info->low.rm_offset = XFS_BB_TO_FSBT(mp, keys[0].fmr_offset);
+ error = xfs_fsmap_owner_to_rmap(&info->low, keys);
+ if (error)
+ return error;
+ info->low.rm_blockcount = 0;
+ xfs_getfsmap_set_irec_flags(&info->low, &keys[0]);
+
+ error = xfs_fsmap_owner_to_rmap(&info->high, keys + 1);
+ if (error)
+ return error;
+ info->high.rm_startblock = -1U;
+ info->high.rm_owner = ULLONG_MAX;
+ info->high.rm_offset = ULLONG_MAX;
+ info->high.rm_blockcount = 0;
+ info->high.rm_flags = XFS_RMAP_KEY_FLAGS | XFS_RMAP_REC_FLAGS;
+ info->missing_owner = XFS_FMR_OWN_FREE;
+
+ trace_xfs_fsmap_low_key(mp, info->dev, info->agno, &info->low);
+ trace_xfs_fsmap_high_key(mp, info->dev, info->agno, &info->high);
+
+ if (keys[0].fmr_physical > 0)
+ return 0;
+
+ /* Fabricate an rmap entry for the external log device. */
+ rmap.rm_startblock = 0;
+ rmap.rm_blockcount = mp->m_sb.sb_logblocks;
+ rmap.rm_owner = XFS_RMAP_OWN_LOG;
+ rmap.rm_offset = 0;
+ rmap.rm_flags = 0;
+
+ return xfs_getfsmap_helper(tp, info, &rmap, 0);
+}
+
+/* Execute a getfsmap query against the realtime device. */
+STATIC int
+__xfs_getfsmap_rtdev(
+ struct xfs_trans *tp,
+ struct xfs_fsmap *keys,
+ int (*query_fn)(struct xfs_trans *,
+ struct xfs_getfsmap_info *),
+ struct xfs_getfsmap_info *info)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ xfs_fsblock_t start_fsb;
+ xfs_fsblock_t end_fsb;
+ xfs_daddr_t eofs;
+ int error = 0;
+
+ eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks);
+ if (keys[0].fmr_physical >= eofs)
+ return 0;
+ if (keys[1].fmr_physical >= eofs)
+ keys[1].fmr_physical = eofs - 1;
+ start_fsb = XFS_BB_TO_FSBT(mp, keys[0].fmr_physical);
+ end_fsb = XFS_BB_TO_FSB(mp, keys[1].fmr_physical);
+
+ /* Set up search keys */
+ info->low.rm_startblock = start_fsb;
+ error = xfs_fsmap_owner_to_rmap(&info->low, &keys[0]);
+ if (error)
+ return error;
+ info->low.rm_offset = XFS_BB_TO_FSBT(mp, keys[0].fmr_offset);
+ info->low.rm_blockcount = 0;
+ xfs_getfsmap_set_irec_flags(&info->low, &keys[0]);
+
+ info->high.rm_startblock = end_fsb;
+ error = xfs_fsmap_owner_to_rmap(&info->high, &keys[1]);
+ if (error)
+ return error;
+ info->high.rm_offset = XFS_BB_TO_FSBT(mp, keys[1].fmr_offset);
+ info->high.rm_blockcount = 0;
+ xfs_getfsmap_set_irec_flags(&info->high, &keys[1]);
+
+ trace_xfs_fsmap_low_key(mp, info->dev, info->agno, &info->low);
+ trace_xfs_fsmap_high_key(mp, info->dev, info->agno, &info->high);
+
+ return query_fn(tp, info);
+}
+
+/* Actually query the realtime bitmap. */
+STATIC int
+xfs_getfsmap_rtdev_rtbitmap_query(
+ struct xfs_trans *tp,
+ struct xfs_getfsmap_info *info)
+{
+ struct xfs_rtalloc_rec alow;
+ struct xfs_rtalloc_rec ahigh;
+ int error;
+
+ xfs_ilock(tp->t_mountp->m_rbmip, XFS_ILOCK_SHARED);
+
+ alow.ar_startblock = info->low.rm_startblock;
+ ahigh.ar_startblock = info->high.rm_startblock;
+ error = xfs_rtalloc_query_range(tp, &alow, &ahigh,
+ xfs_getfsmap_rtdev_rtbitmap_helper, info);
+ if (error)
+ goto err;
+
+ /* Report any gaps at the end of the rtbitmap */
+ info->last = true;
+ error = xfs_getfsmap_rtdev_rtbitmap_helper(tp, &ahigh, info);
+ if (error)
+ goto err;
+err:
+ xfs_iunlock(tp->t_mountp->m_rbmip, XFS_ILOCK_SHARED);
+ return error;
+}
+
+/* Execute a getfsmap query against the realtime device rtbitmap. */
+STATIC int
+xfs_getfsmap_rtdev_rtbitmap(
+ struct xfs_trans *tp,
+ struct xfs_fsmap *keys,
+ struct xfs_getfsmap_info *info)
+{
+ info->missing_owner = XFS_FMR_OWN_UNKNOWN;
+ return __xfs_getfsmap_rtdev(tp, keys, xfs_getfsmap_rtdev_rtbitmap_query,
+ info);
+}
+
+/* Execute a getfsmap query against the regular data device. */
+STATIC int
+__xfs_getfsmap_datadev(
+ struct xfs_trans *tp,
+ struct xfs_fsmap *keys,
+ struct xfs_getfsmap_info *info,
+ int (*query_fn)(struct xfs_trans *,
+ struct xfs_getfsmap_info *,
+ struct xfs_btree_cur **,
+ void *),
+ void *priv)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_btree_cur *bt_cur = NULL;
+ xfs_fsblock_t start_fsb;
+ xfs_fsblock_t end_fsb;
+ xfs_agnumber_t start_ag;
+ xfs_agnumber_t end_ag;
+ xfs_daddr_t eofs;
+ int error = 0;
+
+ eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
+ if (keys[0].fmr_physical >= eofs)
+ return 0;
+ if (keys[1].fmr_physical >= eofs)
+ keys[1].fmr_physical = eofs - 1;
+ start_fsb = XFS_DADDR_TO_FSB(mp, keys[0].fmr_physical);
+ end_fsb = XFS_DADDR_TO_FSB(mp, keys[1].fmr_physical);
+
+ /*
+ * Convert the fsmap low/high keys to AG based keys. Initialize
+ * low to the fsmap low key and max out the high key to the end
+ * of the AG.
+ */
+ info->low.rm_startblock = XFS_FSB_TO_AGBNO(mp, start_fsb);
+ info->low.rm_offset = XFS_BB_TO_FSBT(mp, keys[0].fmr_offset);
+ error = xfs_fsmap_owner_to_rmap(&info->low, &keys[0]);
+ if (error)
+ return error;
+ info->low.rm_blockcount = 0;
+ xfs_getfsmap_set_irec_flags(&info->low, &keys[0]);
+
+ info->high.rm_startblock = -1U;
+ info->high.rm_owner = ULLONG_MAX;
+ info->high.rm_offset = ULLONG_MAX;
+ info->high.rm_blockcount = 0;
+ info->high.rm_flags = XFS_RMAP_KEY_FLAGS | XFS_RMAP_REC_FLAGS;
+
+ start_ag = XFS_FSB_TO_AGNO(mp, start_fsb);
+ end_ag = XFS_FSB_TO_AGNO(mp, end_fsb);
+
+ /* Query each AG */
+ for (info->agno = start_ag; info->agno <= end_ag; info->agno++) {
+ /*
+ * Set the AG high key from the fsmap high key if this
+ * is the last AG that we're querying.
+ */
+ if (info->agno == end_ag) {
+ info->high.rm_startblock = XFS_FSB_TO_AGBNO(mp,
+ end_fsb);
+ info->high.rm_offset = XFS_BB_TO_FSBT(mp,
+ keys[1].fmr_offset);
+ error = xfs_fsmap_owner_to_rmap(&info->high, &keys[1]);
+ if (error)
+ goto err;
+ xfs_getfsmap_set_irec_flags(&info->high, &keys[1]);
+ }
+
+ if (bt_cur) {
+ xfs_btree_del_cursor(bt_cur, XFS_BTREE_NOERROR);
+ bt_cur = NULL;
+ xfs_trans_brelse(tp, info->agf_bp);
+ info->agf_bp = NULL;
+ }
+
+ error = xfs_alloc_read_agf(mp, tp, info->agno, 0,
+ &info->agf_bp);
+ if (error)
+ goto err;
+
+ trace_xfs_fsmap_low_key(mp, info->dev, info->agno, &info->low);
+ trace_xfs_fsmap_high_key(mp, info->dev, info->agno,
+ &info->high);
+
+ error = query_fn(tp, info, &bt_cur, priv);
+ if (error)
+ goto err;
+
+ /*
+ * Set the AG low key to the start of the AG prior to
+ * moving on to the next AG.
+ */
+ if (info->agno == start_ag) {
+ info->low.rm_startblock = 0;
+ info->low.rm_owner = 0;
+ info->low.rm_offset = 0;
+ info->low.rm_flags = 0;
+ }
+ }
+
+ /* Report any gap at the end of the AG */
+ info->last = true;
+ error = query_fn(tp, info, &bt_cur, priv);
+ if (error)
+ goto err;
+
+err:
+ if (bt_cur)
+ xfs_btree_del_cursor(bt_cur, error < 0 ? XFS_BTREE_ERROR :
+ XFS_BTREE_NOERROR);
+ if (info->agf_bp) {
+ xfs_trans_brelse(tp, info->agf_bp);
+ info->agf_bp = NULL;
+ }
+
+ return error;
+}
+
+/* Actually query the rmap btree. */
+STATIC int
+xfs_getfsmap_datadev_rmapbt_query(
+ struct xfs_trans *tp,
+ struct xfs_getfsmap_info *info,
+ struct xfs_btree_cur **curpp,
+ void *priv)
+{
+ /* Report any gap at the end of the last AG. */
+ if (info->last)
+ return xfs_getfsmap_datadev_helper(*curpp, &info->high, info);
+
+ /* Allocate cursor for this AG and query_range it. */
+ *curpp = xfs_rmapbt_init_cursor(tp->t_mountp, tp, info->agf_bp,
+ info->agno);
+ return xfs_rmap_query_range(*curpp, &info->low, &info->high,
+ xfs_getfsmap_datadev_helper, info);
+}
+
+/* Execute a getfsmap query against the regular data device rmapbt. */
+STATIC int
+xfs_getfsmap_datadev_rmapbt(
+ struct xfs_trans *tp,
+ struct xfs_fsmap *keys,
+ struct xfs_getfsmap_info *info)
+{
+ info->missing_owner = XFS_FMR_OWN_FREE;
+ return __xfs_getfsmap_datadev(tp, keys, info,
+ xfs_getfsmap_datadev_rmapbt_query, NULL);
+}
+
+/* Actually query the bno btree. */
+STATIC int
+xfs_getfsmap_datadev_bnobt_query(
+ struct xfs_trans *tp,
+ struct xfs_getfsmap_info *info,
+ struct xfs_btree_cur **curpp,
+ void *priv)
+{
+ struct xfs_alloc_rec_incore *key = priv;
+
+ /* Report any gap at the end of the last AG. */
+ if (info->last)
+ return xfs_getfsmap_datadev_bnobt_helper(*curpp, &key[1], info);
+
+ /* Allocate cursor for this AG and query_range it. */
+ *curpp = xfs_allocbt_init_cursor(tp->t_mountp, tp, info->agf_bp,
+ info->agno, XFS_BTNUM_BNO);
+ key->ar_startblock = info->low.rm_startblock;
+ key[1].ar_startblock = info->high.rm_startblock;
+ return xfs_alloc_query_range(*curpp, key, &key[1],
+ xfs_getfsmap_datadev_bnobt_helper, info);
+}
+
+/* Execute a getfsmap query against the regular data device's bnobt. */
+STATIC int
+xfs_getfsmap_datadev_bnobt(
+ struct xfs_trans *tp,
+ struct xfs_fsmap *keys,
+ struct xfs_getfsmap_info *info)
+{
+ struct xfs_alloc_rec_incore akeys[2];
+
+ info->missing_owner = XFS_FMR_OWN_UNKNOWN;
+ return __xfs_getfsmap_datadev(tp, keys, info,
+ xfs_getfsmap_datadev_bnobt_query, &akeys[0]);
+}
+
+/* Do we recognize the device? */
+STATIC bool
+xfs_getfsmap_is_valid_device(
+ struct xfs_mount *mp,
+ struct xfs_fsmap *fm)
+{
+ if (fm->fmr_device == 0 || fm->fmr_device == UINT_MAX ||
+ fm->fmr_device == new_encode_dev(mp->m_ddev_targp->bt_dev))
+ return true;
+ if (mp->m_logdev_targp &&
+ fm->fmr_device == new_encode_dev(mp->m_logdev_targp->bt_dev))
+ return true;
+ if (mp->m_rtdev_targp &&
+ fm->fmr_device == new_encode_dev(mp->m_rtdev_targp->bt_dev))
+ return true;
+ return false;
+}
+
+/* Ensure that the low key is less than the high key. */
+STATIC bool
+xfs_getfsmap_check_keys(
+ struct xfs_fsmap *low_key,
+ struct xfs_fsmap *high_key)
+{
+ if (low_key->fmr_device > high_key->fmr_device)
+ return false;
+ if (low_key->fmr_device < high_key->fmr_device)
+ return true;
+
+ if (low_key->fmr_physical > high_key->fmr_physical)
+ return false;
+ if (low_key->fmr_physical < high_key->fmr_physical)
+ return true;
+
+ if (low_key->fmr_owner > high_key->fmr_owner)
+ return false;
+ if (low_key->fmr_owner < high_key->fmr_owner)
+ return true;
+
+ if (low_key->fmr_offset > high_key->fmr_offset)
+ return false;
+ if (low_key->fmr_offset < high_key->fmr_offset)
+ return true;
+
+ return false;
+}
+
+#define XFS_GETFSMAP_DEVS 3
+/*
+ * Get filesystem's extents as described in head, and format for
+ * output. Calls formatter to fill the user's buffer until all
+ * extents are mapped, until the passed-in head->fmh_count slots have
+ * been filled, or until the formatter short-circuits the loop, if it
+ * is tracking filled-in extents on its own.
+ *
+ * Key to Confusion
+ * ----------------
+ * There are multiple levels of keys and counters at work here:
+ * xfs_fsmap_head.fmh_keys -- low and high fsmap keys passed in;
+ * these reflect fs-wide sector addrs.
+ * dkeys -- fmh_keys used to query each device;
+ * these are fmh_keys but w/ the low key
+ * bumped up by fmr_length.
+ * xfs_getfsmap_info.next_daddr -- next disk addr we expect to see; this
+ * is how we detect gaps in the fsmap
+ records and report them.
+ * xfs_getfsmap_info.low/high -- per-AG low/high keys computed from
+ * dkeys; used to query the metadata.
+ */
+int
+xfs_getfsmap(
+ struct xfs_mount *mp,
+ struct xfs_fsmap_head *head,
+ xfs_fsmap_format_t formatter,
+ void *arg)
+{
+ struct xfs_trans *tp = NULL;
+ struct xfs_fsmap dkeys[2]; /* per-dev keys */
+ struct xfs_getfsmap_dev handlers[XFS_GETFSMAP_DEVS];
+ struct xfs_getfsmap_info info = { NULL };
+ int i;
+ int error = 0;
+
+ if (head->fmh_iflags & ~FMH_IF_VALID)
+ return -EINVAL;
+ if (!xfs_getfsmap_is_valid_device(mp, &head->fmh_keys[0]) ||
+ !xfs_getfsmap_is_valid_device(mp, &head->fmh_keys[1]))
+ return -EINVAL;
+
+ head->fmh_entries = 0;
+
+ /* Set up our device handlers. */
+ memset(handlers, 0, sizeof(handlers));
+ handlers[0].dev = new_encode_dev(mp->m_ddev_targp->bt_dev);
+ if (xfs_sb_version_hasrmapbt(&mp->m_sb))
+ handlers[0].fn = xfs_getfsmap_datadev_rmapbt;
+ else
+ handlers[0].fn = xfs_getfsmap_datadev_bnobt;
+ if (mp->m_logdev_targp != mp->m_ddev_targp) {
+ handlers[1].dev = new_encode_dev(mp->m_logdev_targp->bt_dev);
+ handlers[1].fn = xfs_getfsmap_logdev;
+ }
+ if (mp->m_rtdev_targp) {
+ handlers[2].dev = new_encode_dev(mp->m_rtdev_targp->bt_dev);
+ handlers[2].fn = xfs_getfsmap_rtdev_rtbitmap;
+ }
+
+ xfs_sort(handlers, XFS_GETFSMAP_DEVS, sizeof(struct xfs_getfsmap_dev),
+ xfs_getfsmap_dev_compare);
+
+ /*
+ * To continue where we left off, we allow userspace to use the
+ * last mapping from a previous call as the low key of the next.
+ * This is identified by a non-zero length in the low key. We
+ * have to increment the low key in this scenario to ensure we
+ * don't return the same mapping again, and instead return the
+ * very next mapping.
+ *
+ * If the low key mapping refers to file data, the same physical
+ * blocks could be mapped to several other files/offsets.
+ * According to rmapbt record ordering, the minimal next
+ * possible record for the block range is the next starting
+ * offset in the same inode. Therefore, bump the file offset to
+ * continue the search appropriately. For all other low key
+ * mapping types (attr blocks, metadata), bump the physical
+ * offset as there can be no other mapping for the same physical
+ * block range.
+ */
+ dkeys[0] = head->fmh_keys[0];
+ if (dkeys[0].fmr_flags & (FMR_OF_SPECIAL_OWNER | FMR_OF_EXTENT_MAP)) {
+ dkeys[0].fmr_physical += dkeys[0].fmr_length;
+ dkeys[0].fmr_owner = 0;
+ if (dkeys[0].fmr_offset)
+ return -EINVAL;
+ } else
+ dkeys[0].fmr_offset += dkeys[0].fmr_length;
+ dkeys[0].fmr_length = 0;
+ memset(&dkeys[1], 0xFF, sizeof(struct xfs_fsmap));
+
+ if (!xfs_getfsmap_check_keys(dkeys, &head->fmh_keys[1]))
+ return -EINVAL;
+
+ info.next_daddr = head->fmh_keys[0].fmr_physical +
+ head->fmh_keys[0].fmr_length;
+ info.formatter = formatter;
+ info.format_arg = arg;
+ info.head = head;
+
+ /* For each device we support... */
+ for (i = 0; i < XFS_GETFSMAP_DEVS; i++) {
+ /* Is this device within the range the user asked for? */
+ if (!handlers[i].fn)
+ continue;
+ if (head->fmh_keys[0].fmr_device > handlers[i].dev)
+ continue;
+ if (head->fmh_keys[1].fmr_device < handlers[i].dev)
+ break;
+
+ /*
+ * If this device number matches the high key, we have
+ * to pass the high key to the handler to limit the
+ * query results. If the device number exceeds the
+ * low key, zero out the low key so that we get
+ * everything from the beginning.
+ */
+ if (handlers[i].dev == head->fmh_keys[1].fmr_device)
+ dkeys[1] = head->fmh_keys[1];
+ if (handlers[i].dev > head->fmh_keys[0].fmr_device)
+ memset(&dkeys[0], 0, sizeof(struct xfs_fsmap));
+
+ error = xfs_trans_alloc_empty(mp, &tp);
+ if (error)
+ break;
+
+ info.dev = handlers[i].dev;
+ info.last = false;
+ info.agno = NULLAGNUMBER;
+ error = handlers[i].fn(tp, dkeys, &info);
+ if (error)
+ break;
+ xfs_trans_cancel(tp);
+ tp = NULL;
+ info.next_daddr = 0;
+ }
+
+ if (tp)
+ xfs_trans_cancel(tp);
+ head->fmh_oflags = FMH_OF_DEV_T;
+ return error;
+}
diff --git a/fs/xfs/xfs_fsmap.h b/fs/xfs/xfs_fsmap.h
new file mode 100644
index 000000000000..0b9bf822595c
--- /dev/null
+++ b/fs/xfs/xfs_fsmap.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#ifndef __XFS_FSMAP_H__
+#define __XFS_FSMAP_H__
+
+struct fsmap;
+
+/* internal fsmap representation */
+struct xfs_fsmap {
+ dev_t fmr_device; /* device id */
+ uint32_t fmr_flags; /* mapping flags */
+ uint64_t fmr_physical; /* device offset of segment */
+ uint64_t fmr_owner; /* owner id */
+ xfs_fileoff_t fmr_offset; /* file offset of segment */
+ xfs_filblks_t fmr_length; /* length of segment, blocks */
+};
+
+struct xfs_fsmap_head {
+ uint32_t fmh_iflags; /* control flags */
+ uint32_t fmh_oflags; /* output flags */
+ unsigned int fmh_count; /* # of entries in array incl. input */
+ unsigned int fmh_entries; /* # of entries filled in (output). */
+
+ struct xfs_fsmap fmh_keys[2]; /* low and high keys */
+};
+
+void xfs_fsmap_from_internal(struct fsmap *dest, struct xfs_fsmap *src);
+void xfs_fsmap_to_internal(struct xfs_fsmap *dest, struct fsmap *src);
+
+/* fsmap to userspace formatter - copy to user & advance pointer */
+typedef int (*xfs_fsmap_format_t)(struct xfs_fsmap *, void *);
+
+int xfs_getfsmap(struct xfs_mount *mp, struct xfs_fsmap_head *head,
+ xfs_fsmap_format_t formatter, void *arg);
+
+#endif /* __XFS_FSMAP_H__ */
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 3531f8f72fa5..f61c84f8e31a 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -262,6 +262,22 @@ xfs_inode_clear_reclaim_tag(
xfs_perag_clear_reclaim_tag(pag);
}
+static void
+xfs_inew_wait(
+ struct xfs_inode *ip)
+{
+ wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_INEW_BIT);
+ DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_INEW_BIT);
+
+ do {
+ prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+ if (!xfs_iflags_test(ip, XFS_INEW))
+ break;
+ schedule();
+ } while (true);
+ finish_wait(wq, &wait.wait);
+}
+
/*
* When we recycle a reclaimable inode, we need to re-initialise the VFS inode
* part of the structure. This is made more complex by the fact we store
@@ -366,14 +382,17 @@ xfs_iget_cache_hit(
error = xfs_reinit_inode(mp, inode);
if (error) {
+ bool wake;
/*
* Re-initializing the inode failed, and we are in deep
* trouble. Try to re-add it to the reclaim list.
*/
rcu_read_lock();
spin_lock(&ip->i_flags_lock);
-
+ wake = !!__xfs_iflags_test(ip, XFS_INEW);
ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
+ if (wake)
+ wake_up_bit(&ip->i_flags, __XFS_INEW_BIT);
ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
trace_xfs_iget_reclaim_fail(ip);
goto out_error;
@@ -623,9 +642,11 @@ out_error_or_again:
STATIC int
xfs_inode_ag_walk_grab(
- struct xfs_inode *ip)
+ struct xfs_inode *ip,
+ int flags)
{
struct inode *inode = VFS_I(ip);
+ bool newinos = !!(flags & XFS_AGITER_INEW_WAIT);
ASSERT(rcu_read_lock_held());
@@ -643,7 +664,8 @@ xfs_inode_ag_walk_grab(
goto out_unlock_noent;
/* avoid new or reclaimable inodes. Leave for reclaim code to flush */
- if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
+ if ((!newinos && __xfs_iflags_test(ip, XFS_INEW)) ||
+ __xfs_iflags_test(ip, XFS_IRECLAIMABLE | XFS_IRECLAIM))
goto out_unlock_noent;
spin_unlock(&ip->i_flags_lock);
@@ -671,7 +693,8 @@ xfs_inode_ag_walk(
void *args),
int flags,
void *args,
- int tag)
+ int tag,
+ int iter_flags)
{
uint32_t first_index;
int last_error = 0;
@@ -713,7 +736,7 @@ restart:
for (i = 0; i < nr_found; i++) {
struct xfs_inode *ip = batch[i];
- if (done || xfs_inode_ag_walk_grab(ip))
+ if (done || xfs_inode_ag_walk_grab(ip, iter_flags))
batch[i] = NULL;
/*
@@ -741,6 +764,9 @@ restart:
for (i = 0; i < nr_found; i++) {
if (!batch[i])
continue;
+ if ((iter_flags & XFS_AGITER_INEW_WAIT) &&
+ xfs_iflags_test(batch[i], XFS_INEW))
+ xfs_inew_wait(batch[i]);
error = execute(batch[i], flags, args);
IRELE(batch[i]);
if (error == -EAGAIN) {
@@ -820,12 +846,13 @@ xfs_cowblocks_worker(
}
int
-xfs_inode_ag_iterator(
+xfs_inode_ag_iterator_flags(
struct xfs_mount *mp,
int (*execute)(struct xfs_inode *ip, int flags,
void *args),
int flags,
- void *args)
+ void *args,
+ int iter_flags)
{
struct xfs_perag *pag;
int error = 0;
@@ -835,7 +862,8 @@ xfs_inode_ag_iterator(
ag = 0;
while ((pag = xfs_perag_get(mp, ag))) {
ag = pag->pag_agno + 1;
- error = xfs_inode_ag_walk(mp, pag, execute, flags, args, -1);
+ error = xfs_inode_ag_walk(mp, pag, execute, flags, args, -1,
+ iter_flags);
xfs_perag_put(pag);
if (error) {
last_error = error;
@@ -847,6 +875,17 @@ xfs_inode_ag_iterator(
}
int
+xfs_inode_ag_iterator(
+ struct xfs_mount *mp,
+ int (*execute)(struct xfs_inode *ip, int flags,
+ void *args),
+ int flags,
+ void *args)
+{
+ return xfs_inode_ag_iterator_flags(mp, execute, flags, args, 0);
+}
+
+int
xfs_inode_ag_iterator_tag(
struct xfs_mount *mp,
int (*execute)(struct xfs_inode *ip, int flags,
@@ -863,7 +902,8 @@ xfs_inode_ag_iterator_tag(
ag = 0;
while ((pag = xfs_perag_get_tag(mp, ag, tag))) {
ag = pag->pag_agno + 1;
- error = xfs_inode_ag_walk(mp, pag, execute, flags, args, tag);
+ error = xfs_inode_ag_walk(mp, pag, execute, flags, args, tag,
+ 0);
xfs_perag_put(pag);
if (error) {
last_error = error;
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index 8a7c849b4dea..9183f77958ef 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -48,6 +48,11 @@ struct xfs_eofblocks {
#define XFS_IGET_UNTRUSTED 0x2
#define XFS_IGET_DONTCACHE 0x4
+/*
+ * flags for AG inode iterator
+ */
+#define XFS_AGITER_INEW_WAIT 0x1 /* wait on new inodes */
+
int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino,
uint flags, uint lock_flags, xfs_inode_t **ipp);
@@ -79,6 +84,9 @@ void xfs_cowblocks_worker(struct work_struct *);
int xfs_inode_ag_iterator(struct xfs_mount *mp,
int (*execute)(struct xfs_inode *ip, int flags, void *args),
int flags, void *args);
+int xfs_inode_ag_iterator_flags(struct xfs_mount *mp,
+ int (*execute)(struct xfs_inode *ip, int flags, void *args),
+ int flags, void *args, int iter_flags);
int xfs_inode_ag_iterator_tag(struct xfs_mount *mp,
int (*execute)(struct xfs_inode *ip, int flags, void *args),
int flags, void *args, int tag);
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 7605d8396596..ec9826c56500 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1906,12 +1906,13 @@ xfs_inactive(
* force is true because we are evicting an inode from the
* cache. Post-eof blocks must be freed, lest we end up with
* broken free space accounting.
+ *
+ * Note: don't bother with iolock here since lockdep complains
+ * about acquiring it in reclaim context. We have the only
+ * reference to the inode at this point anyways.
*/
- if (xfs_can_free_eofblocks(ip, true)) {
- xfs_ilock(ip, XFS_IOLOCK_EXCL);
+ if (xfs_can_free_eofblocks(ip, true))
xfs_free_eofblocks(ip);
- xfs_iunlock(ip, XFS_IOLOCK_EXCL);
- }
return;
}
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 10dcf27b4c85..10e89fcb49d7 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -216,7 +216,8 @@ static inline bool xfs_is_reflink_inode(struct xfs_inode *ip)
#define XFS_IRECLAIM (1 << 0) /* started reclaiming this inode */
#define XFS_ISTALE (1 << 1) /* inode has been staled */
#define XFS_IRECLAIMABLE (1 << 2) /* inode can be reclaimed */
-#define XFS_INEW (1 << 3) /* inode has just been allocated */
+#define __XFS_INEW_BIT 3 /* inode has just been allocated */
+#define XFS_INEW (1 << __XFS_INEW_BIT)
#define XFS_ITRUNCATED (1 << 5) /* truncated down so flush-on-close */
#define XFS_IDIRTY_RELEASE (1 << 6) /* dirty release already seen */
#define __XFS_IFLOCK_BIT 7 /* inode is being flushed right now */
@@ -464,6 +465,7 @@ static inline void xfs_finish_inode_setup(struct xfs_inode *ip)
xfs_iflags_clear(ip, XFS_INEW);
barrier();
unlock_new_inode(VFS_I(ip));
+ wake_up_bit(&ip->i_flags, __XFS_INEW_BIT);
}
static inline void xfs_setup_existing_inode(struct xfs_inode *ip)
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index d90e7811ccdd..08cb7d1a4a3a 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -731,22 +731,27 @@ xfs_iflush_done(
* holding the lock before removing the inode from the AIL.
*/
if (need_ail) {
- struct xfs_log_item *log_items[need_ail];
- int i = 0;
+ bool mlip_changed = false;
+
+ /* this is an opencoded batch version of xfs_trans_ail_delete */
spin_lock(&ailp->xa_lock);
for (blip = lip; blip; blip = blip->li_bio_list) {
- iip = INODE_ITEM(blip);
- if (iip->ili_logged &&
- blip->li_lsn == iip->ili_flush_lsn) {
- log_items[i++] = blip;
- }
- ASSERT(i <= need_ail);
+ if (INODE_ITEM(blip)->ili_logged &&
+ blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn)
+ mlip_changed |= xfs_ail_delete_one(ailp, blip);
}
- /* xfs_trans_ail_delete_bulk() drops the AIL lock. */
- xfs_trans_ail_delete_bulk(ailp, log_items, i,
- SHUTDOWN_CORRUPT_INCORE);
- }
+ if (mlip_changed) {
+ if (!XFS_FORCED_SHUTDOWN(ailp->xa_mount))
+ xlog_assign_tail_lsn_locked(ailp->xa_mount);
+ if (list_empty(&ailp->xa_ail))
+ wake_up_all(&ailp->xa_empty);
+ }
+ spin_unlock(&ailp->xa_lock);
+
+ if (mlip_changed)
+ xfs_log_space_wake(ailp->xa_mount);
+ }
/*
* clean up and unlock the flush lock now we are done. We can clear the
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 2fd7fdf5438f..6190697603c9 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -41,6 +41,9 @@
#include "xfs_trans.h"
#include "xfs_pnfs.h"
#include "xfs_acl.h"
+#include "xfs_btree.h"
+#include <linux/fsmap.h>
+#include "xfs_fsmap.h"
#include <linux/capability.h>
#include <linux/cred.h>
@@ -1543,10 +1546,11 @@ xfs_ioc_getbmap(
unsigned int cmd,
void __user *arg)
{
- struct getbmapx bmx;
+ struct getbmapx bmx = { 0 };
int error;
- if (copy_from_user(&bmx, arg, sizeof(struct getbmapx)))
+ /* struct getbmap is a strict subset of struct getbmapx. */
+ if (copy_from_user(&bmx, arg, offsetof(struct getbmapx, bmv_iflags)))
return -EFAULT;
if (bmx.bmv_count < 2)
@@ -1608,6 +1612,84 @@ xfs_ioc_getbmapx(
return 0;
}
+struct getfsmap_info {
+ struct xfs_mount *mp;
+ struct fsmap_head __user *data;
+ unsigned int idx;
+ __u32 last_flags;
+};
+
+STATIC int
+xfs_getfsmap_format(struct xfs_fsmap *xfm, void *priv)
+{
+ struct getfsmap_info *info = priv;
+ struct fsmap fm;
+
+ trace_xfs_getfsmap_mapping(info->mp, xfm);
+
+ info->last_flags = xfm->fmr_flags;
+ xfs_fsmap_from_internal(&fm, xfm);
+ if (copy_to_user(&info->data->fmh_recs[info->idx++], &fm,
+ sizeof(struct fsmap)))
+ return -EFAULT;
+
+ return 0;
+}
+
+STATIC int
+xfs_ioc_getfsmap(
+ struct xfs_inode *ip,
+ struct fsmap_head __user *arg)
+{
+ struct getfsmap_info info = { NULL };
+ struct xfs_fsmap_head xhead = {0};
+ struct fsmap_head head;
+ bool aborted = false;
+ int error;
+
+ if (copy_from_user(&head, arg, sizeof(struct fsmap_head)))
+ return -EFAULT;
+ if (memchr_inv(head.fmh_reserved, 0, sizeof(head.fmh_reserved)) ||
+ memchr_inv(head.fmh_keys[0].fmr_reserved, 0,
+ sizeof(head.fmh_keys[0].fmr_reserved)) ||
+ memchr_inv(head.fmh_keys[1].fmr_reserved, 0,
+ sizeof(head.fmh_keys[1].fmr_reserved)))
+ return -EINVAL;
+
+ xhead.fmh_iflags = head.fmh_iflags;
+ xhead.fmh_count = head.fmh_count;
+ xfs_fsmap_to_internal(&xhead.fmh_keys[0], &head.fmh_keys[0]);
+ xfs_fsmap_to_internal(&xhead.fmh_keys[1], &head.fmh_keys[1]);
+
+ trace_xfs_getfsmap_low_key(ip->i_mount, &xhead.fmh_keys[0]);
+ trace_xfs_getfsmap_high_key(ip->i_mount, &xhead.fmh_keys[1]);
+
+ info.mp = ip->i_mount;
+ info.data = arg;
+ error = xfs_getfsmap(ip->i_mount, &xhead, xfs_getfsmap_format, &info);
+ if (error == XFS_BTREE_QUERY_RANGE_ABORT) {
+ error = 0;
+ aborted = true;
+ } else if (error)
+ return error;
+
+ /* If we didn't abort, set the "last" flag in the last fmx */
+ if (!aborted && info.idx) {
+ info.last_flags |= FMR_OF_LAST;
+ if (copy_to_user(&info.data->fmh_recs[info.idx - 1].fmr_flags,
+ &info.last_flags, sizeof(info.last_flags)))
+ return -EFAULT;
+ }
+
+ /* copy back header */
+ head.fmh_entries = xhead.fmh_entries;
+ head.fmh_oflags = xhead.fmh_oflags;
+ if (copy_to_user(arg, &head, sizeof(struct fsmap_head)))
+ return -EFAULT;
+
+ return 0;
+}
+
int
xfs_ioc_swapext(
xfs_swapext_t *sxp)
@@ -1788,6 +1870,9 @@ xfs_file_ioctl(
case XFS_IOC_GETBMAPX:
return xfs_ioc_getbmapx(ip, arg);
+ case FS_IOC_GETFSMAP:
+ return xfs_ioc_getfsmap(ip, arg);
+
case XFS_IOC_FD_TO_HANDLE:
case XFS_IOC_PATH_TO_HANDLE:
case XFS_IOC_PATH_TO_FSHANDLE: {
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index 7c49938c5aed..fa0bc4d46065 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -20,6 +20,7 @@
#include <linux/mount.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
+#include <linux/fsmap.h>
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_format.h"
@@ -554,6 +555,7 @@ xfs_file_compat_ioctl(
case XFS_IOC_GOINGDOWN:
case XFS_IOC_ERROR_INJECTION:
case XFS_IOC_ERROR_CLEARALL:
+ case FS_IOC_GETFSMAP:
return xfs_file_ioctl(filp, cmd, p);
#ifndef BROKEN_X86_ALIGNMENT
/* These are handled fine if no alignment issues */
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 288ee5b840d7..a63f61c256bd 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -240,7 +240,7 @@ xfs_iomap_write_direct(
*/
if (IS_DAX(VFS_I(ip))) {
bmapi_flags = XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO;
- if (ISUNWRITTEN(imap)) {
+ if (imap->br_state == XFS_EXT_UNWRITTEN) {
tflags |= XFS_TRANS_RESERVE;
resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1;
}
@@ -945,7 +945,7 @@ static inline bool imap_needs_alloc(struct inode *inode,
return !nimaps ||
imap->br_startblock == HOLESTARTBLOCK ||
imap->br_startblock == DELAYSTARTBLOCK ||
- (IS_DAX(inode) && ISUNWRITTEN(imap));
+ (IS_DAX(inode) && imap->br_state == XFS_EXT_UNWRITTEN);
}
static inline bool need_excl_ilock(struct xfs_inode *ip, unsigned flags)
@@ -976,6 +976,7 @@ xfs_file_iomap_begin(
int nimaps = 1, error = 0;
bool shared = false, trimmed = false;
unsigned lockmode;
+ struct block_device *bdev;
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
@@ -1063,6 +1064,14 @@ xfs_file_iomap_begin(
}
xfs_bmbt_to_iomap(ip, iomap, &imap);
+
+ /* optionally associate a dax device with the iomap bdev */
+ bdev = iomap->bdev;
+ if (blk_queue_dax(bdev->bd_queue))
+ iomap->dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
+ else
+ iomap->dax_dev = NULL;
+
if (shared)
iomap->flags |= IOMAP_F_SHARED;
return 0;
@@ -1140,6 +1149,7 @@ xfs_file_iomap_end(
unsigned flags,
struct iomap *iomap)
{
+ put_dax(iomap->dax_dev);
if ((flags & IOMAP_WRITE) && iomap->type == IOMAP_DELALLOC)
return xfs_file_iomap_end_delalloc(XFS_I(inode), offset,
length, written, iomap);
@@ -1170,10 +1180,10 @@ xfs_xattr_iomap_begin(
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
- lockmode = xfs_ilock_data_map_shared(ip);
+ lockmode = xfs_ilock_attr_map_shared(ip);
/* if there are no attribute fork or extents, return ENOENT */
- if (XFS_IFORK_Q(ip) || !ip->i_d.di_anextents) {
+ if (!XFS_IFORK_Q(ip) || !ip->i_d.di_anextents) {
error = -ENOENT;
goto out_unlock;
}
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index 592fdf7111cb..044fb0e15390 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -212,88 +212,6 @@ static inline kgid_t xfs_gid_to_kgid(__uint32_t gid)
#define xfs_sort(a,n,s,fn) sort(a,n,s,fn,NULL)
#define xfs_stack_trace() dump_stack()
-
-/* Move the kernel do_div definition off to one side */
-
-#if defined __i386__
-/* For ia32 we need to pull some tricks to get past various versions
- * of the compiler which do not like us using do_div in the middle
- * of large functions.
- */
-static inline __u32 xfs_do_div(void *a, __u32 b, int n)
-{
- __u32 mod;
-
- switch (n) {
- case 4:
- mod = *(__u32 *)a % b;
- *(__u32 *)a = *(__u32 *)a / b;
- return mod;
- case 8:
- {
- unsigned long __upper, __low, __high, __mod;
- __u64 c = *(__u64 *)a;
- __upper = __high = c >> 32;
- __low = c;
- if (__high) {
- __upper = __high % (b);
- __high = __high / (b);
- }
- asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (b), "0" (__low), "1" (__upper));
- asm("":"=A" (c):"a" (__low),"d" (__high));
- *(__u64 *)a = c;
- return __mod;
- }
- }
-
- /* NOTREACHED */
- return 0;
-}
-
-/* Side effect free 64 bit mod operation */
-static inline __u32 xfs_do_mod(void *a, __u32 b, int n)
-{
- switch (n) {
- case 4:
- return *(__u32 *)a % b;
- case 8:
- {
- unsigned long __upper, __low, __high, __mod;
- __u64 c = *(__u64 *)a;
- __upper = __high = c >> 32;
- __low = c;
- if (__high) {
- __upper = __high % (b);
- __high = __high / (b);
- }
- asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (b), "0" (__low), "1" (__upper));
- asm("":"=A" (c):"a" (__low),"d" (__high));
- return __mod;
- }
- }
-
- /* NOTREACHED */
- return 0;
-}
-#else
-static inline __u32 xfs_do_div(void *a, __u32 b, int n)
-{
- __u32 mod;
-
- switch (n) {
- case 4:
- mod = *(__u32 *)a % b;
- *(__u32 *)a = *(__u32 *)a / b;
- return mod;
- case 8:
- mod = do_div(*(__u64 *)a, b);
- return mod;
- }
-
- /* NOTREACHED */
- return 0;
-}
-
/* Side effect free 64 bit mod operation */
static inline __u32 xfs_do_mod(void *a, __u32 b, int n)
{
@@ -310,10 +228,7 @@ static inline __u32 xfs_do_mod(void *a, __u32 b, int n)
/* NOTREACHED */
return 0;
}
-#endif
-#undef do_div
-#define do_div(a, b) xfs_do_div(&(a), (b), sizeof(a))
#define do_mod(a, b) xfs_do_mod(&(a), (b), sizeof(a))
static inline __uint64_t roundup_64(__uint64_t x, __uint32_t y)
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index b1469f0a91a6..3731f13f63e9 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1293,7 +1293,7 @@ void
xfs_log_work_queue(
struct xfs_mount *mp)
{
- queue_delayed_work(mp->m_log_workqueue, &mp->m_log->l_work,
+ queue_delayed_work(mp->m_sync_workqueue, &mp->m_log->l_work,
msecs_to_jiffies(xfs_syncd_centisecs * 10));
}
@@ -1852,7 +1852,7 @@ xlog_sync(
*/
if (log->l_badcrc_factor &&
(prandom_u32() % log->l_badcrc_factor == 0)) {
- iclog->ic_header.h_crc &= 0xAAAAAAAA;
+ iclog->ic_header.h_crc &= cpu_to_le32(0xAAAAAAAA);
iclog->ic_state |= XLOG_STATE_IOABORT;
xfs_warn(log->l_mp,
"Intentionally corrupted log record at LSN 0x%llx. Shutdown imminent.",
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 4a98762ec8b4..cd0b077deb35 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -3796,7 +3796,7 @@ xlog_recover_bud_pass2(
* This routine is called when an inode create format structure is found in a
* committed transaction in the log. It's purpose is to initialise the inodes
* being allocated on disk. This requires us to get inode cluster buffers that
- * match the range to be intialised, stamped with inode templates and written
+ * match the range to be initialised, stamped with inode templates and written
* by delayed write so that subsequent modifications will hit the cached buffer
* and only need writing out at the end of recovery.
*/
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 688ebff1f663..2eaf81859166 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -73,6 +73,10 @@ xfs_uuid_mount(
uuid_t *uuid = &mp->m_sb.sb_uuid;
int hole, i;
+ /* Publish UUID in struct super_block */
+ BUILD_BUG_ON(sizeof(mp->m_super->s_uuid) != sizeof(uuid_t));
+ memcpy(&mp->m_super->s_uuid, uuid, sizeof(uuid_t));
+
if (mp->m_flags & XFS_MOUNT_NOUUID)
return 0;
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 6db6fd6b82b0..9fa312a41c93 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -183,6 +183,7 @@ typedef struct xfs_mount {
struct workqueue_struct *m_reclaim_workqueue;
struct workqueue_struct *m_log_workqueue;
struct workqueue_struct *m_eofblocks_workqueue;
+ struct workqueue_struct *m_sync_workqueue;
/*
* Generation of the filesysyem layout. This is incremented by each
@@ -312,7 +313,7 @@ void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname,
static inline xfs_agnumber_t
xfs_daddr_to_agno(struct xfs_mount *mp, xfs_daddr_t d)
{
- xfs_daddr_t ld = XFS_BB_TO_FSBT(mp, d);
+ xfs_rfsblock_t ld = XFS_BB_TO_FSBT(mp, d);
do_div(ld, mp->m_sb.sb_agblocks);
return (xfs_agnumber_t) ld;
}
@@ -320,7 +321,7 @@ xfs_daddr_to_agno(struct xfs_mount *mp, xfs_daddr_t d)
static inline xfs_agblock_t
xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
{
- xfs_daddr_t ld = XFS_BB_TO_FSBT(mp, d);
+ xfs_rfsblock_t ld = XFS_BB_TO_FSBT(mp, d);
return (xfs_agblock_t) do_div(ld, mp->m_sb.sb_agblocks);
}
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index b669b123287b..5fe6e70b88ef 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -851,8 +851,8 @@ xfs_qm_reset_dqcounts(
* started afresh by xfs_qm_quotacheck.
*/
#ifdef DEBUG
- j = XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB);
- do_div(j, sizeof(xfs_dqblk_t));
+ j = (int)XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) /
+ sizeof(xfs_dqblk_t);
ASSERT(mp->m_quotainfo->qi_dqperchunk == j);
#endif
dqb = bp->b_addr;
@@ -1384,12 +1384,7 @@ xfs_qm_quotacheck(
mp->m_qflags |= flags;
error_return:
- while (!list_empty(&buffer_list)) {
- struct xfs_buf *bp =
- list_first_entry(&buffer_list, struct xfs_buf, b_list);
- list_del_init(&bp->b_list);
- xfs_buf_relse(bp);
- }
+ xfs_buf_delwri_cancel(&buffer_list);
if (error) {
xfs_warn(mp,
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 475a3882a81f..9cb5c381b01c 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -759,5 +759,6 @@ xfs_qm_dqrele_all_inodes(
uint flags)
{
ASSERT(mp->m_quotainfo);
- xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, NULL);
+ xfs_inode_ag_iterator_flags(mp, xfs_dqrele_inode, flags, NULL,
+ XFS_AGITER_INEW_WAIT);
}
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index 6e4c7446c3d4..96fe209b5eb6 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -221,6 +221,7 @@ void
xfs_cui_release(
struct xfs_cui_log_item *cuip)
{
+ ASSERT(atomic_read(&cuip->cui_refcount) > 0);
if (atomic_dec_and_test(&cuip->cui_refcount)) {
xfs_trans_ail_remove(&cuip->cui_item, SHUTDOWN_LOG_IO_ERROR);
xfs_cui_item_free(cuip);
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 4a84c5ea266d..ffe6fe7a7eb5 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -206,11 +206,7 @@ xfs_reflink_trim_around_shared(
int error = 0;
/* Holes, unwritten, and delalloc extents cannot be shared */
- if (!xfs_is_reflink_inode(ip) ||
- ISUNWRITTEN(irec) ||
- irec->br_startblock == HOLESTARTBLOCK ||
- irec->br_startblock == DELAYSTARTBLOCK ||
- isnullstartblock(irec->br_startblock)) {
+ if (!xfs_is_reflink_inode(ip) || !xfs_bmap_is_real_extent(irec)) {
*shared = false;
return 0;
}
@@ -709,8 +705,22 @@ xfs_reflink_end_cow(
offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);
- /* Start a rolling transaction to switch the mappings */
- resblks = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK);
+ /*
+ * Start a rolling transaction to switch the mappings. We're
+ * unlikely ever to have to remap 16T worth of single-block
+ * extents, so just cap the worst case extent count to 2^32-1.
+ * Stick a warning in just in case, and avoid 64-bit division.
+ */
+ BUILD_BUG_ON(MAX_RW_COUNT > UINT_MAX);
+ if (end_fsb - offset_fsb > UINT_MAX) {
+ error = -EFSCORRUPTED;
+ xfs_force_shutdown(ip->i_mount, SHUTDOWN_CORRUPT_INCORE);
+ ASSERT(0);
+ goto out;
+ }
+ resblks = XFS_NEXTENTADD_SPACE_RES(ip->i_mount,
+ (unsigned int)(end_fsb - offset_fsb),
+ XFS_DATA_FORK);
error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write,
resblks, 0, 0, &tp);
if (error)
@@ -1045,12 +1055,12 @@ xfs_reflink_remap_extent(
xfs_off_t new_isize)
{
struct xfs_mount *mp = ip->i_mount;
+ bool real_extent = xfs_bmap_is_real_extent(irec);
struct xfs_trans *tp;
xfs_fsblock_t firstfsb;
unsigned int resblks;
struct xfs_defer_ops dfops;
struct xfs_bmbt_irec uirec;
- bool real_extent;
xfs_filblks_t rlen;
xfs_filblks_t unmap_len;
xfs_off_t newlen;
@@ -1059,11 +1069,6 @@ xfs_reflink_remap_extent(
unmap_len = irec->br_startoff + irec->br_blockcount - destoff;
trace_xfs_reflink_punch_range(ip, destoff, unmap_len);
- /* Only remap normal extents. */
- real_extent = (irec->br_startblock != HOLESTARTBLOCK &&
- irec->br_startblock != DELAYSTARTBLOCK &&
- !ISUNWRITTEN(irec));
-
/* No reflinking if we're low on space */
if (real_extent) {
error = xfs_reflink_ag_has_free_space(mp,
@@ -1359,9 +1364,7 @@ xfs_reflink_dirty_extents(
goto out;
if (nmaps == 0)
break;
- if (map[0].br_startblock == HOLESTARTBLOCK ||
- map[0].br_startblock == DELAYSTARTBLOCK ||
- ISUNWRITTEN(&map[0]))
+ if (!xfs_bmap_is_real_extent(&map[0]))
goto next;
map[1] = map[0];
@@ -1435,9 +1438,7 @@ xfs_reflink_clear_inode_flag(
return error;
if (nmaps == 0)
break;
- if (map.br_startblock == HOLESTARTBLOCK ||
- map.br_startblock == DELAYSTARTBLOCK ||
- ISUNWRITTEN(&map))
+ if (!xfs_bmap_is_real_extent(&map))
goto next;
agno = XFS_FSB_TO_AGNO(mp, map.br_startblock);
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 73c827831551..f3b139c9aa16 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -243,6 +243,7 @@ void
xfs_rui_release(
struct xfs_rui_log_item *ruip)
{
+ ASSERT(atomic_read(&ruip->rui_refcount) > 0);
if (atomic_dec_and_test(&ruip->rui_refcount)) {
xfs_trans_ail_remove(&ruip->rui_item, SHUTDOWN_LOG_IO_ERROR);
xfs_rui_item_free(ruip);
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index 51dd3c726608..f13133e6f19f 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -23,6 +23,16 @@
struct xfs_mount;
struct xfs_trans;
+struct xfs_rtalloc_rec {
+ xfs_rtblock_t ar_startblock;
+ xfs_rtblock_t ar_blockcount;
+};
+
+typedef int (*xfs_rtalloc_query_range_fn)(
+ struct xfs_trans *tp,
+ struct xfs_rtalloc_rec *rec,
+ void *priv);
+
#ifdef CONFIG_XFS_RT
/*
* Function prototypes for exported functions.
@@ -118,13 +128,21 @@ int xfs_rtmodify_summary(struct xfs_mount *mp, struct xfs_trans *tp, int log,
int xfs_rtfree_range(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_rtblock_t start, xfs_extlen_t len,
struct xfs_buf **rbpp, xfs_fsblock_t *rsb);
-
-
+int xfs_rtalloc_query_range(struct xfs_trans *tp,
+ struct xfs_rtalloc_rec *low_rec,
+ struct xfs_rtalloc_rec *high_rec,
+ xfs_rtalloc_query_range_fn fn,
+ void *priv);
+int xfs_rtalloc_query_all(struct xfs_trans *tp,
+ xfs_rtalloc_query_range_fn fn,
+ void *priv);
#else
# define xfs_rtallocate_extent(t,b,min,max,l,f,p,rb) (ENOSYS)
# define xfs_rtfree_extent(t,b,l) (ENOSYS)
# define xfs_rtpick_extent(m,t,l,rb) (ENOSYS)
# define xfs_growfs_rt(mp,in) (ENOSYS)
+# define xfs_rtalloc_query_range(t,l,h,f,p) (ENOSYS)
+# define xfs_rtalloc_query_all(t,f,p) (ENOSYS)
static inline int /* error */
xfs_rtmount_init(
xfs_mount_t *mp) /* file system mount structure */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 685c042a120f..47d239dcf3f4 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -877,8 +877,15 @@ xfs_init_mount_workqueues(
if (!mp->m_eofblocks_workqueue)
goto out_destroy_log;
+ mp->m_sync_workqueue = alloc_workqueue("xfs-sync/%s", WQ_FREEZABLE, 0,
+ mp->m_fsname);
+ if (!mp->m_sync_workqueue)
+ goto out_destroy_eofb;
+
return 0;
+out_destroy_eofb:
+ destroy_workqueue(mp->m_eofblocks_workqueue);
out_destroy_log:
destroy_workqueue(mp->m_log_workqueue);
out_destroy_reclaim:
@@ -899,6 +906,7 @@ STATIC void
xfs_destroy_mount_workqueues(
struct xfs_mount *mp)
{
+ destroy_workqueue(mp->m_sync_workqueue);
destroy_workqueue(mp->m_eofblocks_workqueue);
destroy_workqueue(mp->m_log_workqueue);
destroy_workqueue(mp->m_reclaim_workqueue);
diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c
index 7f17ae6d709a..5d95fe348294 100644
--- a/fs/xfs/xfs_trace.c
+++ b/fs/xfs/xfs_trace.c
@@ -47,6 +47,7 @@
#include "xfs_inode_item.h"
#include "xfs_bmap_btree.h"
#include "xfs_filestream.h"
+#include "xfs_fsmap.h"
/*
* We include this last to have the helpers above available for the trace
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 383ac227ce2c..7c5a16528d8b 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -40,6 +40,8 @@ struct xfs_inode_log_format;
struct xfs_bmbt_irec;
struct xfs_btree_cur;
struct xfs_refcount_irec;
+struct xfs_fsmap;
+struct xfs_rmap_irec;
DECLARE_EVENT_CLASS(xfs_attr_list_class,
TP_PROTO(struct xfs_attr_list_context *ctx),
@@ -2190,7 +2192,7 @@ DECLARE_EVENT_CLASS(xfs_discard_class,
__entry->agbno = agbno;
__entry->len = len;
),
- TP_printk("dev %d:%d agno %u agbno %u len %u\n",
+ TP_printk("dev %d:%d agno %u agbno %u len %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->agbno,
@@ -2253,8 +2255,8 @@ DECLARE_EVENT_CLASS(xfs_defer_class,
TP_STRUCT__entry(
__field(dev_t, dev)
__field(void *, dop)
- __field(bool, committed)
- __field(bool, low)
+ __field(char, committed)
+ __field(char, low)
),
TP_fast_assign(
__entry->dev = mp ? mp->m_super->s_dev : 0;
@@ -2262,7 +2264,7 @@ DECLARE_EVENT_CLASS(xfs_defer_class,
__entry->committed = dop->dop_committed;
__entry->low = dop->dop_low;
),
- TP_printk("dev %d:%d ops %p committed %d low %d\n",
+ TP_printk("dev %d:%d ops %p committed %d low %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->dop,
__entry->committed,
@@ -2279,8 +2281,8 @@ DECLARE_EVENT_CLASS(xfs_defer_error_class,
TP_STRUCT__entry(
__field(dev_t, dev)
__field(void *, dop)
- __field(bool, committed)
- __field(bool, low)
+ __field(char, committed)
+ __field(char, low)
__field(int, error)
),
TP_fast_assign(
@@ -2290,7 +2292,7 @@ DECLARE_EVENT_CLASS(xfs_defer_error_class,
__entry->low = dop->dop_low;
__entry->error = error;
),
- TP_printk("dev %d:%d ops %p committed %d low %d err %d\n",
+ TP_printk("dev %d:%d ops %p committed %d low %d err %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->dop,
__entry->committed,
@@ -2309,7 +2311,7 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_class,
__field(dev_t, dev)
__field(int, type)
__field(void *, intent)
- __field(bool, committed)
+ __field(char, committed)
__field(int, nr)
),
TP_fast_assign(
@@ -2319,7 +2321,7 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_class,
__entry->committed = dfp->dfp_done != NULL;
__entry->nr = dfp->dfp_count;
),
- TP_printk("dev %d:%d optype %d intent %p committed %d nr %d\n",
+ TP_printk("dev %d:%d optype %d intent %p committed %d nr %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->type,
__entry->intent,
@@ -2614,7 +2616,8 @@ DECLARE_EVENT_CLASS(xfs_ag_resv_class,
__entry->asked = r ? r->ar_asked : 0;
__entry->len = len;
),
- TP_printk("dev %d:%d agno %u resv %d freeblks %u flcount %u resv %u ask %u len %u\n",
+ TP_printk("dev %d:%d agno %u resv %d freeblks %u flcount %u "
+ "resv %u ask %u len %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->resv,
@@ -2667,7 +2670,7 @@ DECLARE_EVENT_CLASS(xfs_ag_btree_lookup_class,
__entry->agbno = agbno;
__entry->dir = dir;
),
- TP_printk("dev %d:%d agno %u agbno %u cmp %s(%d)\n",
+ TP_printk("dev %d:%d agno %u agbno %u cmp %s(%d)",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->agbno,
@@ -2700,7 +2703,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_class,
__entry->blockcount = irec->rc_blockcount;
__entry->refcount = irec->rc_refcount;
),
- TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u\n",
+ TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->startblock,
@@ -2735,7 +2738,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_at_class,
__entry->refcount = irec->rc_refcount;
__entry->agbno = agbno;
),
- TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u @ agbno %u\n",
+ TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u @ agbno %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->startblock,
@@ -2776,7 +2779,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_class,
__entry->i2_refcount = i2->rc_refcount;
),
TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u -- "
- "agbno %u len %u refcount %u\n",
+ "agbno %u len %u refcount %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->i1_startblock,
@@ -2822,7 +2825,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class,
__entry->agbno = agbno;
),
TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u -- "
- "agbno %u len %u refcount %u @ agbno %u\n",
+ "agbno %u len %u refcount %u @ agbno %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->i1_startblock,
@@ -2875,7 +2878,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class,
),
TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u -- "
"agbno %u len %u refcount %u -- "
- "agbno %u len %u refcount %u\n",
+ "agbno %u len %u refcount %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->i1_startblock,
@@ -3001,31 +3004,6 @@ DEFINE_EVENT(xfs_inode_error_class, name, \
unsigned long caller_ip), \
TP_ARGS(ip, error, caller_ip))
-/* reflink allocator */
-TRACE_EVENT(xfs_bmap_remap_alloc,
- TP_PROTO(struct xfs_inode *ip, xfs_fsblock_t fsbno,
- xfs_extlen_t len),
- TP_ARGS(ip, fsbno, len),
- TP_STRUCT__entry(
- __field(dev_t, dev)
- __field(xfs_ino_t, ino)
- __field(xfs_fsblock_t, fsbno)
- __field(xfs_extlen_t, len)
- ),
- TP_fast_assign(
- __entry->dev = VFS_I(ip)->i_sb->s_dev;
- __entry->ino = ip->i_ino;
- __entry->fsbno = fsbno;
- __entry->len = len;
- ),
- TP_printk("dev %d:%d ino 0x%llx fsbno 0x%llx len %x",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->ino,
- __entry->fsbno,
- __entry->len)
-);
-DEFINE_INODE_ERROR_EVENT(xfs_bmap_remap_alloc_error);
-
/* reflink tracepoint classes */
/* two-file io tracepoint class */
@@ -3227,7 +3205,7 @@ TRACE_EVENT(xfs_ioctl_clone,
),
TP_printk("dev %d:%d "
"ino 0x%lx isize 0x%llx -> "
- "ino 0x%lx isize 0x%llx\n",
+ "ino 0x%lx isize 0x%llx",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->src_ino,
__entry->src_isize,
@@ -3267,6 +3245,88 @@ DEFINE_INODE_IREC_EVENT(xfs_swap_extent_rmap_remap);
DEFINE_INODE_IREC_EVENT(xfs_swap_extent_rmap_remap_piece);
DEFINE_INODE_ERROR_EVENT(xfs_swap_extent_rmap_error);
+/* fsmap traces */
+DECLARE_EVENT_CLASS(xfs_fsmap_class,
+ TP_PROTO(struct xfs_mount *mp, u32 keydev, xfs_agnumber_t agno,
+ struct xfs_rmap_irec *rmap),
+ TP_ARGS(mp, keydev, agno, rmap),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(dev_t, keydev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_fsblock_t, bno)
+ __field(xfs_filblks_t, len)
+ __field(__uint64_t, owner)
+ __field(__uint64_t, offset)
+ __field(unsigned int, flags)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->keydev = new_decode_dev(keydev);
+ __entry->agno = agno;
+ __entry->bno = rmap->rm_startblock;
+ __entry->len = rmap->rm_blockcount;
+ __entry->owner = rmap->rm_owner;
+ __entry->offset = rmap->rm_offset;
+ __entry->flags = rmap->rm_flags;
+ ),
+ TP_printk("dev %d:%d keydev %d:%d agno %u bno %llu len %llu owner %lld offset %llu flags 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ MAJOR(__entry->keydev), MINOR(__entry->keydev),
+ __entry->agno,
+ __entry->bno,
+ __entry->len,
+ __entry->owner,
+ __entry->offset,
+ __entry->flags)
+)
+#define DEFINE_FSMAP_EVENT(name) \
+DEFINE_EVENT(xfs_fsmap_class, name, \
+ TP_PROTO(struct xfs_mount *mp, u32 keydev, xfs_agnumber_t agno, \
+ struct xfs_rmap_irec *rmap), \
+ TP_ARGS(mp, keydev, agno, rmap))
+DEFINE_FSMAP_EVENT(xfs_fsmap_low_key);
+DEFINE_FSMAP_EVENT(xfs_fsmap_high_key);
+DEFINE_FSMAP_EVENT(xfs_fsmap_mapping);
+
+DECLARE_EVENT_CLASS(xfs_getfsmap_class,
+ TP_PROTO(struct xfs_mount *mp, struct xfs_fsmap *fsmap),
+ TP_ARGS(mp, fsmap),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(dev_t, keydev)
+ __field(xfs_daddr_t, block)
+ __field(xfs_daddr_t, len)
+ __field(__uint64_t, owner)
+ __field(__uint64_t, offset)
+ __field(__uint64_t, flags)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->keydev = new_decode_dev(fsmap->fmr_device);
+ __entry->block = fsmap->fmr_physical;
+ __entry->len = fsmap->fmr_length;
+ __entry->owner = fsmap->fmr_owner;
+ __entry->offset = fsmap->fmr_offset;
+ __entry->flags = fsmap->fmr_flags;
+ ),
+ TP_printk("dev %d:%d keydev %d:%d block %llu len %llu owner %lld offset %llu flags 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ MAJOR(__entry->keydev), MINOR(__entry->keydev),
+ __entry->block,
+ __entry->len,
+ __entry->owner,
+ __entry->offset,
+ __entry->flags)
+)
+#define DEFINE_GETFSMAP_EVENT(name) \
+DEFINE_EVENT(xfs_getfsmap_class, name, \
+ TP_PROTO(struct xfs_mount *mp, struct xfs_fsmap *fsmap), \
+ TP_ARGS(mp, fsmap))
+DEFINE_GETFSMAP_EVENT(xfs_getfsmap_low_key);
+DEFINE_GETFSMAP_EVENT(xfs_getfsmap_high_key);
+DEFINE_GETFSMAP_EVENT(xfs_getfsmap_mapping);
+
#endif /* _TRACE_XFS_H */
#undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 70f42ea86dfb..2011620008de 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -134,7 +134,7 @@ xfs_trans_reserve(
bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
/* Mark this thread as being in a transaction */
- current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
+ current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
/*
* Attempt to reserve the needed disk blocks by decrementing
@@ -144,7 +144,7 @@ xfs_trans_reserve(
if (blocks > 0) {
error = xfs_mod_fdblocks(tp->t_mountp, -((int64_t)blocks), rsvd);
if (error != 0) {
- current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+ current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
return -ENOSPC;
}
tp->t_blk_res += blocks;
@@ -221,7 +221,7 @@ undo_blocks:
tp->t_blk_res = 0;
}
- current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+ current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
return error;
}
@@ -263,6 +263,28 @@ xfs_trans_alloc(
}
/*
+ * Create an empty transaction with no reservation. This is a defensive
+ * mechanism for routines that query metadata without actually modifying
+ * them -- if the metadata being queried is somehow cross-linked (think a
+ * btree block pointer that points higher in the tree), we risk deadlock.
+ * However, blocks grabbed as part of a transaction can be re-grabbed.
+ * The verifiers will notice the corrupt block and the operation will fail
+ * back to userspace without deadlocking.
+ *
+ * Note the zero-length reservation; this transaction MUST be cancelled
+ * without any dirty data.
+ */
+int
+xfs_trans_alloc_empty(
+ struct xfs_mount *mp,
+ struct xfs_trans **tpp)
+{
+ struct xfs_trans_res resv = {0};
+
+ return xfs_trans_alloc(mp, &resv, 0, 0, XFS_TRANS_NO_WRITECOUNT, tpp);
+}
+
+/*
* Record the indicated change to the given field for application
* to the file system's superblock when the transaction commits.
* For now, just store the change in the transaction structure.
@@ -914,7 +936,7 @@ __xfs_trans_commit(
xfs_log_commit_cil(mp, tp, &commit_lsn, regrant);
- current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+ current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
xfs_trans_free(tp);
/*
@@ -944,7 +966,7 @@ out_unreserve:
if (commit_lsn == -1 && !error)
error = -EIO;
}
- current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+ current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
xfs_trans_free_items(tp, NULLCOMMITLSN, !!error);
xfs_trans_free(tp);
@@ -998,7 +1020,7 @@ xfs_trans_cancel(
xfs_log_done(mp, tp->t_ticket, NULL, false);
/* mark this thread as no longer being in a transaction */
- current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+ current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
xfs_trans_free_items(tp, NULLCOMMITLSN, dirty);
xfs_trans_free(tp);
@@ -1012,17 +1034,14 @@ xfs_trans_cancel(
* chunk we've been working on and get a new transaction to continue.
*/
int
-__xfs_trans_roll(
+xfs_trans_roll(
struct xfs_trans **tpp,
- struct xfs_inode *dp,
- int *committed)
+ struct xfs_inode *dp)
{
struct xfs_trans *trans;
struct xfs_trans_res tres;
int error;
- *committed = 0;
-
/*
* Ensure that the inode is always logged.
*/
@@ -1048,7 +1067,6 @@ __xfs_trans_roll(
if (error)
return error;
- *committed = 1;
trans = *tpp;
/*
@@ -1071,12 +1089,3 @@ __xfs_trans_roll(
xfs_trans_ijoin(trans, dp, 0);
return 0;
}
-
-int
-xfs_trans_roll(
- struct xfs_trans **tpp,
- struct xfs_inode *dp)
-{
- int committed;
- return __xfs_trans_roll(tpp, dp, &committed);
-}
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 1646f659b60f..a07acbf0bd8a 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -158,6 +158,8 @@ typedef struct xfs_trans {
int xfs_trans_alloc(struct xfs_mount *mp, struct xfs_trans_res *resp,
uint blocks, uint rtextents, uint flags,
struct xfs_trans **tpp);
+int xfs_trans_alloc_empty(struct xfs_mount *mp,
+ struct xfs_trans **tpp);
void xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t);
struct xfs_buf *xfs_trans_get_buf_map(struct xfs_trans *tp,
@@ -226,7 +228,6 @@ int xfs_trans_free_extent(struct xfs_trans *,
struct xfs_efd_log_item *, xfs_fsblock_t,
xfs_extlen_t, struct xfs_owner_info *);
int xfs_trans_commit(struct xfs_trans *);
-int __xfs_trans_roll(struct xfs_trans **, struct xfs_inode *, int *);
int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *);
void xfs_trans_cancel(xfs_trans_t *);
int xfs_trans_ail_init(struct xfs_mount *);
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index d6c9c3e9e02b..9056c0f34a3c 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -684,8 +684,23 @@ xfs_trans_ail_update_bulk(
}
}
-/*
- * xfs_trans_ail_delete_bulk - remove multiple log items from the AIL
+bool
+xfs_ail_delete_one(
+ struct xfs_ail *ailp,
+ struct xfs_log_item *lip)
+{
+ struct xfs_log_item *mlip = xfs_ail_min(ailp);
+
+ trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn);
+ xfs_ail_delete(ailp, lip);
+ lip->li_flags &= ~XFS_LI_IN_AIL;
+ lip->li_lsn = 0;
+
+ return mlip == lip;
+}
+
+/**
+ * Remove a log items from the AIL
*
* @xfs_trans_ail_delete_bulk takes an array of log items that all need to
* removed from the AIL. The caller is already holding the AIL lock, and done
@@ -706,52 +721,36 @@ xfs_trans_ail_update_bulk(
* before returning.
*/
void
-xfs_trans_ail_delete_bulk(
+xfs_trans_ail_delete(
struct xfs_ail *ailp,
- struct xfs_log_item **log_items,
- int nr_items,
+ struct xfs_log_item *lip,
int shutdown_type) __releases(ailp->xa_lock)
{
- xfs_log_item_t *mlip;
- int mlip_changed = 0;
- int i;
+ struct xfs_mount *mp = ailp->xa_mount;
+ bool mlip_changed;
- mlip = xfs_ail_min(ailp);
-
- for (i = 0; i < nr_items; i++) {
- struct xfs_log_item *lip = log_items[i];
- if (!(lip->li_flags & XFS_LI_IN_AIL)) {
- struct xfs_mount *mp = ailp->xa_mount;
-
- spin_unlock(&ailp->xa_lock);
- if (!XFS_FORCED_SHUTDOWN(mp)) {
- xfs_alert_tag(mp, XFS_PTAG_AILDELETE,
- "%s: attempting to delete a log item that is not in the AIL",
- __func__);
- xfs_force_shutdown(mp, shutdown_type);
- }
- return;
+ if (!(lip->li_flags & XFS_LI_IN_AIL)) {
+ spin_unlock(&ailp->xa_lock);
+ if (!XFS_FORCED_SHUTDOWN(mp)) {
+ xfs_alert_tag(mp, XFS_PTAG_AILDELETE,
+ "%s: attempting to delete a log item that is not in the AIL",
+ __func__);
+ xfs_force_shutdown(mp, shutdown_type);
}
-
- trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn);
- xfs_ail_delete(ailp, lip);
- lip->li_flags &= ~XFS_LI_IN_AIL;
- lip->li_lsn = 0;
- if (mlip == lip)
- mlip_changed = 1;
+ return;
}
+ mlip_changed = xfs_ail_delete_one(ailp, lip);
if (mlip_changed) {
- if (!XFS_FORCED_SHUTDOWN(ailp->xa_mount))
- xlog_assign_tail_lsn_locked(ailp->xa_mount);
+ if (!XFS_FORCED_SHUTDOWN(mp))
+ xlog_assign_tail_lsn_locked(mp);
if (list_empty(&ailp->xa_ail))
wake_up_all(&ailp->xa_empty);
- spin_unlock(&ailp->xa_lock);
+ }
+ spin_unlock(&ailp->xa_lock);
+ if (mlip_changed)
xfs_log_space_wake(ailp->xa_mount);
- } else {
- spin_unlock(&ailp->xa_lock);
- }
}
int
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 49931b72da8a..d91706c56c63 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -106,18 +106,9 @@ xfs_trans_ail_update(
xfs_trans_ail_update_bulk(ailp, NULL, &lip, 1, lsn);
}
-void xfs_trans_ail_delete_bulk(struct xfs_ail *ailp,
- struct xfs_log_item **log_items, int nr_items,
- int shutdown_type)
- __releases(ailp->xa_lock);
-static inline void
-xfs_trans_ail_delete(
- struct xfs_ail *ailp,
- xfs_log_item_t *lip,
- int shutdown_type) __releases(ailp->xa_lock)
-{
- xfs_trans_ail_delete_bulk(ailp, &lip, 1, shutdown_type);
-}
+bool xfs_ail_delete_one(struct xfs_ail *ailp, struct xfs_log_item *lip);
+void xfs_trans_ail_delete(struct xfs_ail *ailp, struct xfs_log_item *lip,
+ int shutdown_type) __releases(ailp->xa_lock);
static inline void
xfs_trans_ail_remove(