aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/Kconfig3
-rw-r--r--fs/Makefile2
-rw-r--r--fs/autofs4/autofs_i.h72
-rw-r--r--fs/autofs4/dev-ioctl.c57
-rw-r--r--fs/autofs4/expire.c84
-rw-r--r--fs/autofs4/init.c10
-rw-r--r--fs/autofs4/inode.c52
-rw-r--r--fs/autofs4/root.c165
-rw-r--r--fs/autofs4/symlink.c11
-rw-r--r--fs/autofs4/waitq.c78
-rw-r--r--fs/block_dev.c6
-rw-r--r--fs/btrfs/disk-io.c2
-rw-r--r--fs/btrfs/root-tree.c10
-rw-r--r--fs/btrfs/tests/btrfs-tests.c3
-rw-r--r--fs/buffer.c24
-rw-r--r--fs/cachefiles/daemon.c13
-rw-r--r--fs/cachefiles/interface.c11
-rw-r--r--fs/cachefiles/internal.h4
-rw-r--r--fs/cachefiles/namei.c28
-rw-r--r--fs/ceph/addr.c328
-rw-r--r--fs/ceph/caps.c38
-rw-r--r--fs/ceph/dir.c69
-rw-r--r--fs/ceph/export.c13
-rw-r--r--fs/ceph/file.c15
-rw-r--r--fs/ceph/inode.c55
-rw-r--r--fs/ceph/mds_client.c23
-rw-r--r--fs/ceph/mds_client.h1
-rw-r--r--fs/ceph/snap.c16
-rw-r--r--fs/ceph/super.c47
-rw-r--r--fs/ceph/super.h24
-rw-r--r--fs/ceph/xattr.c78
-rw-r--r--fs/cifs/cifs_debug.c56
-rw-r--r--fs/cifs/cifs_debug.h2
-rw-r--r--fs/cifs/cifsencrypt.c32
-rw-r--r--fs/cifs/cifsfs.c11
-rw-r--r--fs/cifs/cifsfs.h12
-rw-r--r--fs/cifs/cifsglob.h4
-rw-r--r--fs/cifs/cifssmb.c21
-rw-r--r--fs/cifs/smb2pdu.c24
-rw-r--r--fs/cifs/smbencrypt.c26
-rw-r--r--fs/compat_ioctl.c22
-rw-r--r--fs/configfs/dir.c53
-rw-r--r--fs/configfs/inode.c20
-rw-r--r--fs/configfs/item.c1
-rw-r--r--fs/coredump.c30
-rw-r--r--fs/crypto/Kconfig18
-rw-r--r--fs/crypto/Makefile3
-rw-r--r--fs/crypto/crypto.c555
-rw-r--r--fs/crypto/fname.c (renamed from fs/f2fs/crypto_fname.c)276
-rw-r--r--fs/crypto/keyinfo.c272
-rw-r--r--fs/crypto/policy.c229
-rw-r--r--fs/dax.c18
-rw-r--r--fs/dcache.c197
-rw-r--r--fs/direct-io.c12
-rw-r--r--fs/dlm/config.c41
-rw-r--r--fs/dlm/lowcomms.c74
-rw-r--r--fs/ecryptfs/crypto.c134
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h13
-rw-r--r--fs/ecryptfs/inode.c12
-rw-r--r--fs/ecryptfs/keystore.c224
-rw-r--r--fs/ecryptfs/main.c1
-rw-r--r--fs/ecryptfs/mmap.c1
-rw-r--r--fs/ecryptfs/super.c1
-rw-r--r--fs/eventfd.c42
-rw-r--r--fs/eventpoll.c2
-rw-r--r--fs/exec.c100
-rw-r--r--fs/ext2/ext2.h3
-rw-r--r--fs/ext2/super.c25
-rw-r--r--fs/ext2/xattr.c139
-rw-r--r--fs/ext2/xattr.h21
-rw-r--r--fs/ext4/crypto.c24
-rw-r--r--fs/ext4/crypto_fname.c32
-rw-r--r--fs/ext4/crypto_key.c42
-rw-r--r--fs/ext4/dir.c2
-rw-r--r--fs/ext4/ext4.h75
-rw-r--r--fs/ext4/ext4_crypto.h2
-rw-r--r--fs/ext4/ext4_extents.h2
-rw-r--r--fs/ext4/extents.c128
-rw-r--r--fs/ext4/extents_status.c4
-rw-r--r--fs/ext4/file.c129
-rw-r--r--fs/ext4/ialloc.c2
-rw-r--r--fs/ext4/indirect.c29
-rw-r--r--fs/ext4/inline.c8
-rw-r--r--fs/ext4/inode.c402
-rw-r--r--fs/ext4/mballoc.c81
-rw-r--r--fs/ext4/mballoc.h12
-rw-r--r--fs/ext4/migrate.c2
-rw-r--r--fs/ext4/mmp.c34
-rw-r--r--fs/ext4/move_extent.c1
-rw-r--r--fs/ext4/page-io.c14
-rw-r--r--fs/ext4/super.c39
-rw-r--r--fs/ext4/xattr.c166
-rw-r--r--fs/ext4/xattr.h3
-rw-r--r--fs/f2fs/Kconfig12
-rw-r--r--fs/f2fs/Makefile2
-rw-r--r--fs/f2fs/checkpoint.c77
-rw-r--r--fs/f2fs/crypto.c491
-rw-r--r--fs/f2fs/crypto_key.c254
-rw-r--r--fs/f2fs/crypto_policy.c209
-rw-r--r--fs/f2fs/data.c428
-rw-r--r--fs/f2fs/dir.c94
-rw-r--r--fs/f2fs/extent_cache.c176
-rw-r--r--fs/f2fs/f2fs.h315
-rw-r--r--fs/f2fs/f2fs_crypto.h151
-rw-r--r--fs/f2fs/file.c114
-rw-r--r--fs/f2fs/gc.c245
-rw-r--r--fs/f2fs/inline.c43
-rw-r--r--fs/f2fs/inode.c15
-rw-r--r--fs/f2fs/namei.c168
-rw-r--r--fs/f2fs/node.c223
-rw-r--r--fs/f2fs/node.h26
-rw-r--r--fs/f2fs/recovery.c14
-rw-r--r--fs/f2fs/segment.c386
-rw-r--r--fs/f2fs/segment.h5
-rw-r--r--fs/f2fs/super.c204
-rw-r--r--fs/f2fs/trace.c6
-rw-r--r--fs/f2fs/xattr.c6
-rw-r--r--fs/f2fs/xattr.h3
-rw-r--r--fs/fat/Kconfig18
-rw-r--r--fs/fat/inode.c4
-rw-r--r--fs/fhandle.c2
-rw-r--r--fs/fs-writeback.c91
-rw-r--r--fs/fuse/cuse.c4
-rw-r--r--fs/fuse/file.c56
-rw-r--r--fs/fuse/fuse_i.h9
-rw-r--r--fs/gfs2/aops.c2
-rw-r--r--fs/gfs2/dir.c6
-rw-r--r--fs/gfs2/export.c2
-rw-r--r--fs/gfs2/glock.c10
-rw-r--r--fs/gfs2/incore.h1
-rw-r--r--fs/gfs2/inode.c71
-rw-r--r--fs/gfs2/inode.h5
-rw-r--r--fs/gfs2/ops_fstype.c2
-rw-r--r--fs/gfs2/super.c26
-rw-r--r--fs/jbd2/commit.c49
-rw-r--r--fs/jbd2/journal.c43
-rw-r--r--fs/jbd2/recovery.c31
-rw-r--r--fs/jbd2/revoke.c60
-rw-r--r--fs/jbd2/transaction.c22
-rw-r--r--fs/jffs2/README.Locking5
-rw-r--r--fs/jffs2/build.c75
-rw-r--r--fs/jffs2/dir.c11
-rw-r--r--fs/jffs2/file.c39
-rw-r--r--fs/jffs2/gc.c81
-rw-r--r--fs/jffs2/jffs2_fs_sb.h2
-rw-r--r--fs/jffs2/nodelist.h6
-rw-r--r--fs/jffs2/nodemgmt.c4
-rw-r--r--fs/jffs2/wbuf.c6
-rw-r--r--fs/kernfs/dir.c210
-rw-r--r--fs/kernfs/mount.c69
-rw-r--r--fs/mbcache.c1093
-rw-r--r--fs/mpage.c3
-rw-r--r--fs/namei.c311
-rw-r--r--fs/ncpfs/dir.c2
-rw-r--r--fs/nfs/blocklayout/blocklayout.c72
-rw-r--r--fs/nfs/blocklayout/blocklayout.h14
-rw-r--r--fs/nfs/blocklayout/dev.c144
-rw-r--r--fs/nfs/blocklayout/extent_tree.c44
-rw-r--r--fs/nfs/blocklayout/rpc_pipefs.c2
-rw-r--r--fs/nfs/callback.h3
-rw-r--r--fs/nfs/callback_proc.c69
-rw-r--r--fs/nfs/callback_xdr.c12
-rw-r--r--fs/nfs/dir.c12
-rw-r--r--fs/nfs/file.c12
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayoutdev.c2
-rw-r--r--fs/nfs/inode.c2
-rw-r--r--fs/nfs/internal.h6
-rw-r--r--fs/nfs/nfs4file.c33
-rw-r--r--fs/nfs/nfs4proc.c76
-rw-r--r--fs/nfs/nfs4session.c54
-rw-r--r--fs/nfs/nfs4session.h8
-rw-r--r--fs/nfs/pnfs_nfs.c16
-rw-r--r--fs/nfsd/Kconfig28
-rw-r--r--fs/nfsd/Makefile4
-rw-r--r--fs/nfsd/blocklayout.c298
-rw-r--r--fs/nfsd/blocklayoutxdr.c77
-rw-r--r--fs/nfsd/blocklayoutxdr.h14
-rw-r--r--fs/nfsd/nfs3proc.c7
-rw-r--r--fs/nfsd/nfs4layouts.c31
-rw-r--r--fs/nfsd/nfs4proc.c9
-rw-r--r--fs/nfsd/nfs4recover.c29
-rw-r--r--fs/nfsd/nfs4state.c29
-rw-r--r--fs/nfsd/nfs4xdr.c26
-rw-r--r--fs/nfsd/pnfs.h8
-rw-r--r--fs/nfsd/vfs.c4
-rw-r--r--fs/nfsd/vfs.h19
-rw-r--r--fs/nilfs2/page.c2
-rw-r--r--fs/ocfs2/Makefile3
-rw-r--r--fs/ocfs2/alloc.c105
-rw-r--r--fs/ocfs2/aops.c1141
-rw-r--r--fs/ocfs2/aops.h19
-rw-r--r--fs/ocfs2/cluster/heartbeat.c14
-rw-r--r--fs/ocfs2/cluster/nodemanager.c22
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h26
-rw-r--r--fs/ocfs2/dlm/dlmconvert.c30
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c13
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c127
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c41
-rw-r--r--fs/ocfs2/dlm/dlmthread.c13
-rw-r--r--fs/ocfs2/file.c165
-rw-r--r--fs/ocfs2/filecheck.c606
-rw-r--r--fs/ocfs2/filecheck.h49
-rw-r--r--fs/ocfs2/inode.c228
-rw-r--r--fs/ocfs2/inode.h9
-rw-r--r--fs/ocfs2/journal.c8
-rw-r--r--fs/ocfs2/localalloc.c4
-rw-r--r--fs/ocfs2/mmap.c8
-rw-r--r--fs/ocfs2/ocfs2.h8
-rw-r--r--fs/ocfs2/ocfs2_trace.h20
-rw-r--r--fs/ocfs2/quota_global.c27
-rw-r--r--fs/ocfs2/resize.c2
-rw-r--r--fs/ocfs2/stackglue.c3
-rw-r--r--fs/ocfs2/stackglue.h2
-rw-r--r--fs/ocfs2/super.c49
-rw-r--r--fs/ocfs2/super.h2
-rw-r--r--fs/open.c6
-rw-r--r--fs/orangefs/Kconfig6
-rw-r--r--fs/orangefs/Makefile10
-rw-r--r--fs/orangefs/acl.c175
-rw-r--r--fs/orangefs/dcache.c138
-rw-r--r--fs/orangefs/devorangefs-req.c943
-rw-r--r--fs/orangefs/dir.c398
-rw-r--r--fs/orangefs/downcall.h133
-rw-r--r--fs/orangefs/file.c717
-rw-r--r--fs/orangefs/inode.c475
-rw-r--r--fs/orangefs/namei.c462
-rw-r--r--fs/orangefs/orangefs-bufmap.c556
-rw-r--r--fs/orangefs/orangefs-bufmap.h36
-rw-r--r--fs/orangefs/orangefs-cache.c161
-rw-r--r--fs/orangefs/orangefs-debug.h92
-rw-r--r--fs/orangefs/orangefs-debugfs.c455
-rw-r--r--fs/orangefs/orangefs-debugfs.h3
-rw-r--r--fs/orangefs/orangefs-dev-proto.h62
-rw-r--r--fs/orangefs/orangefs-kernel.h623
-rw-r--r--fs/orangefs/orangefs-mod.c293
-rw-r--r--fs/orangefs/orangefs-sysfs.c1772
-rw-r--r--fs/orangefs/orangefs-sysfs.h2
-rw-r--r--fs/orangefs/orangefs-utils.c1048
-rw-r--r--fs/orangefs/protocol.h452
-rw-r--r--fs/orangefs/super.c559
-rw-r--r--fs/orangefs/symlink.c19
-rw-r--r--fs/orangefs/upcall.h246
-rw-r--r--fs/orangefs/waitqueue.c357
-rw-r--r--fs/orangefs/xattr.c545
-rw-r--r--fs/overlayfs/copy_up.c35
-rw-r--r--fs/overlayfs/dir.c71
-rw-r--r--fs/overlayfs/inode.c2
-rw-r--r--fs/overlayfs/overlayfs.h1
-rw-r--r--fs/overlayfs/readdir.c53
-rw-r--r--fs/overlayfs/super.c31
-rw-r--r--fs/proc/base.c71
-rw-r--r--fs/proc/meminfo.c31
-rw-r--r--fs/proc/namespaces.c3
-rw-r--r--fs/proc/page.c8
-rw-r--r--fs/proc/task_mmu.c14
-rw-r--r--fs/proc/vmcore.c7
-rw-r--r--fs/proc_namespace.c2
-rw-r--r--fs/pstore/ram.c4
-rw-r--r--fs/quota/dquot.c58
-rw-r--r--fs/quota/quota.c70
-rw-r--r--fs/quota/quota_tree.c67
-rw-r--r--fs/quota/quota_v2.c6
-rw-r--r--fs/read_write.c197
-rw-r--r--fs/reiserfs/super.c1
-rw-r--r--fs/select.c8
-rw-r--r--fs/splice.c5
-rw-r--r--fs/super.c1
-rw-r--r--fs/ubifs/Makefile1
-rw-r--r--fs/ubifs/misc.c57
-rw-r--r--fs/ubifs/ubifs.h41
-rw-r--r--fs/ubifs/xattr.c1
-rw-r--r--fs/udf/dir.c13
-rw-r--r--fs/udf/namei.c29
-rw-r--r--fs/udf/super.c38
-rw-r--r--fs/udf/udfdecl.h21
-rw-r--r--fs/udf/unicode.c630
-rw-r--r--fs/userfaultfd.c6
-rw-r--r--fs/xfs/Makefile3
-rw-r--r--fs/xfs/libxfs/xfs_alloc_btree.c2
-rw-r--r--fs/xfs/libxfs/xfs_attr_sf.h16
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c172
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.c5
-rw-r--r--fs/xfs/libxfs/xfs_btree.c32
-rw-r--r--fs/xfs/libxfs/xfs_da_format.h16
-rw-r--r--fs/xfs/libxfs/xfs_dir2.c12
-rw-r--r--fs/xfs/libxfs/xfs_dir2_node.c4
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c4
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.c12
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c170
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.h38
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.c3
-rw-r--r--fs/xfs/libxfs/xfs_log_format.h19
-rw-r--r--fs/xfs/libxfs/xfs_quota_defs.h3
-rw-r--r--fs/xfs/libxfs/xfs_rtbitmap.c32
-rw-r--r--fs/xfs/libxfs/xfs_sb.h1
-rw-r--r--fs/xfs/libxfs/xfs_shared.h1
-rw-r--r--fs/xfs/xfs_aops.c1027
-rw-r--r--fs/xfs/xfs_aops.h4
-rw-r--r--fs/xfs/xfs_attr_list.c19
-rw-r--r--fs/xfs/xfs_bmap_util.c8
-rw-r--r--fs/xfs/xfs_buf.c2
-rw-r--r--fs/xfs/xfs_buf.h26
-rw-r--r--fs/xfs/xfs_buf_item.c10
-rw-r--r--fs/xfs/xfs_dir2_readdir.c2
-rw-r--r--fs/xfs/xfs_discard.c2
-rw-r--r--fs/xfs/xfs_dquot.c129
-rw-r--r--fs/xfs/xfs_export.c4
-rw-r--r--fs/xfs/xfs_file.c88
-rw-r--r--fs/xfs/xfs_filestream.c4
-rw-r--r--fs/xfs/xfs_fsops.h1
-rw-r--r--fs/xfs/xfs_icache.c43
-rw-r--r--fs/xfs/xfs_inode.c174
-rw-r--r--fs/xfs/xfs_inode.h10
-rw-r--r--fs/xfs/xfs_inode_item.c82
-rw-r--r--fs/xfs/xfs_ioctl.c121
-rw-r--r--fs/xfs/xfs_iops.c59
-rw-r--r--fs/xfs/xfs_itable.c22
-rw-r--r--fs/xfs/xfs_log.c152
-rw-r--r--fs/xfs/xfs_log_recover.c368
-rw-r--r--fs/xfs/xfs_mount.c24
-rw-r--r--fs/xfs/xfs_mount.h31
-rw-r--r--fs/xfs/xfs_ondisk.h117
-rw-r--r--fs/xfs/xfs_pnfs.h2
-rw-r--r--fs/xfs/xfs_qm.c55
-rw-r--r--fs/xfs/xfs_qm.h48
-rw-r--r--fs/xfs/xfs_qm_syscalls.c27
-rw-r--r--fs/xfs/xfs_quotaops.c36
-rw-r--r--fs/xfs/xfs_rtalloc.c2
-rw-r--r--fs/xfs/xfs_super.c528
-rw-r--r--fs/xfs/xfs_super.h4
-rw-r--r--fs/xfs/xfs_sysfs.c78
-rw-r--r--fs/xfs/xfs_trace.h9
-rw-r--r--fs/xfs/xfs_trans.c4
-rw-r--r--fs/xfs/xfs_trans.h1
-rw-r--r--fs/xfs/xfs_trans_ail.c5
-rw-r--r--fs/xfs/xfs_trans_buf.c10
-rw-r--r--fs/xfs/xfs_trans_dquot.c15
-rw-r--r--fs/xfs/xfs_trans_inode.c14
338 files changed, 22770 insertions, 8779 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index 9adee0d7536e..6725f59c18e6 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -84,6 +84,8 @@ config MANDATORY_FILE_LOCKING
To the best of my knowledge this is dead code that no one cares about.
+source "fs/crypto/Kconfig"
+
source "fs/notify/Kconfig"
source "fs/quota/Kconfig"
@@ -207,6 +209,7 @@ menuconfig MISC_FILESYSTEMS
if MISC_FILESYSTEMS
+source "fs/orangefs/Kconfig"
source "fs/adfs/Kconfig"
source "fs/affs/Kconfig"
source "fs/ecryptfs/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index 79f522575cba..85b6e13b62d3 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -30,6 +30,7 @@ obj-$(CONFIG_EVENTFD) += eventfd.o
obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
obj-$(CONFIG_AIO) += aio.o
obj-$(CONFIG_FS_DAX) += dax.o
+obj-$(CONFIG_FS_ENCRYPTION) += crypto/
obj-$(CONFIG_FILE_LOCKING) += locks.o
obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o
obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o
@@ -105,6 +106,7 @@ obj-$(CONFIG_AUTOFS4_FS) += autofs4/
obj-$(CONFIG_ADFS_FS) += adfs/
obj-$(CONFIG_FUSE_FS) += fuse/
obj-$(CONFIG_OVERLAY_FS) += overlayfs/
+obj-$(CONFIG_ORANGEFS_FS) += orangefs/
obj-$(CONFIG_UDF_FS) += udf/
obj-$(CONFIG_SUN_OPENPROMFS) += openpromfs/
obj-$(CONFIG_OMFS_FS) += omfs/
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index c37149b929be..f0d268b97d19 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -1,15 +1,11 @@
-/* -*- c -*- ------------------------------------------------------------- *
- *
- * linux/fs/autofs/autofs_i.h
- *
- * Copyright 1997-1998 Transmeta Corporation - All Rights Reserved
- * Copyright 2005-2006 Ian Kent <raven@themaw.net>
+/*
+ * Copyright 1997-1998 Transmeta Corporation - All Rights Reserved
+ * Copyright 2005-2006 Ian Kent <raven@themaw.net>
*
* This file is part of the Linux kernel and is made available under
* the terms of the GNU General Public License, version 2, or at your
* option, any later version, incorporated herein by reference.
- *
- * ----------------------------------------------------------------------- */
+ */
/* Internal header file for autofs */
@@ -35,28 +31,23 @@
#include <linux/mount.h>
#include <linux/namei.h>
#include <asm/current.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
/* #define DEBUG */
-#define DPRINTK(fmt, ...) \
- pr_debug("pid %d: %s: " fmt "\n", \
- current->pid, __func__, ##__VA_ARGS__)
-
-#define AUTOFS_WARN(fmt, ...) \
- printk(KERN_WARNING "pid %d: %s: " fmt "\n", \
- current->pid, __func__, ##__VA_ARGS__)
-
-#define AUTOFS_ERROR(fmt, ...) \
- printk(KERN_ERR "pid %d: %s: " fmt "\n", \
- current->pid, __func__, ##__VA_ARGS__)
-
-/* Unified info structure. This is pointed to by both the dentry and
- inode structures. Each file in the filesystem has an instance of this
- structure. It holds a reference to the dentry, so dentries are never
- flushed while the file exists. All name lookups are dealt with at the
- dentry level, although the filesystem can interfere in the validation
- process. Readdir is implemented by traversing the dentry lists. */
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+#define pr_fmt(fmt) KBUILD_MODNAME ":pid:%d:%s: " fmt, current->pid, __func__
+
+/*
+ * Unified info structure. This is pointed to by both the dentry and
+ * inode structures. Each file in the filesystem has an instance of this
+ * structure. It holds a reference to the dentry, so dentries are never
+ * flushed while the file exists. All name lookups are dealt with at the
+ * dentry level, although the filesystem can interfere in the validation
+ * process. Readdir is implemented by traversing the dentry lists.
+ */
struct autofs_info {
struct dentry *dentry;
struct inode *inode;
@@ -78,7 +69,7 @@ struct autofs_info {
kgid_t gid;
};
-#define AUTOFS_INF_EXPIRING (1<<0) /* dentry is in the process of expiring */
+#define AUTOFS_INF_EXPIRING (1<<0) /* dentry in the process of expiring */
#define AUTOFS_INF_NO_RCU (1<<1) /* the dentry is being considered
* for expiry, so RCU_walk is
* not permitted
@@ -140,10 +131,11 @@ static inline struct autofs_info *autofs4_dentry_ino(struct dentry *dentry)
}
/* autofs4_oz_mode(): do we see the man behind the curtain? (The
- processes which do manipulations for us in user space sees the raw
- filesystem without "magic".) */
-
-static inline int autofs4_oz_mode(struct autofs_sb_info *sbi) {
+ * processes which do manipulations for us in user space sees the raw
+ * filesystem without "magic".)
+ */
+static inline int autofs4_oz_mode(struct autofs_sb_info *sbi)
+{
return sbi->catatonic || task_pgrp(current) == sbi->oz_pgrp;
}
@@ -154,12 +146,12 @@ void autofs4_free_ino(struct autofs_info *);
int is_autofs4_dentry(struct dentry *);
int autofs4_expire_wait(struct dentry *dentry, int rcu_walk);
int autofs4_expire_run(struct super_block *, struct vfsmount *,
- struct autofs_sb_info *,
- struct autofs_packet_expire __user *);
+ struct autofs_sb_info *,
+ struct autofs_packet_expire __user *);
int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
struct autofs_sb_info *sbi, int when);
int autofs4_expire_multi(struct super_block *, struct vfsmount *,
- struct autofs_sb_info *, int __user *);
+ struct autofs_sb_info *, int __user *);
struct dentry *autofs4_expire_direct(struct super_block *sb,
struct vfsmount *mnt,
struct autofs_sb_info *sbi, int how);
@@ -224,8 +216,8 @@ static inline int autofs_prepare_pipe(struct file *pipe)
/* Queue management functions */
-int autofs4_wait(struct autofs_sb_info *,struct dentry *, enum autofs_notify);
-int autofs4_wait_release(struct autofs_sb_info *,autofs_wqt_t,int);
+int autofs4_wait(struct autofs_sb_info *, struct dentry *, enum autofs_notify);
+int autofs4_wait_release(struct autofs_sb_info *, autofs_wqt_t, int);
void autofs4_catatonic_mode(struct autofs_sb_info *);
static inline u32 autofs4_get_dev(struct autofs_sb_info *sbi)
@@ -242,37 +234,37 @@ static inline void __autofs4_add_expiring(struct dentry *dentry)
{
struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
struct autofs_info *ino = autofs4_dentry_ino(dentry);
+
if (ino) {
if (list_empty(&ino->expiring))
list_add(&ino->expiring, &sbi->expiring_list);
}
- return;
}
static inline void autofs4_add_expiring(struct dentry *dentry)
{
struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
struct autofs_info *ino = autofs4_dentry_ino(dentry);
+
if (ino) {
spin_lock(&sbi->lookup_lock);
if (list_empty(&ino->expiring))
list_add(&ino->expiring, &sbi->expiring_list);
spin_unlock(&sbi->lookup_lock);
}
- return;
}
static inline void autofs4_del_expiring(struct dentry *dentry)
{
struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
struct autofs_info *ino = autofs4_dentry_ino(dentry);
+
if (ino) {
spin_lock(&sbi->lookup_lock);
if (!list_empty(&ino->expiring))
list_del_init(&ino->expiring);
spin_unlock(&sbi->lookup_lock);
}
- return;
}
extern void autofs4_kill_sb(struct super_block *);
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index ac7d921ed984..c7fcc7438843 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -72,13 +72,13 @@ static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param)
{
int err = 0;
- if ((AUTOFS_DEV_IOCTL_VERSION_MAJOR != param->ver_major) ||
- (AUTOFS_DEV_IOCTL_VERSION_MINOR < param->ver_minor)) {
- AUTOFS_WARN("ioctl control interface version mismatch: "
- "kernel(%u.%u), user(%u.%u), cmd(%d)",
- AUTOFS_DEV_IOCTL_VERSION_MAJOR,
- AUTOFS_DEV_IOCTL_VERSION_MINOR,
- param->ver_major, param->ver_minor, cmd);
+ if ((param->ver_major != AUTOFS_DEV_IOCTL_VERSION_MAJOR) ||
+ (param->ver_minor > AUTOFS_DEV_IOCTL_VERSION_MINOR)) {
+ pr_warn("ioctl control interface version mismatch: "
+ "kernel(%u.%u), user(%u.%u), cmd(%d)\n",
+ AUTOFS_DEV_IOCTL_VERSION_MAJOR,
+ AUTOFS_DEV_IOCTL_VERSION_MINOR,
+ param->ver_major, param->ver_minor, cmd);
err = -EINVAL;
}
@@ -93,7 +93,8 @@ static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param)
* Copy parameter control struct, including a possible path allocated
* at the end of the struct.
*/
-static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *in)
+static struct autofs_dev_ioctl *
+ copy_dev_ioctl(struct autofs_dev_ioctl __user *in)
{
struct autofs_dev_ioctl tmp, *res;
@@ -116,7 +117,6 @@ static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *i
static inline void free_dev_ioctl(struct autofs_dev_ioctl *param)
{
kfree(param);
- return;
}
/*
@@ -129,24 +129,24 @@ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param)
err = check_dev_ioctl_version(cmd, param);
if (err) {
- AUTOFS_WARN("invalid device control module version "
- "supplied for cmd(0x%08x)", cmd);
+ pr_warn("invalid device control module version "
+ "supplied for cmd(0x%08x)\n", cmd);
goto out;
}
if (param->size > sizeof(*param)) {
err = invalid_str(param->path, param->size - sizeof(*param));
if (err) {
- AUTOFS_WARN(
- "path string terminator missing for cmd(0x%08x)",
+ pr_warn(
+ "path string terminator missing for cmd(0x%08x)\n",
cmd);
goto out;
}
err = check_name(param->path);
if (err) {
- AUTOFS_WARN("invalid path supplied for cmd(0x%08x)",
- cmd);
+ pr_warn("invalid path supplied for cmd(0x%08x)\n",
+ cmd);
goto out;
}
}
@@ -197,7 +197,9 @@ static int find_autofs_mount(const char *pathname,
void *data)
{
struct path path;
- int err = kern_path_mountpoint(AT_FDCWD, pathname, &path, 0);
+ int err;
+
+ err = kern_path_mountpoint(AT_FDCWD, pathname, &path, 0);
if (err)
return err;
err = -ENOENT;
@@ -225,6 +227,7 @@ static int test_by_dev(struct path *path, void *p)
static int test_by_type(struct path *path, void *p)
{
struct autofs_info *ino = autofs4_dentry_ino(path->dentry);
+
return ino && ino->sbi->type & *(unsigned *)p;
}
@@ -370,7 +373,7 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp,
new_pid = get_task_pid(current, PIDTYPE_PGID);
if (ns_of_pid(new_pid) != ns_of_pid(sbi->oz_pgrp)) {
- AUTOFS_WARN("Not allowed to change PID namespace");
+ pr_warn("not allowed to change PID namespace\n");
err = -EINVAL;
goto out;
}
@@ -456,8 +459,10 @@ static int autofs_dev_ioctl_requester(struct file *fp,
err = 0;
autofs4_expire_wait(path.dentry, 0);
spin_lock(&sbi->fs_lock);
- param->requester.uid = from_kuid_munged(current_user_ns(), ino->uid);
- param->requester.gid = from_kgid_munged(current_user_ns(), ino->gid);
+ param->requester.uid =
+ from_kuid_munged(current_user_ns(), ino->uid);
+ param->requester.gid =
+ from_kgid_munged(current_user_ns(), ino->gid);
spin_unlock(&sbi->fs_lock);
}
path_put(&path);
@@ -619,7 +624,8 @@ static ioctl_fn lookup_dev_ioctl(unsigned int cmd)
}
/* ioctl dispatcher */
-static int _autofs_dev_ioctl(unsigned int command, struct autofs_dev_ioctl __user *user)
+static int _autofs_dev_ioctl(unsigned int command,
+ struct autofs_dev_ioctl __user *user)
{
struct autofs_dev_ioctl *param;
struct file *fp;
@@ -655,7 +661,7 @@ static int _autofs_dev_ioctl(unsigned int command, struct autofs_dev_ioctl __use
fn = lookup_dev_ioctl(cmd);
if (!fn) {
- AUTOFS_WARN("unknown command 0x%08x", command);
+ pr_warn("unknown command 0x%08x\n", command);
return -ENOTTY;
}
@@ -711,6 +717,7 @@ out:
static long autofs_dev_ioctl(struct file *file, uint command, ulong u)
{
int err;
+
err = _autofs_dev_ioctl(command, (struct autofs_dev_ioctl __user *) u);
return (long) err;
}
@@ -733,8 +740,8 @@ static const struct file_operations _dev_ioctl_fops = {
static struct miscdevice _autofs_dev_ioctl_misc = {
.minor = AUTOFS_MINOR,
- .name = AUTOFS_DEVICE_NAME,
- .fops = &_dev_ioctl_fops
+ .name = AUTOFS_DEVICE_NAME,
+ .fops = &_dev_ioctl_fops
};
MODULE_ALIAS_MISCDEV(AUTOFS_MINOR);
@@ -747,7 +754,7 @@ int __init autofs_dev_ioctl_init(void)
r = misc_register(&_autofs_dev_ioctl_misc);
if (r) {
- AUTOFS_ERROR("misc_register failed for control device");
+ pr_err("misc_register failed for control device\n");
return r;
}
@@ -757,6 +764,4 @@ int __init autofs_dev_ioctl_init(void)
void autofs_dev_ioctl_exit(void)
{
misc_deregister(&_autofs_dev_ioctl_misc);
- return;
}
-
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 1cebc3c52fa5..9510d8d2e9cd 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -1,16 +1,12 @@
-/* -*- c -*- --------------------------------------------------------------- *
- *
- * linux/fs/autofs/expire.c
- *
- * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
- * Copyright 1999-2000 Jeremy Fitzhardinge <jeremy@goop.org>
- * Copyright 2001-2006 Ian Kent <raven@themaw.net>
+/*
+ * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
+ * Copyright 1999-2000 Jeremy Fitzhardinge <jeremy@goop.org>
+ * Copyright 2001-2006 Ian Kent <raven@themaw.net>
*
* This file is part of the Linux kernel and is made available under
* the terms of the GNU General Public License, version 2, or at your
* option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
+ */
#include "autofs_i.h"
@@ -18,7 +14,7 @@ static unsigned long now;
/* Check if a dentry can be expired */
static inline int autofs4_can_expire(struct dentry *dentry,
- unsigned long timeout, int do_now)
+ unsigned long timeout, int do_now)
{
struct autofs_info *ino = autofs4_dentry_ino(dentry);
@@ -41,7 +37,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
struct path path = {.mnt = mnt, .dentry = dentry};
int status = 1;
- DPRINTK("dentry %p %pd", dentry, dentry);
+ pr_debug("dentry %p %pd\n", dentry, dentry);
path_get(&path);
@@ -58,14 +54,16 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
/* Update the expiry counter if fs is busy */
if (!may_umount_tree(path.mnt)) {
- struct autofs_info *ino = autofs4_dentry_ino(top);
+ struct autofs_info *ino;
+
+ ino = autofs4_dentry_ino(top);
ino->last_used = jiffies;
goto done;
}
status = 0;
done:
- DPRINTK("returning = %d", status);
+ pr_debug("returning = %d\n", status);
path_put(&path);
return status;
}
@@ -74,7 +72,7 @@ done:
* Calculate and dget next entry in the subdirs list under root.
*/
static struct dentry *get_next_positive_subdir(struct dentry *prev,
- struct dentry *root)
+ struct dentry *root)
{
struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb);
struct list_head *next;
@@ -121,7 +119,7 @@ cont:
* Calculate and dget next entry in top down tree traversal.
*/
static struct dentry *get_next_positive_dentry(struct dentry *prev,
- struct dentry *root)
+ struct dentry *root)
{
struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb);
struct list_head *next;
@@ -187,15 +185,17 @@ again:
* autofs submounts.
*/
static int autofs4_direct_busy(struct vfsmount *mnt,
- struct dentry *top,
- unsigned long timeout,
- int do_now)
+ struct dentry *top,
+ unsigned long timeout,
+ int do_now)
{
- DPRINTK("top %p %pd", top, top);
+ pr_debug("top %p %pd\n", top, top);
/* If it's busy update the expiry counters */
if (!may_umount_tree(mnt)) {
- struct autofs_info *ino = autofs4_dentry_ino(top);
+ struct autofs_info *ino;
+
+ ino = autofs4_dentry_ino(top);
if (ino)
ino->last_used = jiffies;
return 1;
@@ -208,7 +208,8 @@ static int autofs4_direct_busy(struct vfsmount *mnt,
return 0;
}
-/* Check a directory tree of mount points for busyness
+/*
+ * Check a directory tree of mount points for busyness
* The tree is not busy iff no mountpoints are busy
*/
static int autofs4_tree_busy(struct vfsmount *mnt,
@@ -219,7 +220,7 @@ static int autofs4_tree_busy(struct vfsmount *mnt,
struct autofs_info *top_ino = autofs4_dentry_ino(top);
struct dentry *p;
- DPRINTK("top %p %pd", top, top);
+ pr_debug("top %p %pd\n", top, top);
/* Negative dentry - give up */
if (!simple_positive(top))
@@ -227,7 +228,7 @@ static int autofs4_tree_busy(struct vfsmount *mnt,
p = NULL;
while ((p = get_next_positive_dentry(p, top))) {
- DPRINTK("dentry %p %pd", p, p);
+ pr_debug("dentry %p %pd\n", p, p);
/*
* Is someone visiting anywhere in the subtree ?
@@ -273,11 +274,11 @@ static struct dentry *autofs4_check_leaves(struct vfsmount *mnt,
{
struct dentry *p;
- DPRINTK("parent %p %pd", parent, parent);
+ pr_debug("parent %p %pd\n", parent, parent);
p = NULL;
while ((p = get_next_positive_dentry(p, parent))) {
- DPRINTK("dentry %p %pd", p, p);
+ pr_debug("dentry %p %pd\n", p, p);
if (d_mountpoint(p)) {
/* Can we umount this guy */
@@ -362,7 +363,7 @@ static struct dentry *should_expire(struct dentry *dentry,
* offset (autofs-5.0+).
*/
if (d_mountpoint(dentry)) {
- DPRINTK("checking mountpoint %p %pd", dentry, dentry);
+ pr_debug("checking mountpoint %p %pd\n", dentry, dentry);
/* Can we umount this guy */
if (autofs4_mount_busy(mnt, dentry))
@@ -375,7 +376,7 @@ static struct dentry *should_expire(struct dentry *dentry,
}
if (d_really_is_positive(dentry) && d_is_symlink(dentry)) {
- DPRINTK("checking symlink %p %pd", dentry, dentry);
+ pr_debug("checking symlink %p %pd\n", dentry, dentry);
/*
* A symlink can't be "busy" in the usual sense so
* just check last used for expire timeout.
@@ -404,6 +405,7 @@ static struct dentry *should_expire(struct dentry *dentry,
} else {
/* Path walk currently on this dentry? */
struct dentry *expired;
+
ino_count = atomic_read(&ino->count) + 1;
if (d_count(dentry) > ino_count)
return NULL;
@@ -471,7 +473,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
return NULL;
found:
- DPRINTK("returning %p %pd", expired, expired);
+ pr_debug("returning %p %pd\n", expired, expired);
ino->flags |= AUTOFS_INF_EXPIRING;
smp_mb();
ino->flags &= ~AUTOFS_INF_NO_RCU;
@@ -503,12 +505,12 @@ int autofs4_expire_wait(struct dentry *dentry, int rcu_walk)
if (ino->flags & AUTOFS_INF_EXPIRING) {
spin_unlock(&sbi->fs_lock);
- DPRINTK("waiting for expire %p name=%pd", dentry, dentry);
+ pr_debug("waiting for expire %p name=%pd\n", dentry, dentry);
status = autofs4_wait(sbi, dentry, NFY_NONE);
wait_for_completion(&ino->expire_complete);
- DPRINTK("expire done status=%d", status);
+ pr_debug("expire done status=%d\n", status);
if (d_unhashed(dentry))
return -EAGAIN;
@@ -522,21 +524,22 @@ int autofs4_expire_wait(struct dentry *dentry, int rcu_walk)
/* Perform an expiry operation */
int autofs4_expire_run(struct super_block *sb,
- struct vfsmount *mnt,
- struct autofs_sb_info *sbi,
- struct autofs_packet_expire __user *pkt_p)
+ struct vfsmount *mnt,
+ struct autofs_sb_info *sbi,
+ struct autofs_packet_expire __user *pkt_p)
{
struct autofs_packet_expire pkt;
struct autofs_info *ino;
struct dentry *dentry;
int ret = 0;
- memset(&pkt,0,sizeof pkt);
+ memset(&pkt, 0, sizeof(pkt));
pkt.hdr.proto_version = sbi->version;
pkt.hdr.type = autofs_ptype_expire;
- if ((dentry = autofs4_expire_indirect(sb, mnt, sbi, 0)) == NULL)
+ dentry = autofs4_expire_indirect(sb, mnt, sbi, 0);
+ if (!dentry)
return -EAGAIN;
pkt.len = dentry->d_name.len;
@@ -544,7 +547,7 @@ int autofs4_expire_run(struct super_block *sb,
pkt.name[pkt.len] = '\0';
dput(dentry);
- if ( copy_to_user(pkt_p, &pkt, sizeof(struct autofs_packet_expire)) )
+ if (copy_to_user(pkt_p, &pkt, sizeof(struct autofs_packet_expire)))
ret = -EFAULT;
spin_lock(&sbi->fs_lock);
@@ -573,7 +576,8 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
struct autofs_info *ino = autofs4_dentry_ino(dentry);
/* This is synchronous because it makes the daemon a
- little easier */
+ * little easier
+ */
ret = autofs4_wait(sbi, dentry, NFY_EXPIRE);
spin_lock(&sbi->fs_lock);
@@ -588,8 +592,10 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
return ret;
}
-/* Call repeatedly until it returns -EAGAIN, meaning there's nothing
- more to be done */
+/*
+ * Call repeatedly until it returns -EAGAIN, meaning there's nothing
+ * more to be done.
+ */
int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt,
struct autofs_sb_info *sbi, int __user *arg)
{
diff --git a/fs/autofs4/init.c b/fs/autofs4/init.c
index b3db517e89ec..8cf0e63389ae 100644
--- a/fs/autofs4/init.c
+++ b/fs/autofs4/init.c
@@ -1,14 +1,10 @@
-/* -*- c -*- --------------------------------------------------------------- *
- *
- * linux/fs/autofs/init.c
- *
- * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
+/*
+ * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
*
* This file is part of the Linux kernel and is made available under
* the terms of the GNU General Public License, version 2, or at your
* option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
+ */
#include <linux/module.h>
#include <linux/init.h>
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index a3ae0b2aeb5a..61b21051bd5a 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -1,15 +1,11 @@
-/* -*- c -*- --------------------------------------------------------------- *
- *
- * linux/fs/autofs/inode.c
- *
- * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
- * Copyright 2005-2006 Ian Kent <raven@themaw.net>
+/*
+ * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
+ * Copyright 2005-2006 Ian Kent <raven@themaw.net>
*
* This file is part of the Linux kernel and is made available under
* the terms of the GNU General Public License, version 2, or at your
* option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
+ */
#include <linux/kernel.h>
#include <linux/slab.h>
@@ -24,7 +20,9 @@
struct autofs_info *autofs4_new_ino(struct autofs_sb_info *sbi)
{
- struct autofs_info *ino = kzalloc(sizeof(*ino), GFP_KERNEL);
+ struct autofs_info *ino;
+
+ ino = kzalloc(sizeof(*ino), GFP_KERNEL);
if (ino) {
INIT_LIST_HEAD(&ino->active);
INIT_LIST_HEAD(&ino->expiring);
@@ -62,7 +60,7 @@ void autofs4_kill_sb(struct super_block *sb)
put_pid(sbi->oz_pgrp);
}
- DPRINTK("shutting down");
+ pr_debug("shutting down\n");
kill_litter_super(sb);
if (sbi)
kfree_rcu(sbi, rcu);
@@ -94,7 +92,12 @@ static int autofs4_show_options(struct seq_file *m, struct dentry *root)
seq_printf(m, ",direct");
else
seq_printf(m, ",indirect");
-
+#ifdef CONFIG_CHECKPOINT_RESTORE
+ if (sbi->pipe)
+ seq_printf(m, ",pipe_ino=%ld", sbi->pipe->f_inode->i_ino);
+ else
+ seq_printf(m, ",pipe_ino=-1");
+#endif
return 0;
}
@@ -147,6 +150,7 @@ static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid,
while ((p = strsep(&options, ",")) != NULL) {
int token;
+
if (!*p)
continue;
@@ -204,9 +208,9 @@ static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid,
int autofs4_fill_super(struct super_block *s, void *data, int silent)
{
- struct inode * root_inode;
- struct dentry * root;
- struct file * pipe;
+ struct inode *root_inode;
+ struct dentry *root;
+ struct file *pipe;
int pipefd;
struct autofs_sb_info *sbi;
struct autofs_info *ino;
@@ -217,7 +221,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
if (!sbi)
return -ENOMEM;
- DPRINTK("starting up, sbi = %p",sbi);
+ pr_debug("starting up, sbi = %p\n", sbi);
s->s_fs_info = sbi;
sbi->magic = AUTOFS_SBI_MAGIC;
@@ -266,14 +270,14 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
if (parse_options(data, &pipefd, &root_inode->i_uid, &root_inode->i_gid,
&pgrp, &pgrp_set, &sbi->type, &sbi->min_proto,
&sbi->max_proto)) {
- printk("autofs: called with bogus options\n");
+ pr_err("called with bogus options\n");
goto fail_dput;
}
if (pgrp_set) {
sbi->oz_pgrp = find_get_pid(pgrp);
if (!sbi->oz_pgrp) {
- pr_warn("autofs: could not find process group %d\n",
+ pr_err("could not find process group %d\n",
pgrp);
goto fail_dput;
}
@@ -290,10 +294,10 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
/* Couldn't this be tested earlier? */
if (sbi->max_proto < AUTOFS_MIN_PROTO_VERSION ||
sbi->min_proto > AUTOFS_MAX_PROTO_VERSION) {
- printk("autofs: kernel does not match daemon version "
+ pr_err("kernel does not match daemon version "
"daemon (%d, %d) kernel (%d, %d)\n",
- sbi->min_proto, sbi->max_proto,
- AUTOFS_MIN_PROTO_VERSION, AUTOFS_MAX_PROTO_VERSION);
+ sbi->min_proto, sbi->max_proto,
+ AUTOFS_MIN_PROTO_VERSION, AUTOFS_MAX_PROTO_VERSION);
goto fail_dput;
}
@@ -304,11 +308,11 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
sbi->version = sbi->max_proto;
sbi->sub_version = AUTOFS_PROTO_SUBVERSION;
- DPRINTK("pipe fd = %d, pgrp = %u", pipefd, pid_nr(sbi->oz_pgrp));
+ pr_debug("pipe fd = %d, pgrp = %u\n", pipefd, pid_nr(sbi->oz_pgrp));
pipe = fget(pipefd);
if (!pipe) {
- printk("autofs: could not open pipe file descriptor\n");
+ pr_err("could not open pipe file descriptor\n");
goto fail_dput;
}
ret = autofs_prepare_pipe(pipe);
@@ -323,12 +327,12 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
*/
s->s_root = root;
return 0;
-
+
/*
* Failure ... clean up.
*/
fail_fput:
- printk("autofs: pipe file descriptor does not contain proper ops\n");
+ pr_err("pipe file descriptor does not contain proper ops\n");
fput(pipe);
/* fall through */
fail_dput:
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index c6d7d3dbd52a..7ab923940d18 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -1,16 +1,12 @@
-/* -*- c -*- --------------------------------------------------------------- *
- *
- * linux/fs/autofs/root.c
- *
- * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
- * Copyright 1999-2000 Jeremy Fitzhardinge <jeremy@goop.org>
- * Copyright 2001-2006 Ian Kent <raven@themaw.net>
+/*
+ * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
+ * Copyright 1999-2000 Jeremy Fitzhardinge <jeremy@goop.org>
+ * Copyright 2001-2006 Ian Kent <raven@themaw.net>
*
* This file is part of the Linux kernel and is made available under
* the terms of the GNU General Public License, version 2, or at your
* option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
+ */
#include <linux/capability.h>
#include <linux/errno.h>
@@ -23,16 +19,18 @@
#include "autofs_i.h"
-static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *);
-static int autofs4_dir_unlink(struct inode *,struct dentry *);
-static int autofs4_dir_rmdir(struct inode *,struct dentry *);
-static int autofs4_dir_mkdir(struct inode *,struct dentry *,umode_t);
-static long autofs4_root_ioctl(struct file *,unsigned int,unsigned long);
+static int autofs4_dir_symlink(struct inode *, struct dentry *, const char *);
+static int autofs4_dir_unlink(struct inode *, struct dentry *);
+static int autofs4_dir_rmdir(struct inode *, struct dentry *);
+static int autofs4_dir_mkdir(struct inode *, struct dentry *, umode_t);
+static long autofs4_root_ioctl(struct file *, unsigned int, unsigned long);
#ifdef CONFIG_COMPAT
-static long autofs4_root_compat_ioctl(struct file *,unsigned int,unsigned long);
+static long autofs4_root_compat_ioctl(struct file *,
+ unsigned int, unsigned long);
#endif
static int autofs4_dir_open(struct inode *inode, struct file *file);
-static struct dentry *autofs4_lookup(struct inode *,struct dentry *, unsigned int);
+static struct dentry *autofs4_lookup(struct inode *,
+ struct dentry *, unsigned int);
static struct vfsmount *autofs4_d_automount(struct path *);
static int autofs4_d_manage(struct dentry *, bool);
static void autofs4_dentry_release(struct dentry *);
@@ -74,7 +72,9 @@ const struct dentry_operations autofs4_dentry_operations = {
static void autofs4_add_active(struct dentry *dentry)
{
struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
- struct autofs_info *ino = autofs4_dentry_ino(dentry);
+ struct autofs_info *ino;
+
+ ino = autofs4_dentry_ino(dentry);
if (ino) {
spin_lock(&sbi->lookup_lock);
if (!ino->active_count) {
@@ -84,13 +84,14 @@ static void autofs4_add_active(struct dentry *dentry)
ino->active_count++;
spin_unlock(&sbi->lookup_lock);
}
- return;
}
static void autofs4_del_active(struct dentry *dentry)
{
struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
- struct autofs_info *ino = autofs4_dentry_ino(dentry);
+ struct autofs_info *ino;
+
+ ino = autofs4_dentry_ino(dentry);
if (ino) {
spin_lock(&sbi->lookup_lock);
ino->active_count--;
@@ -100,7 +101,6 @@ static void autofs4_del_active(struct dentry *dentry)
}
spin_unlock(&sbi->lookup_lock);
}
- return;
}
static int autofs4_dir_open(struct inode *inode, struct file *file)
@@ -108,7 +108,7 @@ static int autofs4_dir_open(struct inode *inode, struct file *file)
struct dentry *dentry = file->f_path.dentry;
struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
- DPRINTK("file=%p dentry=%p %pd", file, dentry, dentry);
+ pr_debug("file=%p dentry=%p %pd\n", file, dentry, dentry);
if (autofs4_oz_mode(sbi))
goto out;
@@ -138,7 +138,7 @@ static void autofs4_dentry_release(struct dentry *de)
struct autofs_info *ino = autofs4_dentry_ino(de);
struct autofs_sb_info *sbi = autofs4_sbi(de->d_sb);
- DPRINTK("releasing %p", de);
+ pr_debug("releasing %p\n", de);
if (!ino)
return;
@@ -278,9 +278,9 @@ static int autofs4_mount_wait(struct dentry *dentry, bool rcu_walk)
if (ino->flags & AUTOFS_INF_PENDING) {
if (rcu_walk)
return -ECHILD;
- DPRINTK("waiting for mount name=%pd", dentry);
+ pr_debug("waiting for mount name=%pd\n", dentry);
status = autofs4_wait(sbi, dentry, NFY_MOUNT);
- DPRINTK("mount wait done status=%d", status);
+ pr_debug("mount wait done status=%d\n", status);
}
ino->last_used = jiffies;
return status;
@@ -320,7 +320,9 @@ static struct dentry *autofs4_mountpoint_changed(struct path *path)
if (autofs_type_indirect(sbi->type) && d_unhashed(dentry)) {
struct dentry *parent = dentry->d_parent;
struct autofs_info *ino;
- struct dentry *new = d_lookup(parent, &dentry->d_name);
+ struct dentry *new;
+
+ new = d_lookup(parent, &dentry->d_name);
if (!new)
return NULL;
ino = autofs4_dentry_ino(new);
@@ -338,7 +340,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
struct autofs_info *ino = autofs4_dentry_ino(dentry);
int status;
- DPRINTK("dentry=%p %pd", dentry, dentry);
+ pr_debug("dentry=%p %pd\n", dentry, dentry);
/* The daemon never triggers a mount. */
if (autofs4_oz_mode(sbi))
@@ -425,7 +427,7 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
struct autofs_info *ino = autofs4_dentry_ino(dentry);
int status;
- DPRINTK("dentry=%p %pd", dentry, dentry);
+ pr_debug("dentry=%p %pd\n", dentry, dentry);
/* The daemon never waits. */
if (autofs4_oz_mode(sbi)) {
@@ -455,6 +457,7 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
* a mount-trap.
*/
struct inode *inode;
+
if (ino->flags & (AUTOFS_INF_EXPIRING | AUTOFS_INF_NO_RCU))
return 0;
if (d_mountpoint(dentry))
@@ -494,13 +497,14 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
}
/* Lookups in the root directory */
-static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
+static struct dentry *autofs4_lookup(struct inode *dir,
+ struct dentry *dentry, unsigned int flags)
{
struct autofs_sb_info *sbi;
struct autofs_info *ino;
struct dentry *active;
- DPRINTK("name = %pd", dentry);
+ pr_debug("name = %pd\n", dentry);
/* File name too long to exist */
if (dentry->d_name.len > NAME_MAX)
@@ -508,14 +512,14 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, u
sbi = autofs4_sbi(dir->i_sb);
- DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d",
- current->pid, task_pgrp_nr(current), sbi->catatonic,
- autofs4_oz_mode(sbi));
+ pr_debug("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d\n",
+ current->pid, task_pgrp_nr(current), sbi->catatonic,
+ autofs4_oz_mode(sbi));
active = autofs4_lookup_active(dentry);
- if (active) {
+ if (active)
return active;
- } else {
+ else {
/*
* A dentry that is not within the root can never trigger a
* mount operation, unless the directory already exists, so we
@@ -526,7 +530,8 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, u
return ERR_PTR(-ENOENT);
/* Mark entries in the root as mount triggers */
- if (autofs_type_indirect(sbi->type) && IS_ROOT(dentry->d_parent))
+ if (IS_ROOT(dentry->d_parent) &&
+ autofs_type_indirect(sbi->type))
__managed_dentry_set_managed(dentry);
ino = autofs4_new_ino(sbi);
@@ -537,8 +542,6 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, u
ino->dentry = dentry;
autofs4_add_active(dentry);
-
- d_instantiate(dentry, NULL);
}
return NULL;
}
@@ -554,7 +557,7 @@ static int autofs4_dir_symlink(struct inode *dir,
size_t size = strlen(symname);
char *cp;
- DPRINTK("%s <- %pd", symname, dentry);
+ pr_debug("%s <- %pd\n", symname, dentry);
if (!autofs4_oz_mode(sbi))
return -EACCES;
@@ -613,7 +616,7 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb);
struct autofs_info *ino = autofs4_dentry_ino(dentry);
struct autofs_info *p_ino;
-
+
/* This allows root to remove symlinks */
if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -664,7 +667,6 @@ static void autofs_set_leaf_automount_flags(struct dentry *dentry)
if (IS_ROOT(parent->d_parent))
return;
managed_dentry_clear_managed(parent);
- return;
}
static void autofs_clear_leaf_automount_flags(struct dentry *dentry)
@@ -687,7 +689,6 @@ static void autofs_clear_leaf_automount_flags(struct dentry *dentry)
if (d_child->next == &parent->d_subdirs &&
d_child->prev == &parent->d_subdirs)
managed_dentry_set_managed(parent);
- return;
}
static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
@@ -695,8 +696,8 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb);
struct autofs_info *ino = autofs4_dentry_ino(dentry);
struct autofs_info *p_ino;
-
- DPRINTK("dentry %p, removing %pd", dentry, dentry);
+
+ pr_debug("dentry %p, removing %pd\n", dentry, dentry);
if (!autofs4_oz_mode(sbi))
return -EACCES;
@@ -728,7 +729,8 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
return 0;
}
-static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int autofs4_dir_mkdir(struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb);
struct autofs_info *ino = autofs4_dentry_ino(dentry);
@@ -738,7 +740,7 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, umode_t m
if (!autofs4_oz_mode(sbi))
return -EACCES;
- DPRINTK("dentry %p, creating %pd", dentry, dentry);
+ pr_debug("dentry %p, creating %pd\n", dentry, dentry);
BUG_ON(!ino);
@@ -768,14 +770,18 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, umode_t m
/* Get/set timeout ioctl() operation */
#ifdef CONFIG_COMPAT
static inline int autofs4_compat_get_set_timeout(struct autofs_sb_info *sbi,
- compat_ulong_t __user *p)
+ compat_ulong_t __user *p)
{
- int rv;
unsigned long ntimeout;
+ int rv;
+
+ rv = get_user(ntimeout, p);
+ if (rv)
+ goto error;
- if ((rv = get_user(ntimeout, p)) ||
- (rv = put_user(sbi->exp_timeout/HZ, p)))
- return rv;
+ rv = put_user(sbi->exp_timeout/HZ, p);
+ if (rv)
+ goto error;
if (ntimeout > UINT_MAX/HZ)
sbi->exp_timeout = 0;
@@ -783,18 +789,24 @@ static inline int autofs4_compat_get_set_timeout(struct autofs_sb_info *sbi,
sbi->exp_timeout = ntimeout * HZ;
return 0;
+error:
+ return rv;
}
#endif
static inline int autofs4_get_set_timeout(struct autofs_sb_info *sbi,
- unsigned long __user *p)
+ unsigned long __user *p)
{
- int rv;
unsigned long ntimeout;
+ int rv;
+
+ rv = get_user(ntimeout, p);
+ if (rv)
+ goto error;
- if ((rv = get_user(ntimeout, p)) ||
- (rv = put_user(sbi->exp_timeout/HZ, p)))
- return rv;
+ rv = put_user(sbi->exp_timeout/HZ, p);
+ if (rv)
+ goto error;
if (ntimeout > ULONG_MAX/HZ)
sbi->exp_timeout = 0;
@@ -802,16 +814,20 @@ static inline int autofs4_get_set_timeout(struct autofs_sb_info *sbi,
sbi->exp_timeout = ntimeout * HZ;
return 0;
+error:
+ return rv;
}
/* Return protocol version */
-static inline int autofs4_get_protover(struct autofs_sb_info *sbi, int __user *p)
+static inline int autofs4_get_protover(struct autofs_sb_info *sbi,
+ int __user *p)
{
return put_user(sbi->version, p);
}
/* Return protocol sub version */
-static inline int autofs4_get_protosubver(struct autofs_sb_info *sbi, int __user *p)
+static inline int autofs4_get_protosubver(struct autofs_sb_info *sbi,
+ int __user *p)
{
return put_user(sbi->sub_version, p);
}
@@ -826,7 +842,7 @@ static inline int autofs4_ask_umount(struct vfsmount *mnt, int __user *p)
if (may_umount(mnt))
status = 1;
- DPRINTK("returning %d", status);
+ pr_debug("returning %d\n", status);
status = put_user(status, p);
@@ -834,9 +850,9 @@ static inline int autofs4_ask_umount(struct vfsmount *mnt, int __user *p)
}
/* Identify autofs4_dentries - this is so we can tell if there's
- an extra dentry refcount or not. We only hold a refcount on the
- dentry if its non-negative (ie, d_inode != NULL)
-*/
+ * an extra dentry refcount or not. We only hold a refcount on the
+ * dentry if its non-negative (ie, d_inode != NULL)
+ */
int is_autofs4_dentry(struct dentry *dentry)
{
return dentry && d_really_is_positive(dentry) &&
@@ -854,21 +870,21 @@ static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp,
struct autofs_sb_info *sbi = autofs4_sbi(inode->i_sb);
void __user *p = (void __user *)arg;
- DPRINTK("cmd = 0x%08x, arg = 0x%08lx, sbi = %p, pgrp = %u",
- cmd,arg,sbi,task_pgrp_nr(current));
+ pr_debug("cmd = 0x%08x, arg = 0x%08lx, sbi = %p, pgrp = %u\n",
+ cmd, arg, sbi, task_pgrp_nr(current));
if (_IOC_TYPE(cmd) != _IOC_TYPE(AUTOFS_IOC_FIRST) ||
_IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT)
return -ENOTTY;
-
+
if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
return -EPERM;
-
- switch(cmd) {
+
+ switch (cmd) {
case AUTOFS_IOC_READY: /* Wait queue: go ahead and retry */
- return autofs4_wait_release(sbi,(autofs_wqt_t)arg,0);
+ return autofs4_wait_release(sbi, (autofs_wqt_t) arg, 0);
case AUTOFS_IOC_FAIL: /* Wait queue: fail with ENOENT */
- return autofs4_wait_release(sbi,(autofs_wqt_t)arg,-ENOENT);
+ return autofs4_wait_release(sbi, (autofs_wqt_t) arg, -ENOENT);
case AUTOFS_IOC_CATATONIC: /* Enter catatonic mode (daemon shutdown) */
autofs4_catatonic_mode(sbi);
return 0;
@@ -888,13 +904,15 @@ static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp,
/* return a single thing to expire */
case AUTOFS_IOC_EXPIRE:
- return autofs4_expire_run(inode->i_sb,filp->f_path.mnt,sbi, p);
+ return autofs4_expire_run(inode->i_sb,
+ filp->f_path.mnt, sbi, p);
/* same as above, but can send multiple expires through pipe */
case AUTOFS_IOC_EXPIRE_MULTI:
- return autofs4_expire_multi(inode->i_sb,filp->f_path.mnt,sbi, p);
+ return autofs4_expire_multi(inode->i_sb,
+ filp->f_path.mnt, sbi, p);
default:
- return -ENOSYS;
+ return -EINVAL;
}
}
@@ -902,12 +920,13 @@ static long autofs4_root_ioctl(struct file *filp,
unsigned int cmd, unsigned long arg)
{
struct inode *inode = file_inode(filp);
+
return autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
}
#ifdef CONFIG_COMPAT
static long autofs4_root_compat_ioctl(struct file *filp,
- unsigned int cmd, unsigned long arg)
+ unsigned int cmd, unsigned long arg)
{
struct inode *inode = file_inode(filp);
int ret;
@@ -916,7 +935,7 @@ static long autofs4_root_compat_ioctl(struct file *filp,
ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
else
ret = autofs4_root_ioctl_unlocked(inode, filp, cmd,
- (unsigned long)compat_ptr(arg));
+ (unsigned long) compat_ptr(arg));
return ret;
}
diff --git a/fs/autofs4/symlink.c b/fs/autofs4/symlink.c
index 84e037d1d129..99aab00dc217 100644
--- a/fs/autofs4/symlink.c
+++ b/fs/autofs4/symlink.c
@@ -1,14 +1,10 @@
-/* -*- c -*- --------------------------------------------------------------- *
- *
- * linux/fs/autofs/symlink.c
- *
- * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
+/*
+ * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
*
* This file is part of the Linux kernel and is made available under
* the terms of the GNU General Public License, version 2, or at your
* option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
+ */
#include "autofs_i.h"
@@ -18,6 +14,7 @@ static const char *autofs4_get_link(struct dentry *dentry,
{
struct autofs_sb_info *sbi;
struct autofs_info *ino;
+
if (!dentry)
return ERR_PTR(-ECHILD);
sbi = autofs4_sbi(dentry->d_sb);
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 35b755e79c2d..0146d911f468 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -1,15 +1,11 @@
-/* -*- c -*- --------------------------------------------------------------- *
- *
- * linux/fs/autofs/waitq.c
- *
- * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
- * Copyright 2001-2006 Ian Kent <raven@themaw.net>
+/*
+ * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
+ * Copyright 2001-2006 Ian Kent <raven@themaw.net>
*
* This file is part of the Linux kernel and is made available under
* the terms of the GNU General Public License, version 2, or at your
* option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
+ */
#include <linux/slab.h>
#include <linux/time.h>
@@ -18,7 +14,8 @@
#include "autofs_i.h"
/* We make this a static variable rather than a part of the superblock; it
- is better if we don't reassign numbers easily even across filesystems */
+ * is better if we don't reassign numbers easily even across filesystems
+ */
static autofs_wqt_t autofs4_next_wait_queue = 1;
/* These are the signals we allow interrupting a pending mount */
@@ -34,7 +31,7 @@ void autofs4_catatonic_mode(struct autofs_sb_info *sbi)
return;
}
- DPRINTK("entering catatonic mode");
+ pr_debug("entering catatonic mode\n");
sbi->catatonic = 1;
wq = sbi->queues;
@@ -69,17 +66,19 @@ static int autofs4_write(struct autofs_sb_info *sbi,
set_fs(KERNEL_DS);
mutex_lock(&sbi->pipe_mutex);
- while (bytes &&
- (wr = __vfs_write(file,data,bytes,&file->f_pos)) > 0) {
+ wr = __vfs_write(file, data, bytes, &file->f_pos);
+ while (bytes && wr) {
data += wr;
bytes -= wr;
+ wr = __vfs_write(file, data, bytes, &file->f_pos);
}
mutex_unlock(&sbi->pipe_mutex);
set_fs(fs);
/* Keep the currently executing process from receiving a
- SIGPIPE unless it was already supposed to get one */
+ * SIGPIPE unless it was already supposed to get one
+ */
if (wr == -EPIPE && !sigpipe) {
spin_lock_irqsave(&current->sighand->siglock, flags);
sigdelset(&current->pending.signal, SIGPIPE);
@@ -89,7 +88,7 @@ static int autofs4_write(struct autofs_sb_info *sbi,
return (bytes > 0);
}
-
+
static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
struct autofs_wait_queue *wq,
int type)
@@ -102,10 +101,11 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
struct file *pipe = NULL;
size_t pktsz;
- DPRINTK("wait id = 0x%08lx, name = %.*s, type=%d",
- (unsigned long) wq->wait_queue_token, wq->name.len, wq->name.name, type);
+ pr_debug("wait id = 0x%08lx, name = %.*s, type=%d\n",
+ (unsigned long) wq->wait_queue_token,
+ wq->name.len, wq->name.name, type);
- memset(&pkt,0,sizeof pkt); /* For security reasons */
+ memset(&pkt, 0, sizeof(pkt)); /* For security reasons */
pkt.hdr.proto_version = sbi->version;
pkt.hdr.type = type;
@@ -126,7 +126,8 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
}
case autofs_ptype_expire_multi:
{
- struct autofs_packet_expire_multi *ep = &pkt.v4_pkt.expire_multi;
+ struct autofs_packet_expire_multi *ep =
+ &pkt.v4_pkt.expire_multi;
pktsz = sizeof(*ep);
@@ -163,7 +164,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
break;
}
default:
- printk("autofs4_notify_daemon: bad type %d!\n", type);
+ pr_warn("bad type %d!\n", type);
mutex_unlock(&sbi->wq_mutex);
return;
}
@@ -231,7 +232,7 @@ autofs4_find_wait(struct autofs_sb_info *sbi, struct qstr *qstr)
if (wq->name.hash == qstr->hash &&
wq->name.len == qstr->len &&
wq->name.name &&
- !memcmp(wq->name.name, qstr->name, qstr->len))
+ !memcmp(wq->name.name, qstr->name, qstr->len))
break;
}
return wq;
@@ -248,7 +249,7 @@ autofs4_find_wait(struct autofs_sb_info *sbi, struct qstr *qstr)
static int validate_request(struct autofs_wait_queue **wait,
struct autofs_sb_info *sbi,
struct qstr *qstr,
- struct dentry*dentry, enum autofs_notify notify)
+ struct dentry *dentry, enum autofs_notify notify)
{
struct autofs_wait_queue *wq;
struct autofs_info *ino;
@@ -322,8 +323,10 @@ static int validate_request(struct autofs_wait_queue **wait,
* continue on and create a new request.
*/
if (!IS_ROOT(dentry)) {
- if (d_really_is_positive(dentry) && d_unhashed(dentry)) {
+ if (d_unhashed(dentry) &&
+ d_really_is_positive(dentry)) {
struct dentry *parent = dentry->d_parent;
+
new = d_lookup(parent, &dentry->d_name);
if (new)
dentry = new;
@@ -340,8 +343,8 @@ static int validate_request(struct autofs_wait_queue **wait,
return 1;
}
-int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
- enum autofs_notify notify)
+int autofs4_wait(struct autofs_sb_info *sbi,
+ struct dentry *dentry, enum autofs_notify notify)
{
struct autofs_wait_queue *wq;
struct qstr qstr;
@@ -411,7 +414,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
if (!wq) {
/* Create a new wait queue */
- wq = kmalloc(sizeof(struct autofs_wait_queue),GFP_KERNEL);
+ wq = kmalloc(sizeof(struct autofs_wait_queue), GFP_KERNEL);
if (!wq) {
kfree(qstr.name);
mutex_unlock(&sbi->wq_mutex);
@@ -450,17 +453,19 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
autofs_ptype_expire_indirect;
}
- DPRINTK("new wait id = 0x%08lx, name = %.*s, nfy=%d\n",
- (unsigned long) wq->wait_queue_token, wq->name.len,
- wq->name.name, notify);
+ pr_debug("new wait id = 0x%08lx, name = %.*s, nfy=%d\n",
+ (unsigned long) wq->wait_queue_token, wq->name.len,
+ wq->name.name, notify);
- /* autofs4_notify_daemon() may block; it will unlock ->wq_mutex */
+ /*
+ * autofs4_notify_daemon() may block; it will unlock ->wq_mutex
+ */
autofs4_notify_daemon(sbi, wq, type);
} else {
wq->wait_ctr++;
- DPRINTK("existing wait id = 0x%08lx, name = %.*s, nfy=%d",
- (unsigned long) wq->wait_queue_token, wq->name.len,
- wq->name.name, notify);
+ pr_debug("existing wait id = 0x%08lx, name = %.*s, nfy=%d\n",
+ (unsigned long) wq->wait_queue_token, wq->name.len,
+ wq->name.name, notify);
mutex_unlock(&sbi->wq_mutex);
kfree(qstr.name);
}
@@ -471,12 +476,14 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
*/
if (wq->name.name) {
/* Block all but "shutdown" signals while waiting */
- sigset_t oldset;
+ unsigned long shutdown_sigs_mask;
unsigned long irqflags;
+ sigset_t oldset;
spin_lock_irqsave(&current->sighand->siglock, irqflags);
oldset = current->blocked;
- siginitsetinv(&current->blocked, SHUTDOWN_SIGS & ~oldset.sig[0]);
+ shutdown_sigs_mask = SHUTDOWN_SIGS & ~oldset.sig[0];
+ siginitsetinv(&current->blocked, shutdown_sigs_mask);
recalc_sigpending();
spin_unlock_irqrestore(&current->sighand->siglock, irqflags);
@@ -487,7 +494,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
recalc_sigpending();
spin_unlock_irqrestore(&current->sighand->siglock, irqflags);
} else {
- DPRINTK("skipped sleeping");
+ pr_debug("skipped sleeping\n");
}
status = wq->status;
@@ -562,4 +569,3 @@ int autofs4_wait_release(struct autofs_sb_info *sbi, autofs_wqt_t wait_queue_tok
return 0;
}
-
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 826b164a4b5b..3172c4e2f502 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -575,7 +575,11 @@ static const struct super_operations bdev_sops = {
static struct dentry *bd_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
- return mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, BDEVFS_MAGIC);
+ struct dentry *dent;
+ dent = mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, BDEVFS_MAGIC);
+ if (dent)
+ dent->d_sb->s_iflags |= SB_I_CGROUPWB;
+ return dent;
}
static struct file_system_type bd_type = {
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index f6705252b5b2..d01f89d130e0 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -938,7 +938,7 @@ static int check_async_write(struct inode *inode, unsigned long bio_flags)
if (bio_flags & EXTENT_BIO_TREE_LOG)
return 0;
#ifdef CONFIG_X86
- if (static_cpu_has_safe(X86_FEATURE_XMM4_2))
+ if (static_cpu_has(X86_FEATURE_XMM4_2))
return 0;
#endif
return 1;
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index a25f3b21491b..9fcd6dfc3266 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -310,8 +310,16 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state);
err = btrfs_insert_fs_root(root->fs_info, root);
+ /*
+ * The root might have been inserted already, as before we look
+ * for orphan roots, log replay might have happened, which
+ * triggers a transaction commit and qgroup accounting, which
+ * in turn reads and inserts fs roots while doing backref
+ * walking.
+ */
+ if (err == -EEXIST)
+ err = 0;
if (err) {
- BUG_ON(err == -EEXIST);
btrfs_free_fs_root(root);
break;
}
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index d39f714dabeb..f54bf450bad3 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -137,7 +137,6 @@ static void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
void **slot;
spin_lock(&fs_info->buffer_lock);
-restart:
radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) {
struct extent_buffer *eb;
@@ -147,7 +146,7 @@ restart:
/* Shouldn't happen but that kind of thinking creates CVE's */
if (radix_tree_exception(eb)) {
if (radix_tree_deref_retry(eb))
- goto restart;
+ slot = radix_tree_iter_retry(&iter);
continue;
}
spin_unlock(&fs_info->buffer_lock);
diff --git a/fs/buffer.c b/fs/buffer.c
index e1632abb4ca9..33be29675358 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -621,17 +621,17 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode);
* If warn is true, then emit a warning if the page is not uptodate and has
* not been truncated.
*
- * The caller must hold mem_cgroup_begin_page_stat() lock.
+ * The caller must hold lock_page_memcg().
*/
static void __set_page_dirty(struct page *page, struct address_space *mapping,
- struct mem_cgroup *memcg, int warn)
+ int warn)
{
unsigned long flags;
spin_lock_irqsave(&mapping->tree_lock, flags);
if (page->mapping) { /* Race with truncate? */
WARN_ON_ONCE(warn && !PageUptodate(page));
- account_page_dirtied(page, mapping, memcg);
+ account_page_dirtied(page, mapping);
radix_tree_tag_set(&mapping->page_tree,
page_index(page), PAGECACHE_TAG_DIRTY);
}
@@ -666,7 +666,6 @@ static void __set_page_dirty(struct page *page, struct address_space *mapping,
int __set_page_dirty_buffers(struct page *page)
{
int newly_dirty;
- struct mem_cgroup *memcg;
struct address_space *mapping = page_mapping(page);
if (unlikely(!mapping))
@@ -683,17 +682,17 @@ int __set_page_dirty_buffers(struct page *page)
} while (bh != head);
}
/*
- * Use mem_group_begin_page_stat() to keep PageDirty synchronized with
- * per-memcg dirty page counters.
+ * Lock out page->mem_cgroup migration to keep PageDirty
+ * synchronized with per-memcg dirty page counters.
*/
- memcg = mem_cgroup_begin_page_stat(page);
+ lock_page_memcg(page);
newly_dirty = !TestSetPageDirty(page);
spin_unlock(&mapping->private_lock);
if (newly_dirty)
- __set_page_dirty(page, mapping, memcg, 1);
+ __set_page_dirty(page, mapping, 1);
- mem_cgroup_end_page_stat(memcg);
+ unlock_page_memcg(page);
if (newly_dirty)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
@@ -1167,15 +1166,14 @@ void mark_buffer_dirty(struct buffer_head *bh)
if (!test_set_buffer_dirty(bh)) {
struct page *page = bh->b_page;
struct address_space *mapping = NULL;
- struct mem_cgroup *memcg;
- memcg = mem_cgroup_begin_page_stat(page);
+ lock_page_memcg(page);
if (!TestSetPageDirty(page)) {
mapping = page_mapping(page);
if (mapping)
- __set_page_dirty(page, mapping, memcg, 0);
+ __set_page_dirty(page, mapping, 0);
}
- mem_cgroup_end_page_stat(memcg);
+ unlock_page_memcg(page);
if (mapping)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
}
diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
index 452e98dd7560..1ee54ffd3a24 100644
--- a/fs/cachefiles/daemon.c
+++ b/fs/cachefiles/daemon.c
@@ -162,6 +162,8 @@ static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer,
size_t buflen, loff_t *pos)
{
struct cachefiles_cache *cache = file->private_data;
+ unsigned long long b_released;
+ unsigned f_released;
char buffer[256];
int n;
@@ -174,6 +176,8 @@ static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer,
cachefiles_has_space(cache, 0, 0);
/* summarise */
+ f_released = atomic_xchg(&cache->f_released, 0);
+ b_released = atomic_long_xchg(&cache->b_released, 0);
clear_bit(CACHEFILES_STATE_CHANGED, &cache->flags);
n = snprintf(buffer, sizeof(buffer),
@@ -183,15 +187,18 @@ static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer,
" fstop=%llx"
" brun=%llx"
" bcull=%llx"
- " bstop=%llx",
+ " bstop=%llx"
+ " freleased=%x"
+ " breleased=%llx",
test_bit(CACHEFILES_CULLING, &cache->flags) ? '1' : '0',
(unsigned long long) cache->frun,
(unsigned long long) cache->fcull,
(unsigned long long) cache->fstop,
(unsigned long long) cache->brun,
(unsigned long long) cache->bcull,
- (unsigned long long) cache->bstop
- );
+ (unsigned long long) cache->bstop,
+ f_released,
+ b_released);
if (n > buflen)
return -EMSGSIZE;
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 675a3332d72f..861d611b8c05 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -291,15 +291,8 @@ static void cachefiles_drop_object(struct fscache_object *_object)
}
/* note that the object is now inactive */
- if (test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) {
- write_lock(&cache->active_lock);
- if (!test_and_clear_bit(CACHEFILES_OBJECT_ACTIVE,
- &object->flags))
- BUG();
- rb_erase(&object->active_node, &cache->active_nodes);
- wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE);
- write_unlock(&cache->active_lock);
- }
+ if (test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags))
+ cachefiles_mark_object_inactive(cache, object);
dput(object->dentry);
object->dentry = NULL;
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index 9c4b737a54df..2fcde1a34b7c 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -66,6 +66,8 @@ struct cachefiles_cache {
struct rb_root active_nodes; /* active nodes (can't be culled) */
rwlock_t active_lock; /* lock for active_nodes */
atomic_t gravecounter; /* graveyard uniquifier */
+ atomic_t f_released; /* number of objects released lately */
+ atomic_long_t b_released; /* number of blocks released lately */
unsigned frun_percent; /* when to stop culling (% files) */
unsigned fcull_percent; /* when to start culling (% files) */
unsigned fstop_percent; /* when to stop allocating (% files) */
@@ -157,6 +159,8 @@ extern char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type);
/*
* namei.c
*/
+extern void cachefiles_mark_object_inactive(struct cachefiles_cache *cache,
+ struct cachefiles_object *object);
extern int cachefiles_delete_object(struct cachefiles_cache *cache,
struct cachefiles_object *object);
extern int cachefiles_walk_to_object(struct cachefiles_object *parent,
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 1c2334c163dd..4ae75006e73b 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -258,6 +258,28 @@ requeue:
}
/*
+ * Mark an object as being inactive.
+ */
+void cachefiles_mark_object_inactive(struct cachefiles_cache *cache,
+ struct cachefiles_object *object)
+{
+ write_lock(&cache->active_lock);
+ rb_erase(&object->active_node, &cache->active_nodes);
+ clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags);
+ write_unlock(&cache->active_lock);
+
+ wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE);
+
+ /* This object can now be culled, so we need to let the daemon know
+ * that there is something it can remove if it needs to.
+ */
+ atomic_long_add(d_backing_inode(object->dentry)->i_blocks,
+ &cache->b_released);
+ if (atomic_inc_return(&cache->f_released))
+ cachefiles_state_changed(cache);
+}
+
+/*
* delete an object representation from the cache
* - file backed objects are unlinked
* - directory backed objects are stuffed into the graveyard for userspace to
@@ -684,11 +706,7 @@ mark_active_timed_out:
check_error:
_debug("check error %d", ret);
- write_lock(&cache->active_lock);
- rb_erase(&object->active_node, &cache->active_nodes);
- clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags);
- wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE);
- write_unlock(&cache->active_lock);
+ cachefiles_mark_object_inactive(cache, object);
release_dentry:
dput(object->dentry);
object->dentry = NULL;
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index c22213789090..fc5cae2a0db2 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -175,8 +175,8 @@ static void ceph_invalidatepage(struct page *page, unsigned int offset,
static int ceph_releasepage(struct page *page, gfp_t g)
{
- struct inode *inode = page->mapping ? page->mapping->host : NULL;
- dout("%p releasepage %p idx %lu\n", inode, page, page->index);
+ dout("%p releasepage %p idx %lu\n", page->mapping->host,
+ page, page->index);
WARN_ON(PageDirty(page));
/* Can we release the page from the cache? */
@@ -276,7 +276,7 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
for (i = 0; i < num_pages; i++) {
struct page *page = osd_data->pages[i];
- if (rc < 0 && rc != ENOENT)
+ if (rc < 0 && rc != -ENOENT)
goto unlock;
if (bytes < (int)PAGE_CACHE_SIZE) {
/* zero (remainder of) page */
@@ -606,71 +606,71 @@ static void writepages_finish(struct ceph_osd_request *req,
struct inode *inode = req->r_inode;
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_osd_data *osd_data;
- unsigned wrote;
struct page *page;
- int num_pages;
- int i;
+ int num_pages, total_pages = 0;
+ int i, j;
+ int rc = req->r_result;
struct ceph_snap_context *snapc = req->r_snapc;
struct address_space *mapping = inode->i_mapping;
- int rc = req->r_result;
- u64 bytes = req->r_ops[0].extent.length;
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
- long writeback_stat;
- unsigned issued = ceph_caps_issued(ci);
+ bool remove_page;
- osd_data = osd_req_op_extent_osd_data(req, 0);
- BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
- num_pages = calc_pages_for((u64)osd_data->alignment,
- (u64)osd_data->length);
- if (rc >= 0) {
- /*
- * Assume we wrote the pages we originally sent. The
- * osd might reply with fewer pages if our writeback
- * raced with a truncation and was adjusted at the osd,
- * so don't believe the reply.
- */
- wrote = num_pages;
- } else {
- wrote = 0;
+
+ dout("writepages_finish %p rc %d\n", inode, rc);
+ if (rc < 0)
mapping_set_error(mapping, rc);
- }
- dout("writepages_finish %p rc %d bytes %llu wrote %d (pages)\n",
- inode, rc, bytes, wrote);
- /* clean all pages */
- for (i = 0; i < num_pages; i++) {
- page = osd_data->pages[i];
- BUG_ON(!page);
- WARN_ON(!PageUptodate(page));
+ /*
+ * We lost the cache cap, need to truncate the page before
+ * it is unlocked, otherwise we'd truncate it later in the
+ * page truncation thread, possibly losing some data that
+ * raced its way in
+ */
+ remove_page = !(ceph_caps_issued(ci) &
+ (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO));
- writeback_stat =
- atomic_long_dec_return(&fsc->writeback_count);
- if (writeback_stat <
- CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb))
- clear_bdi_congested(&fsc->backing_dev_info,
- BLK_RW_ASYNC);
+ /* clean all pages */
+ for (i = 0; i < req->r_num_ops; i++) {
+ if (req->r_ops[i].op != CEPH_OSD_OP_WRITE)
+ break;
- ceph_put_snap_context(page_snap_context(page));
- page->private = 0;
- ClearPagePrivate(page);
- dout("unlocking %d %p\n", i, page);
- end_page_writeback(page);
+ osd_data = osd_req_op_extent_osd_data(req, i);
+ BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
+ num_pages = calc_pages_for((u64)osd_data->alignment,
+ (u64)osd_data->length);
+ total_pages += num_pages;
+ for (j = 0; j < num_pages; j++) {
+ page = osd_data->pages[j];
+ BUG_ON(!page);
+ WARN_ON(!PageUptodate(page));
+
+ if (atomic_long_dec_return(&fsc->writeback_count) <
+ CONGESTION_OFF_THRESH(
+ fsc->mount_options->congestion_kb))
+ clear_bdi_congested(&fsc->backing_dev_info,
+ BLK_RW_ASYNC);
+
+ ceph_put_snap_context(page_snap_context(page));
+ page->private = 0;
+ ClearPagePrivate(page);
+ dout("unlocking %p\n", page);
+ end_page_writeback(page);
+
+ if (remove_page)
+ generic_error_remove_page(inode->i_mapping,
+ page);
- /*
- * We lost the cache cap, need to truncate the page before
- * it is unlocked, otherwise we'd truncate it later in the
- * page truncation thread, possibly losing some data that
- * raced its way in
- */
- if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0)
- generic_error_remove_page(inode->i_mapping, page);
+ unlock_page(page);
+ }
+ dout("writepages_finish %p wrote %llu bytes cleaned %d pages\n",
+ inode, osd_data->length, rc >= 0 ? num_pages : 0);
- unlock_page(page);
+ ceph_release_pages(osd_data->pages, num_pages);
}
- dout("%p wrote+cleaned %d pages\n", inode, wrote);
- ceph_put_wrbuffer_cap_refs(ci, num_pages, snapc);
- ceph_release_pages(osd_data->pages, num_pages);
+ ceph_put_wrbuffer_cap_refs(ci, total_pages, snapc);
+
+ osd_data = osd_req_op_extent_osd_data(req, 0);
if (osd_data->pages_from_pool)
mempool_free(osd_data->pages,
ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool);
@@ -778,17 +778,15 @@ retry:
while (!done && index <= end) {
unsigned i;
int first;
- pgoff_t next;
- int pvec_pages, locked_pages;
- struct page **pages = NULL;
+ pgoff_t strip_unit_end = 0;
+ int num_ops = 0, op_idx;
+ int pvec_pages, locked_pages = 0;
+ struct page **pages = NULL, **data_pages;
mempool_t *pool = NULL; /* Becomes non-null if mempool used */
struct page *page;
int want;
- u64 offset, len;
- long writeback_stat;
+ u64 offset = 0, len = 0;
- next = 0;
- locked_pages = 0;
max_pages = max_pages_ever;
get_more_pages:
@@ -824,8 +822,8 @@ get_more_pages:
unlock_page(page);
break;
}
- if (next && (page->index != next)) {
- dout("not consecutive %p\n", page);
+ if (strip_unit_end && (page->index > strip_unit_end)) {
+ dout("end of strip unit %p\n", page);
unlock_page(page);
break;
}
@@ -867,36 +865,31 @@ get_more_pages:
/*
* We have something to write. If this is
* the first locked page this time through,
- * allocate an osd request and a page array
- * that it will use.
+ * calculate max possinle write size and
+ * allocate a page array
*/
if (locked_pages == 0) {
- BUG_ON(pages);
+ u64 objnum;
+ u64 objoff;
+
/* prepare async write request */
offset = (u64)page_offset(page);
len = wsize;
- req = ceph_osdc_new_request(&fsc->client->osdc,
- &ci->i_layout, vino,
- offset, &len, 0,
- do_sync ? 2 : 1,
- CEPH_OSD_OP_WRITE,
- CEPH_OSD_FLAG_WRITE |
- CEPH_OSD_FLAG_ONDISK,
- snapc, truncate_seq,
- truncate_size, true);
- if (IS_ERR(req)) {
- rc = PTR_ERR(req);
+
+ rc = ceph_calc_file_object_mapping(&ci->i_layout,
+ offset, len,
+ &objnum, &objoff,
+ &len);
+ if (rc < 0) {
unlock_page(page);
break;
}
- if (do_sync)
- osd_req_op_init(req, 1,
- CEPH_OSD_OP_STARTSYNC, 0);
-
- req->r_callback = writepages_finish;
- req->r_inode = inode;
+ num_ops = 1 + do_sync;
+ strip_unit_end = page->index +
+ ((len - 1) >> PAGE_CACHE_SHIFT);
+ BUG_ON(pages);
max_pages = calc_pages_for(0, (u64)len);
pages = kmalloc(max_pages * sizeof (*pages),
GFP_NOFS);
@@ -905,6 +898,20 @@ get_more_pages:
pages = mempool_alloc(pool, GFP_NOFS);
BUG_ON(!pages);
}
+
+ len = 0;
+ } else if (page->index !=
+ (offset + len) >> PAGE_CACHE_SHIFT) {
+ if (num_ops >= (pool ? CEPH_OSD_SLAB_OPS :
+ CEPH_OSD_MAX_OPS)) {
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ break;
+ }
+
+ num_ops++;
+ offset = (u64)page_offset(page);
+ len = 0;
}
/* note position of first page in pvec */
@@ -913,18 +920,16 @@ get_more_pages:
dout("%p will write page %p idx %lu\n",
inode, page, page->index);
- writeback_stat =
- atomic_long_inc_return(&fsc->writeback_count);
- if (writeback_stat > CONGESTION_ON_THRESH(
+ if (atomic_long_inc_return(&fsc->writeback_count) >
+ CONGESTION_ON_THRESH(
fsc->mount_options->congestion_kb)) {
set_bdi_congested(&fsc->backing_dev_info,
BLK_RW_ASYNC);
}
- set_page_writeback(page);
pages[locked_pages] = page;
locked_pages++;
- next = page->index + 1;
+ len += PAGE_CACHE_SIZE;
}
/* did we get anything? */
@@ -944,38 +949,119 @@ get_more_pages:
/* shift unused pages over in the pvec... we
* will need to release them below. */
for (j = i; j < pvec_pages; j++) {
- dout(" pvec leftover page %p\n",
- pvec.pages[j]);
+ dout(" pvec leftover page %p\n", pvec.pages[j]);
pvec.pages[j-i+first] = pvec.pages[j];
}
pvec.nr -= i-first;
}
- /* Format the osd request message and submit the write */
+new_request:
offset = page_offset(pages[0]);
- len = (u64)locked_pages << PAGE_CACHE_SHIFT;
- if (snap_size == -1) {
- len = min(len, (u64)i_size_read(inode) - offset);
- /* writepages_finish() clears writeback pages
- * according to the data length, so make sure
- * data length covers all locked pages */
- len = max(len, 1 +
- ((u64)(locked_pages - 1) << PAGE_CACHE_SHIFT));
- } else {
- len = min(len, snap_size - offset);
+ len = wsize;
+
+ req = ceph_osdc_new_request(&fsc->client->osdc,
+ &ci->i_layout, vino,
+ offset, &len, 0, num_ops,
+ CEPH_OSD_OP_WRITE,
+ CEPH_OSD_FLAG_WRITE |
+ CEPH_OSD_FLAG_ONDISK,
+ snapc, truncate_seq,
+ truncate_size, false);
+ if (IS_ERR(req)) {
+ req = ceph_osdc_new_request(&fsc->client->osdc,
+ &ci->i_layout, vino,
+ offset, &len, 0,
+ min(num_ops,
+ CEPH_OSD_SLAB_OPS),
+ CEPH_OSD_OP_WRITE,
+ CEPH_OSD_FLAG_WRITE |
+ CEPH_OSD_FLAG_ONDISK,
+ snapc, truncate_seq,
+ truncate_size, true);
+ BUG_ON(IS_ERR(req));
}
- dout("writepages got %d pages at %llu~%llu\n",
- locked_pages, offset, len);
+ BUG_ON(len < page_offset(pages[locked_pages - 1]) +
+ PAGE_CACHE_SIZE - offset);
+
+ req->r_callback = writepages_finish;
+ req->r_inode = inode;
- osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0,
+ /* Format the osd request message and submit the write */
+ len = 0;
+ data_pages = pages;
+ op_idx = 0;
+ for (i = 0; i < locked_pages; i++) {
+ u64 cur_offset = page_offset(pages[i]);
+ if (offset + len != cur_offset) {
+ if (op_idx + do_sync + 1 == req->r_num_ops)
+ break;
+ osd_req_op_extent_dup_last(req, op_idx,
+ cur_offset - offset);
+ dout("writepages got pages at %llu~%llu\n",
+ offset, len);
+ osd_req_op_extent_osd_data_pages(req, op_idx,
+ data_pages, len, 0,
!!pool, false);
+ osd_req_op_extent_update(req, op_idx, len);
- pages = NULL; /* request message now owns the pages array */
- pool = NULL;
+ len = 0;
+ offset = cur_offset;
+ data_pages = pages + i;
+ op_idx++;
+ }
- /* Update the write op length in case we changed it */
+ set_page_writeback(pages[i]);
+ len += PAGE_CACHE_SIZE;
+ }
+
+ if (snap_size != -1) {
+ len = min(len, snap_size - offset);
+ } else if (i == locked_pages) {
+ /* writepages_finish() clears writeback pages
+ * according to the data length, so make sure
+ * data length covers all locked pages */
+ u64 min_len = len + 1 - PAGE_CACHE_SIZE;
+ len = min(len, (u64)i_size_read(inode) - offset);
+ len = max(len, min_len);
+ }
+ dout("writepages got pages at %llu~%llu\n", offset, len);
+
+ osd_req_op_extent_osd_data_pages(req, op_idx, data_pages, len,
+ 0, !!pool, false);
+ osd_req_op_extent_update(req, op_idx, len);
+
+ if (do_sync) {
+ op_idx++;
+ osd_req_op_init(req, op_idx, CEPH_OSD_OP_STARTSYNC, 0);
+ }
+ BUG_ON(op_idx + 1 != req->r_num_ops);
- osd_req_op_extent_update(req, 0, len);
+ pool = NULL;
+ if (i < locked_pages) {
+ BUG_ON(num_ops <= req->r_num_ops);
+ num_ops -= req->r_num_ops;
+ num_ops += do_sync;
+ locked_pages -= i;
+
+ /* allocate new pages array for next request */
+ data_pages = pages;
+ pages = kmalloc(locked_pages * sizeof (*pages),
+ GFP_NOFS);
+ if (!pages) {
+ pool = fsc->wb_pagevec_pool;
+ pages = mempool_alloc(pool, GFP_NOFS);
+ BUG_ON(!pages);
+ }
+ memcpy(pages, data_pages + i,
+ locked_pages * sizeof(*pages));
+ memset(data_pages + i, 0,
+ locked_pages * sizeof(*pages));
+ } else {
+ BUG_ON(num_ops != req->r_num_ops);
+ index = pages[i - 1]->index + 1;
+ /* request message now owns the pages array */
+ pages = NULL;
+ }
vino = ceph_vino(inode);
ceph_osdc_build_request(req, offset, snapc, vino.snap,
@@ -985,9 +1071,10 @@ get_more_pages:
BUG_ON(rc);
req = NULL;
- /* continue? */
- index = next;
- wbc->nr_to_write -= locked_pages;
+ wbc->nr_to_write -= i;
+ if (pages)
+ goto new_request;
+
if (wbc->nr_to_write <= 0)
done = 1;
@@ -1522,7 +1609,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
ceph_vino(inode), 0, &len, 0, 1,
CEPH_OSD_OP_CREATE,
CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
- ceph_empty_snapc, 0, 0, false);
+ NULL, 0, 0, false);
if (IS_ERR(req)) {
err = PTR_ERR(req);
goto out;
@@ -1540,9 +1627,8 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
ceph_vino(inode), 0, &len, 1, 3,
CEPH_OSD_OP_WRITE,
CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
- ceph_empty_snapc,
- ci->i_truncate_seq, ci->i_truncate_size,
- false);
+ NULL, ci->i_truncate_seq,
+ ci->i_truncate_size, false);
if (IS_ERR(req)) {
err = PTR_ERR(req);
goto out;
@@ -1663,8 +1749,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
goto out;
}
- rd_req = ceph_osdc_alloc_request(&fsc->client->osdc,
- ceph_empty_snapc,
+ rd_req = ceph_osdc_alloc_request(&fsc->client->osdc, NULL,
1, false, GFP_NOFS);
if (!rd_req) {
err = -ENOMEM;
@@ -1678,8 +1763,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
"%llx.00000000", ci->i_vino.ino);
rd_req->r_base_oid.name_len = strlen(rd_req->r_base_oid.name);
- wr_req = ceph_osdc_alloc_request(&fsc->client->osdc,
- ceph_empty_snapc,
+ wr_req = ceph_osdc_alloc_request(&fsc->client->osdc, NULL,
1, false, GFP_NOFS);
if (!wr_req) {
err = -ENOMEM;
@@ -1756,6 +1840,10 @@ int ceph_pool_perm_check(struct ceph_inode_info *ci, int need)
u32 pool;
int ret, flags;
+ /* does not support pool namespace yet */
+ if (ci->i_pool_ns_len)
+ return -EIO;
+
if (ceph_test_mount_opt(ceph_inode_to_client(&ci->vfs_inode),
NOPOOLPERM))
return 0;
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index cdbf8cf3d52c..de17bb232ff8 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -991,7 +991,7 @@ static int send_cap_msg(struct ceph_mds_session *session,
u32 seq, u64 flush_tid, u64 oldest_flush_tid,
u32 issue_seq, u32 mseq, u64 size, u64 max_size,
struct timespec *mtime, struct timespec *atime,
- u64 time_warp_seq,
+ struct timespec *ctime, u64 time_warp_seq,
kuid_t uid, kgid_t gid, umode_t mode,
u64 xattr_version,
struct ceph_buffer *xattrs_buf,
@@ -1042,6 +1042,8 @@ static int send_cap_msg(struct ceph_mds_session *session,
ceph_encode_timespec(&fc->mtime, mtime);
if (atime)
ceph_encode_timespec(&fc->atime, atime);
+ if (ctime)
+ ceph_encode_timespec(&fc->ctime, ctime);
fc->time_warp_seq = cpu_to_le32(time_warp_seq);
fc->uid = cpu_to_le32(from_kuid(&init_user_ns, uid));
@@ -1116,7 +1118,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
int held, revoking, dropping, keep;
u64 seq, issue_seq, mseq, time_warp_seq, follows;
u64 size, max_size;
- struct timespec mtime, atime;
+ struct timespec mtime, atime, ctime;
int wake = 0;
umode_t mode;
kuid_t uid;
@@ -1180,6 +1182,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
ci->i_requested_max_size = max_size;
mtime = inode->i_mtime;
atime = inode->i_atime;
+ ctime = inode->i_ctime;
time_warp_seq = ci->i_time_warp_seq;
uid = inode->i_uid;
gid = inode->i_gid;
@@ -1198,7 +1201,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id,
op, keep, want, flushing, seq,
flush_tid, oldest_flush_tid, issue_seq, mseq,
- size, max_size, &mtime, &atime, time_warp_seq,
+ size, max_size, &mtime, &atime, &ctime, time_warp_seq,
uid, gid, mode, xattr_version, xattr_blob,
follows, inline_data);
if (ret < 0) {
@@ -1320,7 +1323,7 @@ retry:
capsnap->dirty, 0, capsnap->flush_tid, 0,
0, mseq, capsnap->size, 0,
&capsnap->mtime, &capsnap->atime,
- capsnap->time_warp_seq,
+ &capsnap->ctime, capsnap->time_warp_seq,
capsnap->uid, capsnap->gid, capsnap->mode,
capsnap->xattr_version, capsnap->xattr_blob,
capsnap->follows, capsnap->inline_data);
@@ -2753,7 +2756,8 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
void *inline_data, int inline_len,
struct ceph_buffer *xattr_buf,
struct ceph_mds_session *session,
- struct ceph_cap *cap, int issued)
+ struct ceph_cap *cap, int issued,
+ u32 pool_ns_len)
__releases(ci->i_ceph_lock)
__releases(mdsc->snap_rwsem)
{
@@ -2873,6 +2877,8 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
if (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
/* file layout may have changed */
ci->i_layout = grant->layout;
+ ci->i_pool_ns_len = pool_ns_len;
+
/* size/truncate_seq? */
queue_trunc = ceph_fill_file_size(inode, issued,
le32_to_cpu(grant->truncate_seq),
@@ -3411,6 +3417,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
u32 inline_len = 0;
void *snaptrace;
size_t snaptrace_len;
+ u32 pool_ns_len = 0;
void *p, *end;
dout("handle_caps from mds%d\n", mds);
@@ -3463,6 +3470,21 @@ void ceph_handle_caps(struct ceph_mds_session *session,
p += inline_len;
}
+ if (le16_to_cpu(msg->hdr.version) >= 8) {
+ u64 flush_tid;
+ u32 caller_uid, caller_gid;
+ u32 osd_epoch_barrier;
+ /* version >= 5 */
+ ceph_decode_32_safe(&p, end, osd_epoch_barrier, bad);
+ /* version >= 6 */
+ ceph_decode_64_safe(&p, end, flush_tid, bad);
+ /* version >= 7 */
+ ceph_decode_32_safe(&p, end, caller_uid, bad);
+ ceph_decode_32_safe(&p, end, caller_gid, bad);
+ /* version >= 8 */
+ ceph_decode_32_safe(&p, end, pool_ns_len, bad);
+ }
+
/* lookup ino */
inode = ceph_find_inode(sb, vino);
ci = ceph_inode(inode);
@@ -3518,7 +3540,8 @@ void ceph_handle_caps(struct ceph_mds_session *session,
&cap, &issued);
handle_cap_grant(mdsc, inode, h,
inline_version, inline_data, inline_len,
- msg->middle, session, cap, issued);
+ msg->middle, session, cap, issued,
+ pool_ns_len);
if (realm)
ceph_put_snap_realm(mdsc, realm);
goto done_unlocked;
@@ -3542,7 +3565,8 @@ void ceph_handle_caps(struct ceph_mds_session *session,
issued |= __ceph_caps_dirty(ci);
handle_cap_grant(mdsc, inode, h,
inline_version, inline_data, inline_len,
- msg->middle, session, cap, issued);
+ msg->middle, session, cap, issued,
+ pool_ns_len);
goto done_unlocked;
case CEPH_CAP_OP_FLUSH_ACK:
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index fd11fb231a2e..fadc243dfb28 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -38,7 +38,7 @@ int ceph_init_dentry(struct dentry *dentry)
if (dentry->d_fsdata)
return 0;
- di = kmem_cache_alloc(ceph_dentry_cachep, GFP_KERNEL | __GFP_ZERO);
+ di = kmem_cache_zalloc(ceph_dentry_cachep, GFP_KERNEL);
if (!di)
return -ENOMEM; /* oh well */
@@ -68,23 +68,6 @@ out_unlock:
return 0;
}
-struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry)
-{
- struct inode *inode = NULL;
-
- if (!dentry)
- return NULL;
-
- spin_lock(&dentry->d_lock);
- if (!IS_ROOT(dentry)) {
- inode = d_inode(dentry->d_parent);
- ihold(inode);
- }
- spin_unlock(&dentry->d_lock);
- return inode;
-}
-
-
/*
* for readdir, we encode the directory frag and offset within that
* frag into f_pos.
@@ -624,6 +607,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req;
int op;
+ int mask;
int err;
dout("lookup %p dentry %p '%pd'\n",
@@ -666,8 +650,12 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
return ERR_CAST(req);
req->r_dentry = dget(dentry);
req->r_num_caps = 2;
- /* we only need inode linkage */
- req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
+
+ mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
+ if (ceph_security_xattr_wanted(dir))
+ mask |= CEPH_CAP_XATTR_SHARED;
+ req->r_args.getattr.mask = cpu_to_le32(mask);
+
req->r_locked_dir = dir;
err = ceph_mdsc_do_request(mdsc, NULL, req);
err = ceph_handle_snapdir(req, dentry, err);
@@ -1095,6 +1083,7 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
{
int valid = 0;
+ struct dentry *parent;
struct inode *dir;
if (flags & LOOKUP_RCU)
@@ -1103,7 +1092,8 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
dout("d_revalidate %p '%pd' inode %p offset %lld\n", dentry,
dentry, d_inode(dentry), ceph_dentry(dentry)->offset);
- dir = ceph_get_dentry_parent_inode(dentry);
+ parent = dget_parent(dentry);
+ dir = d_inode(parent);
/* always trust cached snapped dentries, snapdir dentry */
if (ceph_snap(dir) != CEPH_NOSNAP) {
@@ -1121,13 +1111,48 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
valid = 1;
}
+ if (!valid) {
+ struct ceph_mds_client *mdsc =
+ ceph_sb_to_client(dir->i_sb)->mdsc;
+ struct ceph_mds_request *req;
+ int op, mask, err;
+
+ op = ceph_snap(dir) == CEPH_SNAPDIR ?
+ CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
+ req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
+ if (!IS_ERR(req)) {
+ req->r_dentry = dget(dentry);
+ req->r_num_caps = 2;
+
+ mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
+ if (ceph_security_xattr_wanted(dir))
+ mask |= CEPH_CAP_XATTR_SHARED;
+ req->r_args.getattr.mask = mask;
+
+ req->r_locked_dir = dir;
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
+ if (err == 0 || err == -ENOENT) {
+ if (dentry == req->r_dentry) {
+ valid = !d_unhashed(dentry);
+ } else {
+ d_invalidate(req->r_dentry);
+ err = -EAGAIN;
+ }
+ }
+ ceph_mdsc_put_request(req);
+ dout("d_revalidate %p lookup result=%d\n",
+ dentry, err);
+ }
+ }
+
dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid");
if (valid) {
ceph_dentry_lru_touch(dentry);
} else {
ceph_dir_clear_complete(dir);
}
- iput(dir);
+
+ dput(parent);
return valid;
}
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 3b3172357326..6e72c98162d5 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -71,12 +71,18 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino)
inode = ceph_find_inode(sb, vino);
if (!inode) {
struct ceph_mds_request *req;
+ int mask;
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPINO,
USE_ANY_MDS);
if (IS_ERR(req))
return ERR_CAST(req);
+ mask = CEPH_STAT_CAP_INODE;
+ if (ceph_security_xattr_wanted(d_inode(sb->s_root)))
+ mask |= CEPH_CAP_XATTR_SHARED;
+ req->r_args.getattr.mask = cpu_to_le32(mask);
+
req->r_ino1 = vino;
req->r_num_caps = 1;
err = ceph_mdsc_do_request(mdsc, NULL, req);
@@ -128,6 +134,7 @@ static struct dentry *__get_parent(struct super_block *sb,
struct ceph_mds_request *req;
struct inode *inode;
struct dentry *dentry;
+ int mask;
int err;
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPPARENT,
@@ -144,6 +151,12 @@ static struct dentry *__get_parent(struct super_block *sb,
.snap = CEPH_NOSNAP,
};
}
+
+ mask = CEPH_STAT_CAP_INODE;
+ if (ceph_security_xattr_wanted(d_inode(sb->s_root)))
+ mask |= CEPH_CAP_XATTR_SHARED;
+ req->r_args.getattr.mask = cpu_to_le32(mask);
+
req->r_num_caps = 1;
err = ceph_mdsc_do_request(mdsc, NULL, req);
inode = req->r_target_inode;
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index eb9028e8cfc5..ef38f01c1795 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -157,7 +157,7 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
case S_IFDIR:
dout("init_file %p %p 0%o (regular)\n", inode, file,
inode->i_mode);
- cf = kmem_cache_alloc(ceph_file_cachep, GFP_KERNEL | __GFP_ZERO);
+ cf = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL);
if (cf == NULL) {
ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
return -ENOMEM;
@@ -300,6 +300,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
struct ceph_mds_request *req;
struct dentry *dn;
struct ceph_acls_info acls = {};
+ int mask;
int err;
dout("atomic_open %p dentry %p '%pd' %s flags %d mode 0%o\n",
@@ -335,6 +336,12 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
acls.pagelist = NULL;
}
}
+
+ mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
+ if (ceph_security_xattr_wanted(dir))
+ mask |= CEPH_CAP_XATTR_SHARED;
+ req->r_args.open.mask = cpu_to_le32(mask);
+
req->r_locked_dir = dir; /* caller holds dir->i_mutex */
err = ceph_mdsc_do_request(mdsc,
(flags & (O_CREAT|O_TRUNC)) ? dir : NULL,
@@ -725,7 +732,6 @@ static void ceph_aio_retry_work(struct work_struct *work)
ret = ceph_osdc_start_request(req->r_osdc, req, false);
out:
if (ret < 0) {
- BUG_ON(ret == -EOLDSNAPC);
req->r_result = ret;
ceph_aio_complete_req(req, NULL);
}
@@ -783,7 +789,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
int num_pages = 0;
int flags;
int ret;
- struct timespec mtime = CURRENT_TIME;
+ struct timespec mtime = current_fs_time(inode->i_sb);
size_t count = iov_iter_count(iter);
loff_t pos = iocb->ki_pos;
bool write = iov_iter_rw(iter) == WRITE;
@@ -949,7 +955,6 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
ret = ceph_osdc_start_request(req->r_osdc,
req, false);
if (ret < 0) {
- BUG_ON(ret == -EOLDSNAPC);
req->r_result = ret;
ceph_aio_complete_req(req, NULL);
}
@@ -988,7 +993,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
int flags;
int check_caps = 0;
int ret;
- struct timespec mtime = CURRENT_TIME;
+ struct timespec mtime = current_fs_time(inode->i_sb);
size_t count = iov_iter_count(from);
if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index fb4ba2e4e2a5..ed58b168904a 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -396,6 +396,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_symlink = NULL;
memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout));
+ ci->i_pool_ns_len = 0;
ci->i_fragtree = RB_ROOT;
mutex_init(&ci->i_fragtree_mutex);
@@ -548,6 +549,10 @@ int ceph_fill_file_size(struct inode *inode, int issued,
if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 ||
(truncate_seq == ci->i_truncate_seq && size > inode->i_size)) {
dout("size %lld -> %llu\n", inode->i_size, size);
+ if (size > 0 && S_ISDIR(inode->i_mode)) {
+ pr_err("fill_file_size non-zero size for directory\n");
+ size = 0;
+ }
i_size_write(inode, size);
inode->i_blocks = (size + (1<<9) - 1) >> 9;
ci->i_reported_size = size;
@@ -756,6 +761,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
if (ci->i_layout.fl_pg_pool != info->layout.fl_pg_pool)
ci->i_ceph_flags &= ~CEPH_I_POOL_PERM;
ci->i_layout = info->layout;
+ ci->i_pool_ns_len = iinfo->pool_ns_len;
queue_trunc = ceph_fill_file_size(inode, issued,
le32_to_cpu(info->truncate_seq),
@@ -975,13 +981,8 @@ out_unlock:
/*
* splice a dentry to an inode.
* caller must hold directory i_mutex for this to be safe.
- *
- * we will only rehash the resulting dentry if @prehash is
- * true; @prehash will be set to false (for the benefit of
- * the caller) if we fail.
*/
-static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
- bool *prehash)
+static struct dentry *splice_dentry(struct dentry *dn, struct inode *in)
{
struct dentry *realdn;
@@ -994,8 +995,6 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
if (IS_ERR(realdn)) {
pr_err("splice_dentry error %ld %p inode %p ino %llx.%llx\n",
PTR_ERR(realdn), dn, in, ceph_vinop(in));
- if (prehash)
- *prehash = false; /* don't rehash on error */
dn = realdn; /* note realdn contains the error */
goto out;
} else if (realdn) {
@@ -1011,8 +1010,6 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
dout("dn %p attached to %p ino %llx.%llx\n",
dn, d_inode(dn), ceph_vinop(d_inode(dn)));
}
- if ((!prehash || *prehash) && d_unhashed(dn))
- d_rehash(dn);
out:
return dn;
}
@@ -1245,10 +1242,8 @@ retry_lookup:
dout("d_delete %p\n", dn);
d_delete(dn);
} else {
- dout("d_instantiate %p NULL\n", dn);
- d_instantiate(dn, NULL);
if (have_lease && d_unhashed(dn))
- d_rehash(dn);
+ d_add(dn, NULL);
update_dentry_lease(dn, rinfo->dlease,
session,
req->r_request_started);
@@ -1260,7 +1255,7 @@ retry_lookup:
if (d_really_is_negative(dn)) {
ceph_dir_clear_ordered(dir);
ihold(in);
- dn = splice_dentry(dn, in, &have_lease);
+ dn = splice_dentry(dn, in);
if (IS_ERR(dn)) {
err = PTR_ERR(dn);
goto done;
@@ -1270,6 +1265,7 @@ retry_lookup:
dout(" %p links to %p %llx.%llx, not %llx.%llx\n",
dn, d_inode(dn), ceph_vinop(d_inode(dn)),
ceph_vinop(in));
+ d_invalidate(dn);
have_lease = false;
}
@@ -1290,7 +1286,7 @@ retry_lookup:
dout(" linking snapped dir %p to dn %p\n", in, dn);
ceph_dir_clear_ordered(dir);
ihold(in);
- dn = splice_dentry(dn, in, NULL);
+ dn = splice_dentry(dn, in);
if (IS_ERR(dn)) {
err = PTR_ERR(dn);
goto done;
@@ -1358,15 +1354,20 @@ static int fill_readdir_cache(struct inode *dir, struct dentry *dn,
if (!ctl->page || pgoff != page_index(ctl->page)) {
ceph_readdir_cache_release(ctl);
- ctl->page = grab_cache_page(&dir->i_data, pgoff);
+ if (idx == 0)
+ ctl->page = grab_cache_page(&dir->i_data, pgoff);
+ else
+ ctl->page = find_lock_page(&dir->i_data, pgoff);
if (!ctl->page) {
ctl->index = -1;
- return -ENOMEM;
+ return idx == 0 ? -ENOMEM : 0;
}
/* reading/filling the cache are serialized by
* i_mutex, no need to use page lock */
unlock_page(ctl->page);
ctl->dentries = kmap(ctl->page);
+ if (idx == 0)
+ memset(ctl->dentries, 0, PAGE_CACHE_SIZE);
}
if (req->r_dir_release_cnt == atomic64_read(&ci->i_release_count) &&
@@ -1389,7 +1390,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
struct qstr dname;
struct dentry *dn;
struct inode *in;
- int err = 0, ret, i;
+ int err = 0, skipped = 0, ret, i;
struct inode *snapdir = NULL;
struct ceph_mds_request_head *rhead = req->r_request->front.iov_base;
struct ceph_dentry_info *di;
@@ -1501,7 +1502,17 @@ retry_lookup:
}
if (d_really_is_negative(dn)) {
- struct dentry *realdn = splice_dentry(dn, in, NULL);
+ struct dentry *realdn;
+
+ if (ceph_security_xattr_deadlock(in)) {
+ dout(" skip splicing dn %p to inode %p"
+ " (security xattr deadlock)\n", dn, in);
+ iput(in);
+ skipped++;
+ goto next_item;
+ }
+
+ realdn = splice_dentry(dn, in);
if (IS_ERR(realdn)) {
err = PTR_ERR(realdn);
d_drop(dn);
@@ -1518,7 +1529,7 @@ retry_lookup:
req->r_session,
req->r_request_started);
- if (err == 0 && cache_ctl.index >= 0) {
+ if (err == 0 && skipped == 0 && cache_ctl.index >= 0) {
ret = fill_readdir_cache(d_inode(parent), dn,
&cache_ctl, req);
if (ret < 0)
@@ -1529,7 +1540,7 @@ next_item:
dput(dn);
}
out:
- if (err == 0) {
+ if (err == 0 && skipped == 0) {
req->r_did_prepopulate = true;
req->r_readdir_cache_idx = cache_ctl.index;
}
@@ -1959,7 +1970,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
if (dirtied) {
inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied,
&prealloc_cf);
- inode->i_ctime = CURRENT_TIME;
+ inode->i_ctime = current_fs_time(inode->i_sb);
}
release &= issued;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index e7b130a637f9..44852c3ae531 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -100,6 +100,14 @@ static int parse_reply_info_in(void **p, void *end,
} else
info->inline_version = CEPH_INLINE_NONE;
+ if (features & CEPH_FEATURE_FS_FILE_LAYOUT_V2) {
+ ceph_decode_32_safe(p, end, info->pool_ns_len, bad);
+ ceph_decode_need(p, end, info->pool_ns_len, bad);
+ *p += info->pool_ns_len;
+ } else {
+ info->pool_ns_len = 0;
+ }
+
return 0;
bad:
return err;
@@ -1721,7 +1729,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
init_completion(&req->r_safe_completion);
INIT_LIST_HEAD(&req->r_unsafe_item);
- req->r_stamp = CURRENT_TIME;
+ req->r_stamp = current_fs_time(mdsc->fsc->sb);
req->r_op = op;
req->r_direct_mode = mode;
@@ -2298,6 +2306,14 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir),
CEPH_CAP_PIN);
+ /* deny access to directories with pool_ns layouts */
+ if (req->r_inode && S_ISDIR(req->r_inode->i_mode) &&
+ ceph_inode(req->r_inode)->i_pool_ns_len)
+ return -EIO;
+ if (req->r_locked_dir &&
+ ceph_inode(req->r_locked_dir)->i_pool_ns_len)
+ return -EIO;
+
/* issue */
mutex_lock(&mdsc->mutex);
__register_request(mdsc, req, dir);
@@ -2524,6 +2540,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
/* insert trace into our cache */
mutex_lock(&req->r_fill_mutex);
+ current->journal_info = req;
err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session);
if (err == 0) {
if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR ||
@@ -2531,6 +2548,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
ceph_readdir_prepopulate(req, req->r_session);
ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
}
+ current->journal_info = NULL;
mutex_unlock(&req->r_fill_mutex);
up_read(&mdsc->snap_rwsem);
@@ -3748,7 +3766,6 @@ void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
dout("handle_map epoch %u len %d\n", epoch, (int)maplen);
/* do we need it? */
- ceph_monc_got_mdsmap(&mdsc->fsc->client->monc, epoch);
mutex_lock(&mdsc->mutex);
if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) {
dout("handle_map epoch %u <= our %u\n",
@@ -3775,6 +3792,8 @@ void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
mdsc->fsc->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size;
__wake_requests(mdsc, &mdsc->waiting_for_map);
+ ceph_monc_got_map(&mdsc->fsc->client->monc, CEPH_SUB_MDSMAP,
+ mdsc->mdsmap->m_epoch);
mutex_unlock(&mdsc->mutex);
schedule_delayed(mdsc);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index ccf11ef0ca87..37712ccffcc6 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -44,6 +44,7 @@ struct ceph_mds_reply_info_in {
u64 inline_version;
u32 inline_len;
char *inline_data;
+ u32 pool_ns_len;
};
/*
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 4aa7122a8d38..9caaa7ffc93f 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -296,8 +296,6 @@ static int cmpu64_rev(const void *a, const void *b)
}
-struct ceph_snap_context *ceph_empty_snapc;
-
/*
* build the snap context for a given realm.
*/
@@ -987,17 +985,3 @@ out:
up_write(&mdsc->snap_rwsem);
return;
}
-
-int __init ceph_snap_init(void)
-{
- ceph_empty_snapc = ceph_create_snap_context(0, GFP_NOFS);
- if (!ceph_empty_snapc)
- return -ENOMEM;
- ceph_empty_snapc->seq = 1;
- return 0;
-}
-
-void ceph_snap_exit(void)
-{
- ceph_put_snap_context(ceph_empty_snapc);
-}
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index ca4d5e8457f1..c973043deb0e 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -439,8 +439,8 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
if (fsopt->flags & CEPH_MOUNT_OPT_DIRSTAT)
seq_puts(m, ",dirstat");
- if ((fsopt->flags & CEPH_MOUNT_OPT_RBYTES) == 0)
- seq_puts(m, ",norbytes");
+ if ((fsopt->flags & CEPH_MOUNT_OPT_RBYTES))
+ seq_puts(m, ",rbytes");
if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR)
seq_puts(m, ",noasyncreaddir");
if ((fsopt->flags & CEPH_MOUNT_OPT_DCACHE) == 0)
@@ -530,7 +530,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
goto fail;
}
fsc->client->extra_mon_dispatch = extra_mon_dispatch;
- fsc->client->monc.want_mdsmap = 1;
+ ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP, 0, true);
fsc->mount_options = fsopt;
@@ -793,22 +793,20 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
struct dentry *root;
int first = 0; /* first vfsmount for this super_block */
- dout("mount start\n");
+ dout("mount start %p\n", fsc);
mutex_lock(&fsc->client->mount_mutex);
- err = __ceph_open_session(fsc->client, started);
- if (err < 0)
- goto out;
+ if (!fsc->sb->s_root) {
+ err = __ceph_open_session(fsc->client, started);
+ if (err < 0)
+ goto out;
- dout("mount opening root\n");
- root = open_root_dentry(fsc, "", started);
- if (IS_ERR(root)) {
- err = PTR_ERR(root);
- goto out;
- }
- if (fsc->sb->s_root) {
- dput(root);
- } else {
+ dout("mount opening root\n");
+ root = open_root_dentry(fsc, "", started);
+ if (IS_ERR(root)) {
+ err = PTR_ERR(root);
+ goto out;
+ }
fsc->sb->s_root = root;
first = 1;
@@ -818,6 +816,7 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
}
if (path[0] == 0) {
+ root = fsc->sb->s_root;
dget(root);
} else {
dout("mount opening base mountpoint\n");
@@ -833,16 +832,14 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
mutex_unlock(&fsc->client->mount_mutex);
return root;
-out:
- mutex_unlock(&fsc->client->mount_mutex);
- return ERR_PTR(err);
-
fail:
if (first) {
dput(fsc->sb->s_root);
fsc->sb->s_root = NULL;
}
- goto out;
+out:
+ mutex_unlock(&fsc->client->mount_mutex);
+ return ERR_PTR(err);
}
static int ceph_set_super(struct super_block *s, void *data)
@@ -1042,19 +1039,14 @@ static int __init init_ceph(void)
ceph_flock_init();
ceph_xattr_init();
- ret = ceph_snap_init();
- if (ret)
- goto out_xattr;
ret = register_filesystem(&ceph_fs_type);
if (ret)
- goto out_snap;
+ goto out_xattr;
pr_info("loaded (mds proto %d)\n", CEPH_MDSC_PROTOCOL);
return 0;
-out_snap:
- ceph_snap_exit();
out_xattr:
ceph_xattr_exit();
destroy_caches();
@@ -1066,7 +1058,6 @@ static void __exit exit_ceph(void)
{
dout("exit_ceph\n");
unregister_filesystem(&ceph_fs_type);
- ceph_snap_exit();
ceph_xattr_exit();
destroy_caches();
}
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 75b7d125ce66..e705c4d612d7 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -37,8 +37,7 @@
#define CEPH_MOUNT_OPT_FSCACHE (1<<10) /* use fscache */
#define CEPH_MOUNT_OPT_NOPOOLPERM (1<<11) /* no pool permission check */
-#define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES | \
- CEPH_MOUNT_OPT_DCACHE)
+#define CEPH_MOUNT_OPT_DEFAULT CEPH_MOUNT_OPT_DCACHE
#define ceph_set_mount_opt(fsc, opt) \
(fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt;
@@ -287,6 +286,7 @@ struct ceph_inode_info {
struct ceph_dir_layout i_dir_layout;
struct ceph_file_layout i_layout;
+ size_t i_pool_ns_len;
char *i_symlink;
/* for dirs */
@@ -468,7 +468,7 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
#define CEPH_I_POOL_PERM (1 << 4) /* pool rd/wr bits are valid */
#define CEPH_I_POOL_RD (1 << 5) /* can read from pool */
#define CEPH_I_POOL_WR (1 << 6) /* can write to pool */
-
+#define CEPH_I_SEC_INITED (1 << 7) /* security initialized */
static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci,
long long release_count,
@@ -720,7 +720,6 @@ static inline int default_congestion_kb(void)
/* snap.c */
-extern struct ceph_snap_context *ceph_empty_snapc;
struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
u64 ino);
extern void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
@@ -737,8 +736,6 @@ extern void ceph_queue_cap_snap(struct ceph_inode_info *ci);
extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
struct ceph_cap_snap *capsnap);
extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc);
-extern int ceph_snap_init(void);
-extern void ceph_snap_exit(void);
/*
* a cap_snap is "pending" if it is still awaiting an in-progress
@@ -807,6 +804,20 @@ extern void __init ceph_xattr_init(void);
extern void ceph_xattr_exit(void);
extern const struct xattr_handler *ceph_xattr_handlers[];
+#ifdef CONFIG_SECURITY
+extern bool ceph_security_xattr_deadlock(struct inode *in);
+extern bool ceph_security_xattr_wanted(struct inode *in);
+#else
+static inline bool ceph_security_xattr_deadlock(struct inode *in)
+{
+ return false;
+}
+static inline bool ceph_security_xattr_wanted(struct inode *in)
+{
+ return false;
+}
+#endif
+
/* acl.c */
struct ceph_acls_info {
void *default_acl;
@@ -946,7 +957,6 @@ extern void ceph_dentry_lru_touch(struct dentry *dn);
extern void ceph_dentry_lru_del(struct dentry *dn);
extern void ceph_invalidate_dentry_lease(struct dentry *dentry);
extern unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn);
-extern struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry);
extern void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl);
/*
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 819163d8313b..9410abdef3ce 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -714,31 +714,62 @@ void __ceph_build_xattrs_blob(struct ceph_inode_info *ci)
}
}
+static inline int __get_request_mask(struct inode *in) {
+ struct ceph_mds_request *req = current->journal_info;
+ int mask = 0;
+ if (req && req->r_target_inode == in) {
+ if (req->r_op == CEPH_MDS_OP_LOOKUP ||
+ req->r_op == CEPH_MDS_OP_LOOKUPINO ||
+ req->r_op == CEPH_MDS_OP_LOOKUPPARENT ||
+ req->r_op == CEPH_MDS_OP_GETATTR) {
+ mask = le32_to_cpu(req->r_args.getattr.mask);
+ } else if (req->r_op == CEPH_MDS_OP_OPEN ||
+ req->r_op == CEPH_MDS_OP_CREATE) {
+ mask = le32_to_cpu(req->r_args.open.mask);
+ }
+ }
+ return mask;
+}
+
ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
size_t size)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- int err;
struct ceph_inode_xattr *xattr;
struct ceph_vxattr *vxattr = NULL;
+ int req_mask;
+ int err;
if (!ceph_is_valid_xattr(name))
return -ENODATA;
/* let's see if a virtual xattr was requested */
vxattr = ceph_match_vxattr(inode, name);
- if (vxattr && !(vxattr->exists_cb && !vxattr->exists_cb(ci))) {
- err = vxattr->getxattr_cb(ci, value, size);
+ if (vxattr) {
+ err = -ENODATA;
+ if (!(vxattr->exists_cb && !vxattr->exists_cb(ci)))
+ err = vxattr->getxattr_cb(ci, value, size);
return err;
}
+ req_mask = __get_request_mask(inode);
+
spin_lock(&ci->i_ceph_lock);
dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
ci->i_xattrs.version, ci->i_xattrs.index_version);
if (ci->i_xattrs.version == 0 ||
- !__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1)) {
+ !((req_mask & CEPH_CAP_XATTR_SHARED) ||
+ __ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1))) {
spin_unlock(&ci->i_ceph_lock);
+
+ /* security module gets xattr while filling trace */
+ if (current->journal_info != NULL) {
+ pr_warn_ratelimited("sync getxattr %p "
+ "during filling trace\n", inode);
+ return -EBUSY;
+ }
+
/* get xattrs from mds (if we don't already have them) */
err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR, true);
if (err)
@@ -765,6 +796,9 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
memcpy(value, xattr->val, xattr->val_len);
+ if (current->journal_info != NULL &&
+ !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN))
+ ci->i_ceph_flags |= CEPH_I_SEC_INITED;
out:
spin_unlock(&ci->i_ceph_lock);
return err;
@@ -999,7 +1033,7 @@ retry:
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL,
&prealloc_cf);
ci->i_xattrs.dirty = true;
- inode->i_ctime = CURRENT_TIME;
+ inode->i_ctime = current_fs_time(inode->i_sb);
}
spin_unlock(&ci->i_ceph_lock);
@@ -1015,7 +1049,15 @@ do_sync:
do_sync_unlocked:
if (lock_snap_rwsem)
up_read(&mdsc->snap_rwsem);
- err = ceph_sync_setxattr(dentry, name, value, size, flags);
+
+ /* security module set xattr while filling trace */
+ if (current->journal_info != NULL) {
+ pr_warn_ratelimited("sync setxattr %p "
+ "during filling trace\n", inode);
+ err = -EBUSY;
+ } else {
+ err = ceph_sync_setxattr(dentry, name, value, size, flags);
+ }
out:
ceph_free_cap_flush(prealloc_cf);
kfree(newname);
@@ -1136,7 +1178,7 @@ retry:
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL,
&prealloc_cf);
ci->i_xattrs.dirty = true;
- inode->i_ctime = CURRENT_TIME;
+ inode->i_ctime = current_fs_time(inode->i_sb);
spin_unlock(&ci->i_ceph_lock);
if (lock_snap_rwsem)
up_read(&mdsc->snap_rwsem);
@@ -1164,3 +1206,25 @@ int ceph_removexattr(struct dentry *dentry, const char *name)
return __ceph_removexattr(dentry, name);
}
+
+#ifdef CONFIG_SECURITY
+bool ceph_security_xattr_wanted(struct inode *in)
+{
+ return in->i_security != NULL;
+}
+
+bool ceph_security_xattr_deadlock(struct inode *in)
+{
+ struct ceph_inode_info *ci;
+ bool ret;
+ if (in->i_security == NULL)
+ return false;
+ ci = ceph_inode(in);
+ spin_lock(&ci->i_ceph_lock);
+ ret = !(ci->i_ceph_flags & CEPH_I_SEC_INITED) &&
+ !(ci->i_xattrs.version > 0 &&
+ __ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0));
+ spin_unlock(&ci->i_ceph_lock);
+ return ret;
+}
+#endif
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 50b268483302..788e19195991 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -255,7 +255,6 @@ static const struct file_operations cifs_debug_data_proc_fops = {
static ssize_t cifs_stats_proc_write(struct file *file,
const char __user *buffer, size_t count, loff_t *ppos)
{
- char c;
bool bv;
int rc;
struct list_head *tmp1, *tmp2, *tmp3;
@@ -263,11 +262,8 @@ static ssize_t cifs_stats_proc_write(struct file *file,
struct cifs_ses *ses;
struct cifs_tcon *tcon;
- rc = get_user(c, buffer);
- if (rc)
- return rc;
-
- if (strtobool(&c, &bv) == 0) {
+ rc = kstrtobool_from_user(buffer, count, &bv);
+ if (rc == 0) {
#ifdef CONFIG_CIFS_STATS2
atomic_set(&totBufAllocCount, 0);
atomic_set(&totSmBufAllocCount, 0);
@@ -290,6 +286,8 @@ static ssize_t cifs_stats_proc_write(struct file *file,
}
}
spin_unlock(&cifs_tcp_ses_lock);
+ } else {
+ return rc;
}
return count;
@@ -433,17 +431,17 @@ static int cifsFYI_proc_open(struct inode *inode, struct file *file)
static ssize_t cifsFYI_proc_write(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos)
{
- char c;
+ char c[2] = { '\0' };
bool bv;
int rc;
- rc = get_user(c, buffer);
+ rc = get_user(c[0], buffer);
if (rc)
return rc;
- if (strtobool(&c, &bv) == 0)
+ if (strtobool(c, &bv) == 0)
cifsFYI = bv;
- else if ((c > '1') && (c <= '9'))
- cifsFYI = (int) (c - '0'); /* see cifs_debug.h for meanings */
+ else if ((c[0] > '1') && (c[0] <= '9'))
+ cifsFYI = (int) (c[0] - '0'); /* see cifs_debug.h for meanings */
return count;
}
@@ -471,20 +469,12 @@ static int cifs_linux_ext_proc_open(struct inode *inode, struct file *file)
static ssize_t cifs_linux_ext_proc_write(struct file *file,
const char __user *buffer, size_t count, loff_t *ppos)
{
- char c;
- bool bv;
int rc;
- rc = get_user(c, buffer);
+ rc = kstrtobool_from_user(buffer, count, &linuxExtEnabled);
if (rc)
return rc;
- rc = strtobool(&c, &bv);
- if (rc)
- return rc;
-
- linuxExtEnabled = bv;
-
return count;
}
@@ -511,20 +501,12 @@ static int cifs_lookup_cache_proc_open(struct inode *inode, struct file *file)
static ssize_t cifs_lookup_cache_proc_write(struct file *file,
const char __user *buffer, size_t count, loff_t *ppos)
{
- char c;
- bool bv;
int rc;
- rc = get_user(c, buffer);
+ rc = kstrtobool_from_user(buffer, count, &lookupCacheEnabled);
if (rc)
return rc;
- rc = strtobool(&c, &bv);
- if (rc)
- return rc;
-
- lookupCacheEnabled = bv;
-
return count;
}
@@ -551,20 +533,12 @@ static int traceSMB_proc_open(struct inode *inode, struct file *file)
static ssize_t traceSMB_proc_write(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos)
{
- char c;
- bool bv;
int rc;
- rc = get_user(c, buffer);
+ rc = kstrtobool_from_user(buffer, count, &traceSMB);
if (rc)
return rc;
- rc = strtobool(&c, &bv);
- if (rc)
- return rc;
-
- traceSMB = bv;
-
return count;
}
@@ -622,7 +596,6 @@ static ssize_t cifs_security_flags_proc_write(struct file *file,
int rc;
unsigned int flags;
char flags_string[12];
- char c;
bool bv;
if ((count < 1) || (count > 11))
@@ -635,11 +608,10 @@ static ssize_t cifs_security_flags_proc_write(struct file *file,
if (count < 3) {
/* single char or single char followed by null */
- c = flags_string[0];
- if (strtobool(&c, &bv) == 0) {
+ if (strtobool(flags_string, &bv) == 0) {
global_secflags = bv ? CIFSSEC_MAX : CIFSSEC_DEF;
return count;
- } else if (!isdigit(c)) {
+ } else if (!isdigit(flags_string[0])) {
cifs_dbg(VFS, "Invalid SecurityFlags: %s\n",
flags_string);
return -EINVAL;
diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h
index 66cf0f9fff89..c611ca2339d7 100644
--- a/fs/cifs/cifs_debug.h
+++ b/fs/cifs/cifs_debug.h
@@ -25,7 +25,7 @@
void cifs_dump_mem(char *label, void *data, int length);
void cifs_dump_detail(void *);
void cifs_dump_mids(struct TCP_Server_Info *);
-extern int traceSMB; /* flag which enables the function below */
+extern bool traceSMB; /* flag which enables the function below */
void dump_smb(void *, int);
#define CIFS_INFO 0x01
#define CIFS_RC 0x02
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index e682b36a210f..4897dacf8944 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -33,6 +33,7 @@
#include <linux/ctype.h>
#include <linux/random.h>
#include <linux/highmem.h>
+#include <crypto/skcipher.h>
static int
cifs_crypto_shash_md5_allocate(struct TCP_Server_Info *server)
@@ -789,38 +790,46 @@ int
calc_seckey(struct cifs_ses *ses)
{
int rc;
- struct crypto_blkcipher *tfm_arc4;
+ struct crypto_skcipher *tfm_arc4;
struct scatterlist sgin, sgout;
- struct blkcipher_desc desc;
+ struct skcipher_request *req;
unsigned char sec_key[CIFS_SESS_KEY_SIZE]; /* a nonce */
get_random_bytes(sec_key, CIFS_SESS_KEY_SIZE);
- tfm_arc4 = crypto_alloc_blkcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC);
+ tfm_arc4 = crypto_alloc_skcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC);
if (IS_ERR(tfm_arc4)) {
rc = PTR_ERR(tfm_arc4);
cifs_dbg(VFS, "could not allocate crypto API arc4\n");
return rc;
}
- desc.tfm = tfm_arc4;
-
- rc = crypto_blkcipher_setkey(tfm_arc4, ses->auth_key.response,
+ rc = crypto_skcipher_setkey(tfm_arc4, ses->auth_key.response,
CIFS_SESS_KEY_SIZE);
if (rc) {
cifs_dbg(VFS, "%s: Could not set response as a key\n",
__func__);
- return rc;
+ goto out_free_cipher;
+ }
+
+ req = skcipher_request_alloc(tfm_arc4, GFP_KERNEL);
+ if (!req) {
+ rc = -ENOMEM;
+ cifs_dbg(VFS, "could not allocate crypto API arc4 request\n");
+ goto out_free_cipher;
}
sg_init_one(&sgin, sec_key, CIFS_SESS_KEY_SIZE);
sg_init_one(&sgout, ses->ntlmssp->ciphertext, CIFS_CPHTXT_SIZE);
- rc = crypto_blkcipher_encrypt(&desc, &sgout, &sgin, CIFS_CPHTXT_SIZE);
+ skcipher_request_set_callback(req, 0, NULL, NULL);
+ skcipher_request_set_crypt(req, &sgin, &sgout, CIFS_CPHTXT_SIZE, NULL);
+
+ rc = crypto_skcipher_encrypt(req);
+ skcipher_request_free(req);
if (rc) {
cifs_dbg(VFS, "could not encrypt session key rc: %d\n", rc);
- crypto_free_blkcipher(tfm_arc4);
- return rc;
+ goto out_free_cipher;
}
/* make secondary_key/nonce as session key */
@@ -828,7 +837,8 @@ calc_seckey(struct cifs_ses *ses)
/* and make len as that of session key only */
ses->auth_key.len = CIFS_SESS_KEY_SIZE;
- crypto_free_blkcipher(tfm_arc4);
+out_free_cipher:
+ crypto_free_skcipher(tfm_arc4);
return rc;
}
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index c48ca13673e3..1d86fc620e5c 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -54,10 +54,10 @@
#endif
int cifsFYI = 0;
-int traceSMB = 0;
+bool traceSMB;
bool enable_oplocks = true;
-unsigned int linuxExtEnabled = 1;
-unsigned int lookupCacheEnabled = 1;
+bool linuxExtEnabled = true;
+bool lookupCacheEnabled = true;
unsigned int global_secflags = CIFSSEC_DEF;
/* unsigned int ntlmv2_support = 0; */
unsigned int sign_CIFS_PDUs = 1;
@@ -642,9 +642,7 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb)
while (*s && *s != sep)
s++;
- inode_lock(dir);
- child = lookup_one_len(p, dentry, s - p);
- inode_unlock(dir);
+ child = lookup_one_len_unlocked(p, dentry, s - p);
dput(dentry);
dentry = child;
} while (!IS_ERR(dentry));
@@ -1013,7 +1011,6 @@ const struct file_operations cifs_file_strict_ops = {
.llseek = cifs_llseek,
.unlocked_ioctl = cifs_ioctl,
.clone_file_range = cifs_clone_file_range,
- .clone_file_range = cifs_clone_file_range,
.setlease = cifs_setlease,
.fallocate = cifs_fallocate,
};
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 68c4547528c4..83aac8ba50b0 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -31,19 +31,15 @@
* so that it will fit. We use hash_64 to convert the value to 31 bits, and
* then add 1, to ensure that we don't end up with a 0 as the value.
*/
-#if BITS_PER_LONG == 64
static inline ino_t
cifs_uniqueid_to_ino_t(u64 fileid)
{
+ if ((sizeof(ino_t)) < (sizeof(u64)))
+ return (ino_t)hash_64(fileid, (sizeof(ino_t) * 8) - 1) + 1;
+
return (ino_t)fileid;
+
}
-#else
-static inline ino_t
-cifs_uniqueid_to_ino_t(u64 fileid)
-{
- return (ino_t)hash_64(fileid, (sizeof(ino_t) * 8) - 1) + 1;
-}
-#endif
extern struct file_system_type cifs_fs_type;
extern const struct address_space_operations cifs_addr_ops;
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index a25b2513f146..d21da9f05bae 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -1596,11 +1596,11 @@ GLOBAL_EXTERN atomic_t midCount;
/* Misc globals */
GLOBAL_EXTERN bool enable_oplocks; /* enable or disable oplocks */
-GLOBAL_EXTERN unsigned int lookupCacheEnabled;
+GLOBAL_EXTERN bool lookupCacheEnabled;
GLOBAL_EXTERN unsigned int global_secflags; /* if on, session setup sent
with more secure ntlmssp2 challenge/resp */
GLOBAL_EXTERN unsigned int sign_CIFS_PDUs; /* enable smb packet signing */
-GLOBAL_EXTERN unsigned int linuxExtEnabled;/*enable Linux/Unix CIFS extensions*/
+GLOBAL_EXTERN bool linuxExtEnabled;/*enable Linux/Unix CIFS extensions*/
GLOBAL_EXTERN unsigned int CIFSMaxBufSize; /* max size not including hdr */
GLOBAL_EXTERN unsigned int cifs_min_rcv; /* min size of big ntwrk buf pool */
GLOBAL_EXTERN unsigned int cifs_min_small; /* min size of small buf pool */
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 90b4f9f7de66..76fcb50295a3 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -1396,11 +1396,10 @@ openRetry:
* current bigbuf.
*/
static int
-cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
+discard_remaining_data(struct TCP_Server_Info *server)
{
unsigned int rfclen = get_rfc1002_length(server->smallbuf);
int remaining = rfclen + 4 - server->total_read;
- struct cifs_readdata *rdata = mid->callback_data;
while (remaining > 0) {
int length;
@@ -1414,10 +1413,20 @@ cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
remaining -= length;
}
- dequeue_mid(mid, rdata->result);
return 0;
}
+static int
+cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
+{
+ int length;
+ struct cifs_readdata *rdata = mid->callback_data;
+
+ length = discard_remaining_data(server);
+ dequeue_mid(mid, rdata->result);
+ return length;
+}
+
int
cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
{
@@ -1446,6 +1455,12 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
return length;
server->total_read += length;
+ if (server->ops->is_status_pending &&
+ server->ops->is_status_pending(buf, server, 0)) {
+ discard_remaining_data(server);
+ return -1;
+ }
+
/* Was the SMB read successful? */
rdata->result = server->ops->map_error(buf, false);
if (rdata->result != 0) {
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 10f8d5cf5681..42e1f440eb1e 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -1106,21 +1106,25 @@ parse_lease_state(struct TCP_Server_Info *server, struct smb2_create_rsp *rsp,
{
char *data_offset;
struct create_context *cc;
- unsigned int next = 0;
+ unsigned int next;
+ unsigned int remaining;
char *name;
data_offset = (char *)rsp + 4 + le32_to_cpu(rsp->CreateContextsOffset);
+ remaining = le32_to_cpu(rsp->CreateContextsLength);
cc = (struct create_context *)data_offset;
- do {
- cc = (struct create_context *)((char *)cc + next);
+ while (remaining >= sizeof(struct create_context)) {
name = le16_to_cpu(cc->NameOffset) + (char *)cc;
- if (le16_to_cpu(cc->NameLength) != 4 ||
- strncmp(name, "RqLs", 4)) {
- next = le32_to_cpu(cc->Next);
- continue;
- }
- return server->ops->parse_lease_buf(cc, epoch);
- } while (next != 0);
+ if (le16_to_cpu(cc->NameLength) == 4 &&
+ strncmp(name, "RqLs", 4) == 0)
+ return server->ops->parse_lease_buf(cc, epoch);
+
+ next = le32_to_cpu(cc->Next);
+ if (!next)
+ break;
+ remaining -= next;
+ cc = (struct create_context *)((char *)cc + next);
+ }
return 0;
}
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index a4232ec4f2ba..699b7868108f 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -23,6 +23,7 @@
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
+#include <crypto/skcipher.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/fs.h>
@@ -70,31 +71,42 @@ smbhash(unsigned char *out, const unsigned char *in, unsigned char *key)
{
int rc;
unsigned char key2[8];
- struct crypto_blkcipher *tfm_des;
+ struct crypto_skcipher *tfm_des;
struct scatterlist sgin, sgout;
- struct blkcipher_desc desc;
+ struct skcipher_request *req;
str_to_key(key, key2);
- tfm_des = crypto_alloc_blkcipher("ecb(des)", 0, CRYPTO_ALG_ASYNC);
+ tfm_des = crypto_alloc_skcipher("ecb(des)", 0, CRYPTO_ALG_ASYNC);
if (IS_ERR(tfm_des)) {
rc = PTR_ERR(tfm_des);
cifs_dbg(VFS, "could not allocate des crypto API\n");
goto smbhash_err;
}
- desc.tfm = tfm_des;
+ req = skcipher_request_alloc(tfm_des, GFP_KERNEL);
+ if (!req) {
+ rc = -ENOMEM;
+ cifs_dbg(VFS, "could not allocate des crypto API\n");
+ goto smbhash_free_skcipher;
+ }
- crypto_blkcipher_setkey(tfm_des, key2, 8);
+ crypto_skcipher_setkey(tfm_des, key2, 8);
sg_init_one(&sgin, in, 8);
sg_init_one(&sgout, out, 8);
- rc = crypto_blkcipher_encrypt(&desc, &sgout, &sgin, 8);
+ skcipher_request_set_callback(req, 0, NULL, NULL);
+ skcipher_request_set_crypt(req, &sgin, &sgout, 8, NULL);
+
+ rc = crypto_skcipher_encrypt(req);
if (rc)
cifs_dbg(VFS, "could not encrypt crypt key rc: %d\n", rc);
- crypto_free_blkcipher(tfm_des);
+ skcipher_request_free(req);
+
+smbhash_free_skcipher:
+ crypto_free_skcipher(tfm_des);
smbhash_err:
return rc;
}
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 6402eaf8ab95..bd01b92aad98 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -1040,28 +1040,6 @@ COMPATIBLE_IOCTL(PPPIOCGL2TPSTATS)
/* PPPOX */
COMPATIBLE_IOCTL(PPPOEIOCSFWD)
COMPATIBLE_IOCTL(PPPOEIOCDFWD)
-/* ppdev */
-COMPATIBLE_IOCTL(PPSETMODE)
-COMPATIBLE_IOCTL(PPRSTATUS)
-COMPATIBLE_IOCTL(PPRCONTROL)
-COMPATIBLE_IOCTL(PPWCONTROL)
-COMPATIBLE_IOCTL(PPFCONTROL)
-COMPATIBLE_IOCTL(PPRDATA)
-COMPATIBLE_IOCTL(PPWDATA)
-COMPATIBLE_IOCTL(PPCLAIM)
-COMPATIBLE_IOCTL(PPRELEASE)
-COMPATIBLE_IOCTL(PPYIELD)
-COMPATIBLE_IOCTL(PPEXCL)
-COMPATIBLE_IOCTL(PPDATADIR)
-COMPATIBLE_IOCTL(PPNEGOT)
-COMPATIBLE_IOCTL(PPWCTLONIRQ)
-COMPATIBLE_IOCTL(PPCLRIRQ)
-COMPATIBLE_IOCTL(PPSETPHASE)
-COMPATIBLE_IOCTL(PPGETMODES)
-COMPATIBLE_IOCTL(PPGETMODE)
-COMPATIBLE_IOCTL(PPGETPHASE)
-COMPATIBLE_IOCTL(PPGETFLAGS)
-COMPATIBLE_IOCTL(PPSETFLAGS)
/* Big A */
/* sparc only */
/* Big Q for sound/OSS */
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index f419519ec41f..ea59c891fc53 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -432,14 +432,9 @@ static int configfs_attach_attr(struct configfs_dirent * sd, struct dentry * den
(sd->s_type & CONFIGFS_ITEM_BIN_ATTR) ?
configfs_init_bin_file :
configfs_init_file);
- if (error) {
+ if (error)
configfs_put(sd);
- return error;
- }
-
- d_rehash(dentry);
-
- return 0;
+ return error;
}
static struct dentry * configfs_lookup(struct inode *dir,
@@ -701,23 +696,29 @@ static int populate_groups(struct config_group *group)
{
struct config_group *new_group;
int ret = 0;
- int i;
-
- if (group->default_groups) {
- for (i = 0; group->default_groups[i]; i++) {
- new_group = group->default_groups[i];
- ret = create_default_group(group, new_group);
- if (ret) {
- detach_groups(group);
- break;
- }
+ list_for_each_entry(new_group, &group->default_groups, group_entry) {
+ ret = create_default_group(group, new_group);
+ if (ret) {
+ detach_groups(group);
+ break;
}
}
return ret;
}
+void configfs_remove_default_groups(struct config_group *group)
+{
+ struct config_group *g, *n;
+
+ list_for_each_entry_safe(g, n, &group->default_groups, group_entry) {
+ list_del(&g->group_entry);
+ config_item_put(&g->cg_item);
+ }
+}
+EXPORT_SYMBOL(configfs_remove_default_groups);
+
/*
* All of link_obj/unlink_obj/link_group/unlink_group require that
* subsys->su_mutex is held.
@@ -766,15 +767,10 @@ static void link_obj(struct config_item *parent_item, struct config_item *item)
static void unlink_group(struct config_group *group)
{
- int i;
struct config_group *new_group;
- if (group->default_groups) {
- for (i = 0; group->default_groups[i]; i++) {
- new_group = group->default_groups[i];
- unlink_group(new_group);
- }
- }
+ list_for_each_entry(new_group, &group->default_groups, group_entry)
+ unlink_group(new_group);
group->cg_subsys = NULL;
unlink_obj(&group->cg_item);
@@ -782,7 +778,6 @@ static void unlink_group(struct config_group *group)
static void link_group(struct config_group *parent_group, struct config_group *group)
{
- int i;
struct config_group *new_group;
struct configfs_subsystem *subsys = NULL; /* gcc is a turd */
@@ -796,12 +791,8 @@ static void link_group(struct config_group *parent_group, struct config_group *g
BUG();
group->cg_subsys = subsys;
- if (group->default_groups) {
- for (i = 0; group->default_groups[i]; i++) {
- new_group = group->default_groups[i];
- link_group(group, new_group);
- }
- }
+ list_for_each_entry(new_group, &group->default_groups, group_entry)
+ link_group(group, new_group);
}
/*
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index cee087d8f7e0..03d124ae27d7 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -75,7 +75,8 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr)
sd_iattr->ia_mode = sd->s_mode;
sd_iattr->ia_uid = GLOBAL_ROOT_UID;
sd_iattr->ia_gid = GLOBAL_ROOT_GID;
- sd_iattr->ia_atime = sd_iattr->ia_mtime = sd_iattr->ia_ctime = CURRENT_TIME;
+ sd_iattr->ia_atime = sd_iattr->ia_mtime =
+ sd_iattr->ia_ctime = current_fs_time(inode->i_sb);
sd->s_iattr = sd_iattr;
}
/* attributes were changed atleast once in past */
@@ -111,7 +112,8 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr)
static inline void set_default_inode_attr(struct inode * inode, umode_t mode)
{
inode->i_mode = mode;
- inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ inode->i_atime = inode->i_mtime =
+ inode->i_ctime = current_fs_time(inode->i_sb);
}
static inline void set_inode_attr(struct inode * inode, struct iattr * iattr)
@@ -195,13 +197,21 @@ int configfs_create(struct dentry * dentry, umode_t mode, void (*init)(struct in
return -ENOMEM;
p_inode = d_inode(dentry->d_parent);
- p_inode->i_mtime = p_inode->i_ctime = CURRENT_TIME;
+ p_inode->i_mtime = p_inode->i_ctime = current_fs_time(p_inode->i_sb);
configfs_set_inode_lock_class(sd, inode);
init(inode);
- d_instantiate(dentry, inode);
- if (S_ISDIR(mode) || S_ISLNK(mode))
+ if (S_ISDIR(mode) || S_ISLNK(mode)) {
+ /*
+ * ->symlink(), ->mkdir(), configfs_register_subsystem() or
+ * create_default_group() - already hashed.
+ */
+ d_instantiate(dentry, inode);
dget(dentry); /* pin link and directory dentries in core */
+ } else {
+ /* ->lookup() */
+ d_add(dentry, inode);
+ }
return error;
}
diff --git a/fs/configfs/item.c b/fs/configfs/item.c
index b863a09cd2f1..8b2a994042dd 100644
--- a/fs/configfs/item.c
+++ b/fs/configfs/item.c
@@ -182,6 +182,7 @@ void config_group_init(struct config_group *group)
{
config_item_init(&group->cg_item);
INIT_LIST_HEAD(&group->cg_children);
+ INIT_LIST_HEAD(&group->default_groups);
}
EXPORT_SYMBOL(config_group_init);
diff --git a/fs/coredump.c b/fs/coredump.c
index 9ea87e9fdccf..47c32c3bfa1d 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -32,6 +32,9 @@
#include <linux/pipe_fs_i.h>
#include <linux/oom.h>
#include <linux/compat.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/path.h>
#include <linux/timekeeping.h>
#include <asm/uaccess.h>
@@ -649,6 +652,8 @@ void do_coredump(const siginfo_t *siginfo)
}
} else {
struct inode *inode;
+ int open_flags = O_CREAT | O_RDWR | O_NOFOLLOW |
+ O_LARGEFILE | O_EXCL;
if (cprm.limit < binfmt->min_coredump)
goto fail_unlock;
@@ -687,10 +692,27 @@ void do_coredump(const siginfo_t *siginfo)
* what matters is that at least one of the two processes
* writes its coredump successfully, not which one.
*/
- cprm.file = filp_open(cn.corename,
- O_CREAT | 2 | O_NOFOLLOW |
- O_LARGEFILE | O_EXCL,
- 0600);
+ if (need_suid_safe) {
+ /*
+ * Using user namespaces, normal user tasks can change
+ * their current->fs->root to point to arbitrary
+ * directories. Since the intention of the "only dump
+ * with a fully qualified path" rule is to control where
+ * coredumps may be placed using root privileges,
+ * current->fs->root must not be used. Instead, use the
+ * root directory of init_task.
+ */
+ struct path root;
+
+ task_lock(&init_task);
+ get_fs_root(init_task.fs, &root);
+ task_unlock(&init_task);
+ cprm.file = file_open_root(root.dentry, root.mnt,
+ cn.corename, open_flags, 0600);
+ path_put(&root);
+ } else {
+ cprm.file = filp_open(cn.corename, open_flags, 0600);
+ }
if (IS_ERR(cprm.file))
goto fail_unlock;
diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig
new file mode 100644
index 000000000000..92348faf9865
--- /dev/null
+++ b/fs/crypto/Kconfig
@@ -0,0 +1,18 @@
+config FS_ENCRYPTION
+ tristate "FS Encryption (Per-file encryption)"
+ depends on BLOCK
+ select CRYPTO
+ select CRYPTO_AES
+ select CRYPTO_CBC
+ select CRYPTO_ECB
+ select CRYPTO_XTS
+ select CRYPTO_CTS
+ select CRYPTO_CTR
+ select CRYPTO_SHA256
+ select KEYS
+ select ENCRYPTED_KEYS
+ help
+ Enable encryption of files and directories. This
+ feature is similar to ecryptfs, but it is more memory
+ efficient since it avoids caching the encrypted and
+ decrypted pages in the page cache.
diff --git a/fs/crypto/Makefile b/fs/crypto/Makefile
new file mode 100644
index 000000000000..f17684c48739
--- /dev/null
+++ b/fs/crypto/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_FS_ENCRYPTION) += fscrypto.o
+
+fscrypto-y := crypto.o fname.o policy.o keyinfo.o
diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
new file mode 100644
index 000000000000..06cd1a22240b
--- /dev/null
+++ b/fs/crypto/crypto.c
@@ -0,0 +1,555 @@
+/*
+ * This contains encryption functions for per-file encryption.
+ *
+ * Copyright (C) 2015, Google, Inc.
+ * Copyright (C) 2015, Motorola Mobility
+ *
+ * Written by Michael Halcrow, 2014.
+ *
+ * Filename encryption additions
+ * Uday Savagaonkar, 2014
+ * Encryption policy handling additions
+ * Ildar Muslukhov, 2014
+ * Add fscrypt_pullback_bio_page()
+ * Jaegeuk Kim, 2015.
+ *
+ * This has not yet undergone a rigorous security audit.
+ *
+ * The usage of AES-XTS should conform to recommendations in NIST
+ * Special Publication 800-38E and IEEE P1619/D16.
+ */
+
+#include <linux/pagemap.h>
+#include <linux/mempool.h>
+#include <linux/module.h>
+#include <linux/scatterlist.h>
+#include <linux/ratelimit.h>
+#include <linux/bio.h>
+#include <linux/dcache.h>
+#include <linux/fscrypto.h>
+#include <linux/ecryptfs.h>
+
+static unsigned int num_prealloc_crypto_pages = 32;
+static unsigned int num_prealloc_crypto_ctxs = 128;
+
+module_param(num_prealloc_crypto_pages, uint, 0444);
+MODULE_PARM_DESC(num_prealloc_crypto_pages,
+ "Number of crypto pages to preallocate");
+module_param(num_prealloc_crypto_ctxs, uint, 0444);
+MODULE_PARM_DESC(num_prealloc_crypto_ctxs,
+ "Number of crypto contexts to preallocate");
+
+static mempool_t *fscrypt_bounce_page_pool = NULL;
+
+static LIST_HEAD(fscrypt_free_ctxs);
+static DEFINE_SPINLOCK(fscrypt_ctx_lock);
+
+static struct workqueue_struct *fscrypt_read_workqueue;
+static DEFINE_MUTEX(fscrypt_init_mutex);
+
+static struct kmem_cache *fscrypt_ctx_cachep;
+struct kmem_cache *fscrypt_info_cachep;
+
+/**
+ * fscrypt_release_ctx() - Releases an encryption context
+ * @ctx: The encryption context to release.
+ *
+ * If the encryption context was allocated from the pre-allocated pool, returns
+ * it to that pool. Else, frees it.
+ *
+ * If there's a bounce page in the context, this frees that.
+ */
+void fscrypt_release_ctx(struct fscrypt_ctx *ctx)
+{
+ unsigned long flags;
+
+ if (ctx->flags & FS_WRITE_PATH_FL && ctx->w.bounce_page) {
+ mempool_free(ctx->w.bounce_page, fscrypt_bounce_page_pool);
+ ctx->w.bounce_page = NULL;
+ }
+ ctx->w.control_page = NULL;
+ if (ctx->flags & FS_CTX_REQUIRES_FREE_ENCRYPT_FL) {
+ kmem_cache_free(fscrypt_ctx_cachep, ctx);
+ } else {
+ spin_lock_irqsave(&fscrypt_ctx_lock, flags);
+ list_add(&ctx->free_list, &fscrypt_free_ctxs);
+ spin_unlock_irqrestore(&fscrypt_ctx_lock, flags);
+ }
+}
+EXPORT_SYMBOL(fscrypt_release_ctx);
+
+/**
+ * fscrypt_get_ctx() - Gets an encryption context
+ * @inode: The inode for which we are doing the crypto
+ *
+ * Allocates and initializes an encryption context.
+ *
+ * Return: An allocated and initialized encryption context on success; error
+ * value or NULL otherwise.
+ */
+struct fscrypt_ctx *fscrypt_get_ctx(struct inode *inode)
+{
+ struct fscrypt_ctx *ctx = NULL;
+ struct fscrypt_info *ci = inode->i_crypt_info;
+ unsigned long flags;
+
+ if (ci == NULL)
+ return ERR_PTR(-ENOKEY);
+
+ /*
+ * We first try getting the ctx from a free list because in
+ * the common case the ctx will have an allocated and
+ * initialized crypto tfm, so it's probably a worthwhile
+ * optimization. For the bounce page, we first try getting it
+ * from the kernel allocator because that's just about as fast
+ * as getting it from a list and because a cache of free pages
+ * should generally be a "last resort" option for a filesystem
+ * to be able to do its job.
+ */
+ spin_lock_irqsave(&fscrypt_ctx_lock, flags);
+ ctx = list_first_entry_or_null(&fscrypt_free_ctxs,
+ struct fscrypt_ctx, free_list);
+ if (ctx)
+ list_del(&ctx->free_list);
+ spin_unlock_irqrestore(&fscrypt_ctx_lock, flags);
+ if (!ctx) {
+ ctx = kmem_cache_zalloc(fscrypt_ctx_cachep, GFP_NOFS);
+ if (!ctx)
+ return ERR_PTR(-ENOMEM);
+ ctx->flags |= FS_CTX_REQUIRES_FREE_ENCRYPT_FL;
+ } else {
+ ctx->flags &= ~FS_CTX_REQUIRES_FREE_ENCRYPT_FL;
+ }
+ ctx->flags &= ~FS_WRITE_PATH_FL;
+ return ctx;
+}
+EXPORT_SYMBOL(fscrypt_get_ctx);
+
+/**
+ * fscrypt_complete() - The completion callback for page encryption
+ * @req: The asynchronous encryption request context
+ * @res: The result of the encryption operation
+ */
+static void fscrypt_complete(struct crypto_async_request *req, int res)
+{
+ struct fscrypt_completion_result *ecr = req->data;
+
+ if (res == -EINPROGRESS)
+ return;
+ ecr->res = res;
+ complete(&ecr->completion);
+}
+
+typedef enum {
+ FS_DECRYPT = 0,
+ FS_ENCRYPT,
+} fscrypt_direction_t;
+
+static int do_page_crypto(struct inode *inode,
+ fscrypt_direction_t rw, pgoff_t index,
+ struct page *src_page, struct page *dest_page)
+{
+ u8 xts_tweak[FS_XTS_TWEAK_SIZE];
+ struct skcipher_request *req = NULL;
+ DECLARE_FS_COMPLETION_RESULT(ecr);
+ struct scatterlist dst, src;
+ struct fscrypt_info *ci = inode->i_crypt_info;
+ struct crypto_skcipher *tfm = ci->ci_ctfm;
+ int res = 0;
+
+ req = skcipher_request_alloc(tfm, GFP_NOFS);
+ if (!req) {
+ printk_ratelimited(KERN_ERR
+ "%s: crypto_request_alloc() failed\n",
+ __func__);
+ return -ENOMEM;
+ }
+
+ skcipher_request_set_callback(
+ req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
+ fscrypt_complete, &ecr);
+
+ BUILD_BUG_ON(FS_XTS_TWEAK_SIZE < sizeof(index));
+ memcpy(xts_tweak, &index, sizeof(index));
+ memset(&xts_tweak[sizeof(index)], 0,
+ FS_XTS_TWEAK_SIZE - sizeof(index));
+
+ sg_init_table(&dst, 1);
+ sg_set_page(&dst, dest_page, PAGE_CACHE_SIZE, 0);
+ sg_init_table(&src, 1);
+ sg_set_page(&src, src_page, PAGE_CACHE_SIZE, 0);
+ skcipher_request_set_crypt(req, &src, &dst, PAGE_CACHE_SIZE,
+ xts_tweak);
+ if (rw == FS_DECRYPT)
+ res = crypto_skcipher_decrypt(req);
+ else
+ res = crypto_skcipher_encrypt(req);
+ if (res == -EINPROGRESS || res == -EBUSY) {
+ BUG_ON(req->base.data != &ecr);
+ wait_for_completion(&ecr.completion);
+ res = ecr.res;
+ }
+ skcipher_request_free(req);
+ if (res) {
+ printk_ratelimited(KERN_ERR
+ "%s: crypto_skcipher_encrypt() returned %d\n",
+ __func__, res);
+ return res;
+ }
+ return 0;
+}
+
+static struct page *alloc_bounce_page(struct fscrypt_ctx *ctx)
+{
+ ctx->w.bounce_page = mempool_alloc(fscrypt_bounce_page_pool,
+ GFP_NOWAIT);
+ if (ctx->w.bounce_page == NULL)
+ return ERR_PTR(-ENOMEM);
+ ctx->flags |= FS_WRITE_PATH_FL;
+ return ctx->w.bounce_page;
+}
+
+/**
+ * fscypt_encrypt_page() - Encrypts a page
+ * @inode: The inode for which the encryption should take place
+ * @plaintext_page: The page to encrypt. Must be locked.
+ *
+ * Allocates a ciphertext page and encrypts plaintext_page into it using the ctx
+ * encryption context.
+ *
+ * Called on the page write path. The caller must call
+ * fscrypt_restore_control_page() on the returned ciphertext page to
+ * release the bounce buffer and the encryption context.
+ *
+ * Return: An allocated page with the encrypted content on success. Else, an
+ * error value or NULL.
+ */
+struct page *fscrypt_encrypt_page(struct inode *inode,
+ struct page *plaintext_page)
+{
+ struct fscrypt_ctx *ctx;
+ struct page *ciphertext_page = NULL;
+ int err;
+
+ BUG_ON(!PageLocked(plaintext_page));
+
+ ctx = fscrypt_get_ctx(inode);
+ if (IS_ERR(ctx))
+ return (struct page *)ctx;
+
+ /* The encryption operation will require a bounce page. */
+ ciphertext_page = alloc_bounce_page(ctx);
+ if (IS_ERR(ciphertext_page))
+ goto errout;
+
+ ctx->w.control_page = plaintext_page;
+ err = do_page_crypto(inode, FS_ENCRYPT, plaintext_page->index,
+ plaintext_page, ciphertext_page);
+ if (err) {
+ ciphertext_page = ERR_PTR(err);
+ goto errout;
+ }
+ SetPagePrivate(ciphertext_page);
+ set_page_private(ciphertext_page, (unsigned long)ctx);
+ lock_page(ciphertext_page);
+ return ciphertext_page;
+
+errout:
+ fscrypt_release_ctx(ctx);
+ return ciphertext_page;
+}
+EXPORT_SYMBOL(fscrypt_encrypt_page);
+
+/**
+ * f2crypt_decrypt_page() - Decrypts a page in-place
+ * @page: The page to decrypt. Must be locked.
+ *
+ * Decrypts page in-place using the ctx encryption context.
+ *
+ * Called from the read completion callback.
+ *
+ * Return: Zero on success, non-zero otherwise.
+ */
+int fscrypt_decrypt_page(struct page *page)
+{
+ BUG_ON(!PageLocked(page));
+
+ return do_page_crypto(page->mapping->host,
+ FS_DECRYPT, page->index, page, page);
+}
+EXPORT_SYMBOL(fscrypt_decrypt_page);
+
+int fscrypt_zeroout_range(struct inode *inode, pgoff_t lblk,
+ sector_t pblk, unsigned int len)
+{
+ struct fscrypt_ctx *ctx;
+ struct page *ciphertext_page = NULL;
+ struct bio *bio;
+ int ret, err = 0;
+
+ BUG_ON(inode->i_sb->s_blocksize != PAGE_CACHE_SIZE);
+
+ ctx = fscrypt_get_ctx(inode);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ ciphertext_page = alloc_bounce_page(ctx);
+ if (IS_ERR(ciphertext_page)) {
+ err = PTR_ERR(ciphertext_page);
+ goto errout;
+ }
+
+ while (len--) {
+ err = do_page_crypto(inode, FS_ENCRYPT, lblk,
+ ZERO_PAGE(0), ciphertext_page);
+ if (err)
+ goto errout;
+
+ bio = bio_alloc(GFP_KERNEL, 1);
+ if (!bio) {
+ err = -ENOMEM;
+ goto errout;
+ }
+ bio->bi_bdev = inode->i_sb->s_bdev;
+ bio->bi_iter.bi_sector =
+ pblk << (inode->i_sb->s_blocksize_bits - 9);
+ ret = bio_add_page(bio, ciphertext_page,
+ inode->i_sb->s_blocksize, 0);
+ if (ret != inode->i_sb->s_blocksize) {
+ /* should never happen! */
+ WARN_ON(1);
+ bio_put(bio);
+ err = -EIO;
+ goto errout;
+ }
+ err = submit_bio_wait(WRITE, bio);
+ if ((err == 0) && bio->bi_error)
+ err = -EIO;
+ bio_put(bio);
+ if (err)
+ goto errout;
+ lblk++;
+ pblk++;
+ }
+ err = 0;
+errout:
+ fscrypt_release_ctx(ctx);
+ return err;
+}
+EXPORT_SYMBOL(fscrypt_zeroout_range);
+
+/*
+ * Validate dentries for encrypted directories to make sure we aren't
+ * potentially caching stale data after a key has been added or
+ * removed.
+ */
+static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags)
+{
+ struct inode *dir = d_inode(dentry->d_parent);
+ struct fscrypt_info *ci = dir->i_crypt_info;
+ int dir_has_key, cached_with_key;
+
+ if (!dir->i_sb->s_cop->is_encrypted(dir))
+ return 0;
+
+ if (ci && ci->ci_keyring_key &&
+ (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) |
+ (1 << KEY_FLAG_REVOKED) |
+ (1 << KEY_FLAG_DEAD))))
+ ci = NULL;
+
+ /* this should eventually be an flag in d_flags */
+ spin_lock(&dentry->d_lock);
+ cached_with_key = dentry->d_flags & DCACHE_ENCRYPTED_WITH_KEY;
+ spin_unlock(&dentry->d_lock);
+ dir_has_key = (ci != NULL);
+
+ /*
+ * If the dentry was cached without the key, and it is a
+ * negative dentry, it might be a valid name. We can't check
+ * if the key has since been made available due to locking
+ * reasons, so we fail the validation so ext4_lookup() can do
+ * this check.
+ *
+ * We also fail the validation if the dentry was created with
+ * the key present, but we no longer have the key, or vice versa.
+ */
+ if ((!cached_with_key && d_is_negative(dentry)) ||
+ (!cached_with_key && dir_has_key) ||
+ (cached_with_key && !dir_has_key))
+ return 0;
+ return 1;
+}
+
+const struct dentry_operations fscrypt_d_ops = {
+ .d_revalidate = fscrypt_d_revalidate,
+};
+EXPORT_SYMBOL(fscrypt_d_ops);
+
+/*
+ * Call fscrypt_decrypt_page on every single page, reusing the encryption
+ * context.
+ */
+static void completion_pages(struct work_struct *work)
+{
+ struct fscrypt_ctx *ctx =
+ container_of(work, struct fscrypt_ctx, r.work);
+ struct bio *bio = ctx->r.bio;
+ struct bio_vec *bv;
+ int i;
+
+ bio_for_each_segment_all(bv, bio, i) {
+ struct page *page = bv->bv_page;
+ int ret = fscrypt_decrypt_page(page);
+
+ if (ret) {
+ WARN_ON_ONCE(1);
+ SetPageError(page);
+ } else {
+ SetPageUptodate(page);
+ }
+ unlock_page(page);
+ }
+ fscrypt_release_ctx(ctx);
+ bio_put(bio);
+}
+
+void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *ctx, struct bio *bio)
+{
+ INIT_WORK(&ctx->r.work, completion_pages);
+ ctx->r.bio = bio;
+ queue_work(fscrypt_read_workqueue, &ctx->r.work);
+}
+EXPORT_SYMBOL(fscrypt_decrypt_bio_pages);
+
+void fscrypt_pullback_bio_page(struct page **page, bool restore)
+{
+ struct fscrypt_ctx *ctx;
+ struct page *bounce_page;
+
+ /* The bounce data pages are unmapped. */
+ if ((*page)->mapping)
+ return;
+
+ /* The bounce data page is unmapped. */
+ bounce_page = *page;
+ ctx = (struct fscrypt_ctx *)page_private(bounce_page);
+
+ /* restore control page */
+ *page = ctx->w.control_page;
+
+ if (restore)
+ fscrypt_restore_control_page(bounce_page);
+}
+EXPORT_SYMBOL(fscrypt_pullback_bio_page);
+
+void fscrypt_restore_control_page(struct page *page)
+{
+ struct fscrypt_ctx *ctx;
+
+ ctx = (struct fscrypt_ctx *)page_private(page);
+ set_page_private(page, (unsigned long)NULL);
+ ClearPagePrivate(page);
+ unlock_page(page);
+ fscrypt_release_ctx(ctx);
+}
+EXPORT_SYMBOL(fscrypt_restore_control_page);
+
+static void fscrypt_destroy(void)
+{
+ struct fscrypt_ctx *pos, *n;
+
+ list_for_each_entry_safe(pos, n, &fscrypt_free_ctxs, free_list)
+ kmem_cache_free(fscrypt_ctx_cachep, pos);
+ INIT_LIST_HEAD(&fscrypt_free_ctxs);
+ mempool_destroy(fscrypt_bounce_page_pool);
+ fscrypt_bounce_page_pool = NULL;
+}
+
+/**
+ * fscrypt_initialize() - allocate major buffers for fs encryption.
+ *
+ * We only call this when we start accessing encrypted files, since it
+ * results in memory getting allocated that wouldn't otherwise be used.
+ *
+ * Return: Zero on success, non-zero otherwise.
+ */
+int fscrypt_initialize(void)
+{
+ int i, res = -ENOMEM;
+
+ if (fscrypt_bounce_page_pool)
+ return 0;
+
+ mutex_lock(&fscrypt_init_mutex);
+ if (fscrypt_bounce_page_pool)
+ goto already_initialized;
+
+ for (i = 0; i < num_prealloc_crypto_ctxs; i++) {
+ struct fscrypt_ctx *ctx;
+
+ ctx = kmem_cache_zalloc(fscrypt_ctx_cachep, GFP_NOFS);
+ if (!ctx)
+ goto fail;
+ list_add(&ctx->free_list, &fscrypt_free_ctxs);
+ }
+
+ fscrypt_bounce_page_pool =
+ mempool_create_page_pool(num_prealloc_crypto_pages, 0);
+ if (!fscrypt_bounce_page_pool)
+ goto fail;
+
+already_initialized:
+ mutex_unlock(&fscrypt_init_mutex);
+ return 0;
+fail:
+ fscrypt_destroy();
+ mutex_unlock(&fscrypt_init_mutex);
+ return res;
+}
+EXPORT_SYMBOL(fscrypt_initialize);
+
+/**
+ * fscrypt_init() - Set up for fs encryption.
+ */
+static int __init fscrypt_init(void)
+{
+ fscrypt_read_workqueue = alloc_workqueue("fscrypt_read_queue",
+ WQ_HIGHPRI, 0);
+ if (!fscrypt_read_workqueue)
+ goto fail;
+
+ fscrypt_ctx_cachep = KMEM_CACHE(fscrypt_ctx, SLAB_RECLAIM_ACCOUNT);
+ if (!fscrypt_ctx_cachep)
+ goto fail_free_queue;
+
+ fscrypt_info_cachep = KMEM_CACHE(fscrypt_info, SLAB_RECLAIM_ACCOUNT);
+ if (!fscrypt_info_cachep)
+ goto fail_free_ctx;
+
+ return 0;
+
+fail_free_ctx:
+ kmem_cache_destroy(fscrypt_ctx_cachep);
+fail_free_queue:
+ destroy_workqueue(fscrypt_read_workqueue);
+fail:
+ return -ENOMEM;
+}
+module_init(fscrypt_init)
+
+/**
+ * fscrypt_exit() - Shutdown the fs encryption system
+ */
+static void __exit fscrypt_exit(void)
+{
+ fscrypt_destroy();
+
+ if (fscrypt_read_workqueue)
+ destroy_workqueue(fscrypt_read_workqueue);
+ kmem_cache_destroy(fscrypt_ctx_cachep);
+ kmem_cache_destroy(fscrypt_info_cachep);
+}
+module_exit(fscrypt_exit);
+
+MODULE_LICENSE("GPL");
diff --git a/fs/f2fs/crypto_fname.c b/fs/crypto/fname.c
index ab377d496a39..5d6d49113efa 100644
--- a/fs/f2fs/crypto_fname.c
+++ b/fs/crypto/fname.c
@@ -1,46 +1,32 @@
/*
- * linux/fs/f2fs/crypto_fname.c
- *
- * Copied from linux/fs/ext4/crypto.c
+ * This contains functions for filename crypto management
*
* Copyright (C) 2015, Google, Inc.
* Copyright (C) 2015, Motorola Mobility
*
- * This contains functions for filename crypto management in f2fs
- *
* Written by Uday Savagaonkar, 2014.
- *
- * Adjust f2fs dentry structure
- * Jaegeuk Kim, 2015.
+ * Modified by Jaegeuk Kim, 2015.
*
* This has not yet undergone a rigorous security audit.
*/
-#include <crypto/hash.h>
-#include <crypto/sha.h>
+
#include <keys/encrypted-type.h>
#include <keys/user-type.h>
-#include <linux/crypto.h>
-#include <linux/gfp.h>
-#include <linux/kernel.h>
-#include <linux/key.h>
-#include <linux/list.h>
-#include <linux/mempool.h>
-#include <linux/random.h>
#include <linux/scatterlist.h>
-#include <linux/spinlock_types.h>
-#include <linux/f2fs_fs.h>
#include <linux/ratelimit.h>
+#include <linux/fscrypto.h>
-#include "f2fs.h"
-#include "f2fs_crypto.h"
-#include "xattr.h"
+static u32 size_round_up(size_t size, size_t blksize)
+{
+ return ((size + blksize - 1) / blksize) * blksize;
+}
/**
- * f2fs_dir_crypt_complete() -
+ * dir_crypt_complete() -
*/
-static void f2fs_dir_crypt_complete(struct crypto_async_request *req, int res)
+static void dir_crypt_complete(struct crypto_async_request *req, int res)
{
- struct f2fs_completion_result *ecr = req->data;
+ struct fscrypt_completion_result *ecr = req->data;
if (res == -EINPROGRESS)
return;
@@ -48,45 +34,35 @@ static void f2fs_dir_crypt_complete(struct crypto_async_request *req, int res)
complete(&ecr->completion);
}
-bool f2fs_valid_filenames_enc_mode(uint32_t mode)
-{
- return (mode == F2FS_ENCRYPTION_MODE_AES_256_CTS);
-}
-
-static unsigned max_name_len(struct inode *inode)
-{
- return S_ISLNK(inode->i_mode) ? inode->i_sb->s_blocksize :
- F2FS_NAME_LEN;
-}
-
/**
- * f2fs_fname_encrypt() -
+ * fname_encrypt() -
*
* This function encrypts the input filename, and returns the length of the
* ciphertext. Errors are returned as negative numbers. We trust the caller to
* allocate sufficient memory to oname string.
*/
-static int f2fs_fname_encrypt(struct inode *inode,
- const struct qstr *iname, struct f2fs_str *oname)
+static int fname_encrypt(struct inode *inode,
+ const struct qstr *iname, struct fscrypt_str *oname)
{
u32 ciphertext_len;
- struct ablkcipher_request *req = NULL;
- DECLARE_F2FS_COMPLETION_RESULT(ecr);
- struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info;
- struct crypto_ablkcipher *tfm = ci->ci_ctfm;
+ struct skcipher_request *req = NULL;
+ DECLARE_FS_COMPLETION_RESULT(ecr);
+ struct fscrypt_info *ci = inode->i_crypt_info;
+ struct crypto_skcipher *tfm = ci->ci_ctfm;
int res = 0;
- char iv[F2FS_CRYPTO_BLOCK_SIZE];
+ char iv[FS_CRYPTO_BLOCK_SIZE];
struct scatterlist src_sg, dst_sg;
- int padding = 4 << (ci->ci_flags & F2FS_POLICY_FLAGS_PAD_MASK);
+ int padding = 4 << (ci->ci_flags & FS_POLICY_FLAGS_PAD_MASK);
char *workbuf, buf[32], *alloc_buf = NULL;
- unsigned lim = max_name_len(inode);
+ unsigned lim;
+ lim = inode->i_sb->s_cop->max_namelen(inode);
if (iname->len <= 0 || iname->len > lim)
return -EIO;
- ciphertext_len = (iname->len < F2FS_CRYPTO_BLOCK_SIZE) ?
- F2FS_CRYPTO_BLOCK_SIZE : iname->len;
- ciphertext_len = f2fs_fname_crypto_round_up(ciphertext_len, padding);
+ ciphertext_len = (iname->len < FS_CRYPTO_BLOCK_SIZE) ?
+ FS_CRYPTO_BLOCK_SIZE : iname->len;
+ ciphertext_len = size_round_up(ciphertext_len, padding);
ciphertext_len = (ciphertext_len > lim) ? lim : ciphertext_len;
if (ciphertext_len <= sizeof(buf)) {
@@ -99,16 +75,16 @@ static int f2fs_fname_encrypt(struct inode *inode,
}
/* Allocate request */
- req = ablkcipher_request_alloc(tfm, GFP_NOFS);
+ req = skcipher_request_alloc(tfm, GFP_NOFS);
if (!req) {
printk_ratelimited(KERN_ERR
"%s: crypto_request_alloc() failed\n", __func__);
kfree(alloc_buf);
return -ENOMEM;
}
- ablkcipher_request_set_callback(req,
+ skcipher_request_set_callback(req,
CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
- f2fs_dir_crypt_complete, &ecr);
+ dir_crypt_complete, &ecr);
/* Copy the input */
memcpy(workbuf, iname->name, iname->len);
@@ -116,79 +92,78 @@ static int f2fs_fname_encrypt(struct inode *inode,
memset(workbuf + iname->len, 0, ciphertext_len - iname->len);
/* Initialize IV */
- memset(iv, 0, F2FS_CRYPTO_BLOCK_SIZE);
+ memset(iv, 0, FS_CRYPTO_BLOCK_SIZE);
/* Create encryption request */
sg_init_one(&src_sg, workbuf, ciphertext_len);
sg_init_one(&dst_sg, oname->name, ciphertext_len);
- ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv);
- res = crypto_ablkcipher_encrypt(req);
+ skcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv);
+ res = crypto_skcipher_encrypt(req);
if (res == -EINPROGRESS || res == -EBUSY) {
- BUG_ON(req->base.data != &ecr);
wait_for_completion(&ecr.completion);
res = ecr.res;
}
kfree(alloc_buf);
- ablkcipher_request_free(req);
- if (res < 0) {
+ skcipher_request_free(req);
+ if (res < 0)
printk_ratelimited(KERN_ERR
"%s: Error (error code %d)\n", __func__, res);
- }
+
oname->len = ciphertext_len;
return res;
}
/*
- * f2fs_fname_decrypt()
+ * fname_decrypt()
* This function decrypts the input filename, and returns
* the length of the plaintext.
* Errors are returned as negative numbers.
* We trust the caller to allocate sufficient memory to oname string.
*/
-static int f2fs_fname_decrypt(struct inode *inode,
- const struct f2fs_str *iname, struct f2fs_str *oname)
+static int fname_decrypt(struct inode *inode,
+ const struct fscrypt_str *iname,
+ struct fscrypt_str *oname)
{
- struct ablkcipher_request *req = NULL;
- DECLARE_F2FS_COMPLETION_RESULT(ecr);
+ struct skcipher_request *req = NULL;
+ DECLARE_FS_COMPLETION_RESULT(ecr);
struct scatterlist src_sg, dst_sg;
- struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info;
- struct crypto_ablkcipher *tfm = ci->ci_ctfm;
+ struct fscrypt_info *ci = inode->i_crypt_info;
+ struct crypto_skcipher *tfm = ci->ci_ctfm;
int res = 0;
- char iv[F2FS_CRYPTO_BLOCK_SIZE];
- unsigned lim = max_name_len(inode);
+ char iv[FS_CRYPTO_BLOCK_SIZE];
+ unsigned lim;
+ lim = inode->i_sb->s_cop->max_namelen(inode);
if (iname->len <= 0 || iname->len > lim)
return -EIO;
/* Allocate request */
- req = ablkcipher_request_alloc(tfm, GFP_NOFS);
+ req = skcipher_request_alloc(tfm, GFP_NOFS);
if (!req) {
printk_ratelimited(KERN_ERR
"%s: crypto_request_alloc() failed\n", __func__);
return -ENOMEM;
}
- ablkcipher_request_set_callback(req,
+ skcipher_request_set_callback(req,
CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
- f2fs_dir_crypt_complete, &ecr);
+ dir_crypt_complete, &ecr);
/* Initialize IV */
- memset(iv, 0, F2FS_CRYPTO_BLOCK_SIZE);
+ memset(iv, 0, FS_CRYPTO_BLOCK_SIZE);
/* Create decryption request */
sg_init_one(&src_sg, iname->name, iname->len);
sg_init_one(&dst_sg, oname->name, oname->len);
- ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv);
- res = crypto_ablkcipher_decrypt(req);
+ skcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv);
+ res = crypto_skcipher_decrypt(req);
if (res == -EINPROGRESS || res == -EBUSY) {
- BUG_ON(req->base.data != &ecr);
wait_for_completion(&ecr.completion);
res = ecr.res;
}
- ablkcipher_request_free(req);
+ skcipher_request_free(req);
if (res < 0) {
printk_ratelimited(KERN_ERR
- "%s: Error in f2fs_fname_decrypt (error code %d)\n",
- __func__, res);
+ "%s: Error (error code %d)\n", __func__, res);
return res;
}
@@ -200,7 +175,7 @@ static const char *lookup_table =
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,";
/**
- * f2fs_fname_encode_digest() -
+ * digest_encode() -
*
* Encodes the input digest using characters from the set [a-zA-Z0-9_+].
* The encoded string is roughly 4/3 times the size of the input string.
@@ -249,148 +224,152 @@ static int digest_decode(const char *src, int len, char *dst)
return cp - dst;
}
-/**
- * f2fs_fname_crypto_round_up() -
- *
- * Return: The next multiple of block size
- */
-u32 f2fs_fname_crypto_round_up(u32 size, u32 blksize)
+u32 fscrypt_fname_encrypted_size(struct inode *inode, u32 ilen)
{
- return ((size + blksize - 1) / blksize) * blksize;
+ int padding = 32;
+ struct fscrypt_info *ci = inode->i_crypt_info;
+
+ if (ci)
+ padding = 4 << (ci->ci_flags & FS_POLICY_FLAGS_PAD_MASK);
+ if (ilen < FS_CRYPTO_BLOCK_SIZE)
+ ilen = FS_CRYPTO_BLOCK_SIZE;
+ return size_round_up(ilen, padding);
}
+EXPORT_SYMBOL(fscrypt_fname_encrypted_size);
/**
- * f2fs_fname_crypto_alloc_obuff() -
+ * fscrypt_fname_crypto_alloc_obuff() -
*
* Allocates an output buffer that is sufficient for the crypto operation
* specified by the context and the direction.
*/
-int f2fs_fname_crypto_alloc_buffer(struct inode *inode,
- u32 ilen, struct f2fs_str *crypto_str)
+int fscrypt_fname_alloc_buffer(struct inode *inode,
+ u32 ilen, struct fscrypt_str *crypto_str)
{
- unsigned int olen;
- int padding = 16;
- struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info;
+ unsigned int olen = fscrypt_fname_encrypted_size(inode, ilen);
- if (ci)
- padding = 4 << (ci->ci_flags & F2FS_POLICY_FLAGS_PAD_MASK);
- if (padding < F2FS_CRYPTO_BLOCK_SIZE)
- padding = F2FS_CRYPTO_BLOCK_SIZE;
- olen = f2fs_fname_crypto_round_up(ilen, padding);
crypto_str->len = olen;
- if (olen < F2FS_FNAME_CRYPTO_DIGEST_SIZE * 2)
- olen = F2FS_FNAME_CRYPTO_DIGEST_SIZE * 2;
- /* Allocated buffer can hold one more character to null-terminate the
- * string */
+ if (olen < FS_FNAME_CRYPTO_DIGEST_SIZE * 2)
+ olen = FS_FNAME_CRYPTO_DIGEST_SIZE * 2;
+ /*
+ * Allocated buffer can hold one more character to null-terminate the
+ * string
+ */
crypto_str->name = kmalloc(olen + 1, GFP_NOFS);
if (!(crypto_str->name))
return -ENOMEM;
return 0;
}
+EXPORT_SYMBOL(fscrypt_fname_alloc_buffer);
/**
- * f2fs_fname_crypto_free_buffer() -
+ * fscrypt_fname_crypto_free_buffer() -
*
* Frees the buffer allocated for crypto operation.
*/
-void f2fs_fname_crypto_free_buffer(struct f2fs_str *crypto_str)
+void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str)
{
if (!crypto_str)
return;
kfree(crypto_str->name);
crypto_str->name = NULL;
}
+EXPORT_SYMBOL(fscrypt_fname_free_buffer);
/**
- * f2fs_fname_disk_to_usr() - converts a filename from disk space to user space
+ * fscrypt_fname_disk_to_usr() - converts a filename from disk space to user
+ * space
*/
-int f2fs_fname_disk_to_usr(struct inode *inode,
- f2fs_hash_t *hash,
- const struct f2fs_str *iname,
- struct f2fs_str *oname)
+int fscrypt_fname_disk_to_usr(struct inode *inode,
+ u32 hash, u32 minor_hash,
+ const struct fscrypt_str *iname,
+ struct fscrypt_str *oname)
{
const struct qstr qname = FSTR_TO_QSTR(iname);
char buf[24];
int ret;
- if (is_dot_dotdot(&qname)) {
+ if (fscrypt_is_dot_dotdot(&qname)) {
oname->name[0] = '.';
oname->name[iname->len - 1] = '.';
oname->len = iname->len;
return oname->len;
}
- if (F2FS_I(inode)->i_crypt_info)
- return f2fs_fname_decrypt(inode, iname, oname);
+ if (iname->len < FS_CRYPTO_BLOCK_SIZE)
+ return -EUCLEAN;
- if (iname->len <= F2FS_FNAME_CRYPTO_DIGEST_SIZE) {
+ if (inode->i_crypt_info)
+ return fname_decrypt(inode, iname, oname);
+
+ if (iname->len <= FS_FNAME_CRYPTO_DIGEST_SIZE) {
ret = digest_encode(iname->name, iname->len, oname->name);
oname->len = ret;
return ret;
}
if (hash) {
- memcpy(buf, hash, 4);
- memset(buf + 4, 0, 4);
- } else
+ memcpy(buf, &hash, 4);
+ memcpy(buf + 4, &minor_hash, 4);
+ } else {
memset(buf, 0, 8);
+ }
memcpy(buf + 8, iname->name + iname->len - 16, 16);
oname->name[0] = '_';
ret = digest_encode(buf, 24, oname->name + 1);
oname->len = ret + 1;
return ret + 1;
}
+EXPORT_SYMBOL(fscrypt_fname_disk_to_usr);
/**
- * f2fs_fname_usr_to_disk() - converts a filename from user space to disk space
+ * fscrypt_fname_usr_to_disk() - converts a filename from user space to disk
+ * space
*/
-int f2fs_fname_usr_to_disk(struct inode *inode,
+int fscrypt_fname_usr_to_disk(struct inode *inode,
const struct qstr *iname,
- struct f2fs_str *oname)
+ struct fscrypt_str *oname)
{
- int res;
- struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info;
-
- if (is_dot_dotdot(iname)) {
+ if (fscrypt_is_dot_dotdot(iname)) {
oname->name[0] = '.';
oname->name[iname->len - 1] = '.';
oname->len = iname->len;
return oname->len;
}
-
- if (ci) {
- res = f2fs_fname_encrypt(inode, iname, oname);
- return res;
- }
- /* Without a proper key, a user is not allowed to modify the filenames
+ if (inode->i_crypt_info)
+ return fname_encrypt(inode, iname, oname);
+ /*
+ * Without a proper key, a user is not allowed to modify the filenames
* in a directory. Consequently, a user space name cannot be mapped to
- * a disk-space name */
+ * a disk-space name
+ */
return -EACCES;
}
+EXPORT_SYMBOL(fscrypt_fname_usr_to_disk);
-int f2fs_fname_setup_filename(struct inode *dir, const struct qstr *iname,
- int lookup, struct f2fs_filename *fname)
+int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname,
+ int lookup, struct fscrypt_name *fname)
{
- struct f2fs_crypt_info *ci;
int ret = 0, bigname = 0;
- memset(fname, 0, sizeof(struct f2fs_filename));
+ memset(fname, 0, sizeof(struct fscrypt_name));
fname->usr_fname = iname;
- if (!f2fs_encrypted_inode(dir) || is_dot_dotdot(iname)) {
+ if (!dir->i_sb->s_cop->is_encrypted(dir) ||
+ fscrypt_is_dot_dotdot(iname)) {
fname->disk_name.name = (unsigned char *)iname->name;
fname->disk_name.len = iname->len;
return 0;
}
- ret = f2fs_get_encryption_info(dir);
- if (ret)
+ ret = get_crypt_info(dir);
+ if (ret && ret != -EOPNOTSUPP)
return ret;
- ci = F2FS_I(dir)->i_crypt_info;
- if (ci) {
- ret = f2fs_fname_crypto_alloc_buffer(dir, iname->len,
- &fname->crypto_buf);
+
+ if (dir->i_crypt_info) {
+ ret = fscrypt_fname_alloc_buffer(dir, iname->len,
+ &fname->crypto_buf);
if (ret < 0)
return ret;
- ret = f2fs_fname_encrypt(dir, iname, &fname->crypto_buf);
+ ret = fname_encrypt(dir, iname, &fname->crypto_buf);
if (ret < 0)
goto errout;
fname->disk_name.name = fname->crypto_buf.name;
@@ -400,18 +379,19 @@ int f2fs_fname_setup_filename(struct inode *dir, const struct qstr *iname,
if (!lookup)
return -EACCES;
- /* We don't have the key and we are doing a lookup; decode the
+ /*
+ * We don't have the key and we are doing a lookup; decode the
* user-supplied name
*/
if (iname->name[0] == '_')
bigname = 1;
- if ((bigname && (iname->len != 33)) ||
- (!bigname && (iname->len > 43)))
+ if ((bigname && (iname->len != 33)) || (!bigname && (iname->len > 43)))
return -ENOENT;
fname->crypto_buf.name = kmalloc(32, GFP_KERNEL);
if (fname->crypto_buf.name == NULL)
return -ENOMEM;
+
ret = digest_decode(iname->name + bigname, iname->len - bigname,
fname->crypto_buf.name);
if (ret < 0) {
@@ -421,20 +401,24 @@ int f2fs_fname_setup_filename(struct inode *dir, const struct qstr *iname,
fname->crypto_buf.len = ret;
if (bigname) {
memcpy(&fname->hash, fname->crypto_buf.name, 4);
+ memcpy(&fname->minor_hash, fname->crypto_buf.name + 4, 4);
} else {
fname->disk_name.name = fname->crypto_buf.name;
fname->disk_name.len = fname->crypto_buf.len;
}
return 0;
+
errout:
- f2fs_fname_crypto_free_buffer(&fname->crypto_buf);
+ fscrypt_fname_free_buffer(&fname->crypto_buf);
return ret;
}
+EXPORT_SYMBOL(fscrypt_setup_filename);
-void f2fs_fname_free_filename(struct f2fs_filename *fname)
+void fscrypt_free_filename(struct fscrypt_name *fname)
{
kfree(fname->crypto_buf.name);
fname->crypto_buf.name = NULL;
fname->usr_fname = NULL;
fname->disk_name.name = NULL;
}
+EXPORT_SYMBOL(fscrypt_free_filename);
diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c
new file mode 100644
index 000000000000..06f5aa478bf2
--- /dev/null
+++ b/fs/crypto/keyinfo.c
@@ -0,0 +1,272 @@
+/*
+ * key management facility for FS encryption support.
+ *
+ * Copyright (C) 2015, Google, Inc.
+ *
+ * This contains encryption key functions.
+ *
+ * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015.
+ */
+
+#include <keys/encrypted-type.h>
+#include <keys/user-type.h>
+#include <linux/random.h>
+#include <linux/scatterlist.h>
+#include <uapi/linux/keyctl.h>
+#include <linux/fscrypto.h>
+
+static void derive_crypt_complete(struct crypto_async_request *req, int rc)
+{
+ struct fscrypt_completion_result *ecr = req->data;
+
+ if (rc == -EINPROGRESS)
+ return;
+
+ ecr->res = rc;
+ complete(&ecr->completion);
+}
+
+/**
+ * derive_key_aes() - Derive a key using AES-128-ECB
+ * @deriving_key: Encryption key used for derivation.
+ * @source_key: Source key to which to apply derivation.
+ * @derived_key: Derived key.
+ *
+ * Return: Zero on success; non-zero otherwise.
+ */
+static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE],
+ u8 source_key[FS_AES_256_XTS_KEY_SIZE],
+ u8 derived_key[FS_AES_256_XTS_KEY_SIZE])
+{
+ int res = 0;
+ struct skcipher_request *req = NULL;
+ DECLARE_FS_COMPLETION_RESULT(ecr);
+ struct scatterlist src_sg, dst_sg;
+ struct crypto_skcipher *tfm = crypto_alloc_skcipher("ecb(aes)", 0, 0);
+
+ if (IS_ERR(tfm)) {
+ res = PTR_ERR(tfm);
+ tfm = NULL;
+ goto out;
+ }
+ crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY);
+ req = skcipher_request_alloc(tfm, GFP_NOFS);
+ if (!req) {
+ res = -ENOMEM;
+ goto out;
+ }
+ skcipher_request_set_callback(req,
+ CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
+ derive_crypt_complete, &ecr);
+ res = crypto_skcipher_setkey(tfm, deriving_key,
+ FS_AES_128_ECB_KEY_SIZE);
+ if (res < 0)
+ goto out;
+
+ sg_init_one(&src_sg, source_key, FS_AES_256_XTS_KEY_SIZE);
+ sg_init_one(&dst_sg, derived_key, FS_AES_256_XTS_KEY_SIZE);
+ skcipher_request_set_crypt(req, &src_sg, &dst_sg,
+ FS_AES_256_XTS_KEY_SIZE, NULL);
+ res = crypto_skcipher_encrypt(req);
+ if (res == -EINPROGRESS || res == -EBUSY) {
+ wait_for_completion(&ecr.completion);
+ res = ecr.res;
+ }
+out:
+ skcipher_request_free(req);
+ crypto_free_skcipher(tfm);
+ return res;
+}
+
+static void put_crypt_info(struct fscrypt_info *ci)
+{
+ if (!ci)
+ return;
+
+ key_put(ci->ci_keyring_key);
+ crypto_free_skcipher(ci->ci_ctfm);
+ kmem_cache_free(fscrypt_info_cachep, ci);
+}
+
+int get_crypt_info(struct inode *inode)
+{
+ struct fscrypt_info *crypt_info;
+ u8 full_key_descriptor[FS_KEY_DESC_PREFIX_SIZE +
+ (FS_KEY_DESCRIPTOR_SIZE * 2) + 1];
+ struct key *keyring_key = NULL;
+ struct fscrypt_key *master_key;
+ struct fscrypt_context ctx;
+ const struct user_key_payload *ukp;
+ struct crypto_skcipher *ctfm;
+ const char *cipher_str;
+ u8 raw_key[FS_MAX_KEY_SIZE];
+ u8 mode;
+ int res;
+
+ res = fscrypt_initialize();
+ if (res)
+ return res;
+
+ if (!inode->i_sb->s_cop->get_context)
+ return -EOPNOTSUPP;
+retry:
+ crypt_info = ACCESS_ONCE(inode->i_crypt_info);
+ if (crypt_info) {
+ if (!crypt_info->ci_keyring_key ||
+ key_validate(crypt_info->ci_keyring_key) == 0)
+ return 0;
+ fscrypt_put_encryption_info(inode, crypt_info);
+ goto retry;
+ }
+
+ res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx));
+ if (res < 0) {
+ if (!fscrypt_dummy_context_enabled(inode))
+ return res;
+ ctx.contents_encryption_mode = FS_ENCRYPTION_MODE_AES_256_XTS;
+ ctx.filenames_encryption_mode = FS_ENCRYPTION_MODE_AES_256_CTS;
+ ctx.flags = 0;
+ } else if (res != sizeof(ctx)) {
+ return -EINVAL;
+ }
+ res = 0;
+
+ crypt_info = kmem_cache_alloc(fscrypt_info_cachep, GFP_NOFS);
+ if (!crypt_info)
+ return -ENOMEM;
+
+ crypt_info->ci_flags = ctx.flags;
+ crypt_info->ci_data_mode = ctx.contents_encryption_mode;
+ crypt_info->ci_filename_mode = ctx.filenames_encryption_mode;
+ crypt_info->ci_ctfm = NULL;
+ crypt_info->ci_keyring_key = NULL;
+ memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor,
+ sizeof(crypt_info->ci_master_key));
+ if (S_ISREG(inode->i_mode))
+ mode = crypt_info->ci_data_mode;
+ else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+ mode = crypt_info->ci_filename_mode;
+ else
+ BUG();
+
+ switch (mode) {
+ case FS_ENCRYPTION_MODE_AES_256_XTS:
+ cipher_str = "xts(aes)";
+ break;
+ case FS_ENCRYPTION_MODE_AES_256_CTS:
+ cipher_str = "cts(cbc(aes))";
+ break;
+ default:
+ printk_once(KERN_WARNING
+ "%s: unsupported key mode %d (ino %u)\n",
+ __func__, mode, (unsigned) inode->i_ino);
+ res = -ENOKEY;
+ goto out;
+ }
+ if (fscrypt_dummy_context_enabled(inode)) {
+ memset(raw_key, 0x42, FS_AES_256_XTS_KEY_SIZE);
+ goto got_key;
+ }
+ memcpy(full_key_descriptor, FS_KEY_DESC_PREFIX,
+ FS_KEY_DESC_PREFIX_SIZE);
+ sprintf(full_key_descriptor + FS_KEY_DESC_PREFIX_SIZE,
+ "%*phN", FS_KEY_DESCRIPTOR_SIZE,
+ ctx.master_key_descriptor);
+ full_key_descriptor[FS_KEY_DESC_PREFIX_SIZE +
+ (2 * FS_KEY_DESCRIPTOR_SIZE)] = '\0';
+ keyring_key = request_key(&key_type_logon, full_key_descriptor, NULL);
+ if (IS_ERR(keyring_key)) {
+ res = PTR_ERR(keyring_key);
+ keyring_key = NULL;
+ goto out;
+ }
+ crypt_info->ci_keyring_key = keyring_key;
+ if (keyring_key->type != &key_type_logon) {
+ printk_once(KERN_WARNING
+ "%s: key type must be logon\n", __func__);
+ res = -ENOKEY;
+ goto out;
+ }
+ down_read(&keyring_key->sem);
+ ukp = user_key_payload(keyring_key);
+ if (ukp->datalen != sizeof(struct fscrypt_key)) {
+ res = -EINVAL;
+ up_read(&keyring_key->sem);
+ goto out;
+ }
+ master_key = (struct fscrypt_key *)ukp->data;
+ BUILD_BUG_ON(FS_AES_128_ECB_KEY_SIZE != FS_KEY_DERIVATION_NONCE_SIZE);
+
+ if (master_key->size != FS_AES_256_XTS_KEY_SIZE) {
+ printk_once(KERN_WARNING
+ "%s: key size incorrect: %d\n",
+ __func__, master_key->size);
+ res = -ENOKEY;
+ up_read(&keyring_key->sem);
+ goto out;
+ }
+ res = derive_key_aes(ctx.nonce, master_key->raw, raw_key);
+ up_read(&keyring_key->sem);
+ if (res)
+ goto out;
+got_key:
+ ctfm = crypto_alloc_skcipher(cipher_str, 0, 0);
+ if (!ctfm || IS_ERR(ctfm)) {
+ res = ctfm ? PTR_ERR(ctfm) : -ENOMEM;
+ printk(KERN_DEBUG
+ "%s: error %d (inode %u) allocating crypto tfm\n",
+ __func__, res, (unsigned) inode->i_ino);
+ goto out;
+ }
+ crypt_info->ci_ctfm = ctfm;
+ crypto_skcipher_clear_flags(ctfm, ~0);
+ crypto_skcipher_set_flags(ctfm, CRYPTO_TFM_REQ_WEAK_KEY);
+ res = crypto_skcipher_setkey(ctfm, raw_key, fscrypt_key_size(mode));
+ if (res)
+ goto out;
+
+ memzero_explicit(raw_key, sizeof(raw_key));
+ if (cmpxchg(&inode->i_crypt_info, NULL, crypt_info) != NULL) {
+ put_crypt_info(crypt_info);
+ goto retry;
+ }
+ return 0;
+
+out:
+ if (res == -ENOKEY)
+ res = 0;
+ put_crypt_info(crypt_info);
+ memzero_explicit(raw_key, sizeof(raw_key));
+ return res;
+}
+
+void fscrypt_put_encryption_info(struct inode *inode, struct fscrypt_info *ci)
+{
+ struct fscrypt_info *prev;
+
+ if (ci == NULL)
+ ci = ACCESS_ONCE(inode->i_crypt_info);
+ if (ci == NULL)
+ return;
+
+ prev = cmpxchg(&inode->i_crypt_info, ci, NULL);
+ if (prev != ci)
+ return;
+
+ put_crypt_info(ci);
+}
+EXPORT_SYMBOL(fscrypt_put_encryption_info);
+
+int fscrypt_get_encryption_info(struct inode *inode)
+{
+ struct fscrypt_info *ci = inode->i_crypt_info;
+
+ if (!ci ||
+ (ci->ci_keyring_key &&
+ (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) |
+ (1 << KEY_FLAG_REVOKED) |
+ (1 << KEY_FLAG_DEAD)))))
+ return get_crypt_info(inode);
+ return 0;
+}
+EXPORT_SYMBOL(fscrypt_get_encryption_info);
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
new file mode 100644
index 000000000000..0f9961eede1e
--- /dev/null
+++ b/fs/crypto/policy.c
@@ -0,0 +1,229 @@
+/*
+ * Encryption policy functions for per-file encryption support.
+ *
+ * Copyright (C) 2015, Google, Inc.
+ * Copyright (C) 2015, Motorola Mobility.
+ *
+ * Written by Michael Halcrow, 2015.
+ * Modified by Jaegeuk Kim, 2015.
+ */
+
+#include <linux/random.h>
+#include <linux/string.h>
+#include <linux/fscrypto.h>
+
+static int inode_has_encryption_context(struct inode *inode)
+{
+ if (!inode->i_sb->s_cop->get_context)
+ return 0;
+ return (inode->i_sb->s_cop->get_context(inode, NULL, 0L) > 0);
+}
+
+/*
+ * check whether the policy is consistent with the encryption context
+ * for the inode
+ */
+static int is_encryption_context_consistent_with_policy(struct inode *inode,
+ const struct fscrypt_policy *policy)
+{
+ struct fscrypt_context ctx;
+ int res;
+
+ if (!inode->i_sb->s_cop->get_context)
+ return 0;
+
+ res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx));
+ if (res != sizeof(ctx))
+ return 0;
+
+ return (memcmp(ctx.master_key_descriptor, policy->master_key_descriptor,
+ FS_KEY_DESCRIPTOR_SIZE) == 0 &&
+ (ctx.flags == policy->flags) &&
+ (ctx.contents_encryption_mode ==
+ policy->contents_encryption_mode) &&
+ (ctx.filenames_encryption_mode ==
+ policy->filenames_encryption_mode));
+}
+
+static int create_encryption_context_from_policy(struct inode *inode,
+ const struct fscrypt_policy *policy)
+{
+ struct fscrypt_context ctx;
+ int res;
+
+ if (!inode->i_sb->s_cop->set_context)
+ return -EOPNOTSUPP;
+
+ if (inode->i_sb->s_cop->prepare_context) {
+ res = inode->i_sb->s_cop->prepare_context(inode);
+ if (res)
+ return res;
+ }
+
+ ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1;
+ memcpy(ctx.master_key_descriptor, policy->master_key_descriptor,
+ FS_KEY_DESCRIPTOR_SIZE);
+
+ if (!fscrypt_valid_contents_enc_mode(
+ policy->contents_encryption_mode)) {
+ printk(KERN_WARNING
+ "%s: Invalid contents encryption mode %d\n", __func__,
+ policy->contents_encryption_mode);
+ return -EINVAL;
+ }
+
+ if (!fscrypt_valid_filenames_enc_mode(
+ policy->filenames_encryption_mode)) {
+ printk(KERN_WARNING
+ "%s: Invalid filenames encryption mode %d\n", __func__,
+ policy->filenames_encryption_mode);
+ return -EINVAL;
+ }
+
+ if (policy->flags & ~FS_POLICY_FLAGS_VALID)
+ return -EINVAL;
+
+ ctx.contents_encryption_mode = policy->contents_encryption_mode;
+ ctx.filenames_encryption_mode = policy->filenames_encryption_mode;
+ ctx.flags = policy->flags;
+ BUILD_BUG_ON(sizeof(ctx.nonce) != FS_KEY_DERIVATION_NONCE_SIZE);
+ get_random_bytes(ctx.nonce, FS_KEY_DERIVATION_NONCE_SIZE);
+
+ return inode->i_sb->s_cop->set_context(inode, &ctx, sizeof(ctx), NULL);
+}
+
+int fscrypt_process_policy(struct inode *inode,
+ const struct fscrypt_policy *policy)
+{
+ if (policy->version != 0)
+ return -EINVAL;
+
+ if (!inode_has_encryption_context(inode)) {
+ if (!inode->i_sb->s_cop->empty_dir)
+ return -EOPNOTSUPP;
+ if (!inode->i_sb->s_cop->empty_dir(inode))
+ return -ENOTEMPTY;
+ return create_encryption_context_from_policy(inode, policy);
+ }
+
+ if (is_encryption_context_consistent_with_policy(inode, policy))
+ return 0;
+
+ printk(KERN_WARNING "%s: Policy inconsistent with encryption context\n",
+ __func__);
+ return -EINVAL;
+}
+EXPORT_SYMBOL(fscrypt_process_policy);
+
+int fscrypt_get_policy(struct inode *inode, struct fscrypt_policy *policy)
+{
+ struct fscrypt_context ctx;
+ int res;
+
+ if (!inode->i_sb->s_cop->get_context ||
+ !inode->i_sb->s_cop->is_encrypted(inode))
+ return -ENODATA;
+
+ res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx));
+ if (res != sizeof(ctx))
+ return -ENODATA;
+ if (ctx.format != FS_ENCRYPTION_CONTEXT_FORMAT_V1)
+ return -EINVAL;
+
+ policy->version = 0;
+ policy->contents_encryption_mode = ctx.contents_encryption_mode;
+ policy->filenames_encryption_mode = ctx.filenames_encryption_mode;
+ policy->flags = ctx.flags;
+ memcpy(&policy->master_key_descriptor, ctx.master_key_descriptor,
+ FS_KEY_DESCRIPTOR_SIZE);
+ return 0;
+}
+EXPORT_SYMBOL(fscrypt_get_policy);
+
+int fscrypt_has_permitted_context(struct inode *parent, struct inode *child)
+{
+ struct fscrypt_info *parent_ci, *child_ci;
+ int res;
+
+ if ((parent == NULL) || (child == NULL)) {
+ printk(KERN_ERR "parent %p child %p\n", parent, child);
+ BUG_ON(1);
+ }
+
+ /* no restrictions if the parent directory is not encrypted */
+ if (!parent->i_sb->s_cop->is_encrypted(parent))
+ return 1;
+ /* if the child directory is not encrypted, this is always a problem */
+ if (!parent->i_sb->s_cop->is_encrypted(child))
+ return 0;
+ res = fscrypt_get_encryption_info(parent);
+ if (res)
+ return 0;
+ res = fscrypt_get_encryption_info(child);
+ if (res)
+ return 0;
+ parent_ci = parent->i_crypt_info;
+ child_ci = child->i_crypt_info;
+ if (!parent_ci && !child_ci)
+ return 1;
+ if (!parent_ci || !child_ci)
+ return 0;
+
+ return (memcmp(parent_ci->ci_master_key,
+ child_ci->ci_master_key,
+ FS_KEY_DESCRIPTOR_SIZE) == 0 &&
+ (parent_ci->ci_data_mode == child_ci->ci_data_mode) &&
+ (parent_ci->ci_filename_mode == child_ci->ci_filename_mode) &&
+ (parent_ci->ci_flags == child_ci->ci_flags));
+}
+EXPORT_SYMBOL(fscrypt_has_permitted_context);
+
+/**
+ * fscrypt_inherit_context() - Sets a child context from its parent
+ * @parent: Parent inode from which the context is inherited.
+ * @child: Child inode that inherits the context from @parent.
+ * @fs_data: private data given by FS.
+ * @preload: preload child i_crypt_info
+ *
+ * Return: Zero on success, non-zero otherwise
+ */
+int fscrypt_inherit_context(struct inode *parent, struct inode *child,
+ void *fs_data, bool preload)
+{
+ struct fscrypt_context ctx;
+ struct fscrypt_info *ci;
+ int res;
+
+ if (!parent->i_sb->s_cop->set_context)
+ return -EOPNOTSUPP;
+
+ res = fscrypt_get_encryption_info(parent);
+ if (res < 0)
+ return res;
+
+ ci = parent->i_crypt_info;
+ if (ci == NULL)
+ return -ENOKEY;
+
+ ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1;
+ if (fscrypt_dummy_context_enabled(parent)) {
+ ctx.contents_encryption_mode = FS_ENCRYPTION_MODE_AES_256_XTS;
+ ctx.filenames_encryption_mode = FS_ENCRYPTION_MODE_AES_256_CTS;
+ ctx.flags = 0;
+ memset(ctx.master_key_descriptor, 0x42, FS_KEY_DESCRIPTOR_SIZE);
+ res = 0;
+ } else {
+ ctx.contents_encryption_mode = ci->ci_data_mode;
+ ctx.filenames_encryption_mode = ci->ci_filename_mode;
+ ctx.flags = ci->ci_flags;
+ memcpy(ctx.master_key_descriptor, ci->ci_master_key,
+ FS_KEY_DESCRIPTOR_SIZE);
+ }
+ get_random_bytes(ctx.nonce, FS_KEY_DERIVATION_NONCE_SIZE);
+ res = parent->i_sb->s_cop->set_context(child, &ctx,
+ sizeof(ctx), fs_data);
+ if (res)
+ return res;
+ return preload ? fscrypt_get_encryption_info(child): 0;
+}
+EXPORT_SYMBOL(fscrypt_inherit_context);
diff --git a/fs/dax.c b/fs/dax.c
index 711172450da6..90322eb7498c 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -286,8 +286,13 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
inode_unlock(inode);
- if ((retval > 0) && end_io)
- end_io(iocb, pos, retval, bh.b_private);
+ if (end_io) {
+ int err;
+
+ err = end_io(iocb, pos, retval, bh.b_private);
+ if (err)
+ retval = err;
+ }
if (!(flags & DIO_SKIP_DIO_COUNT))
inode_dio_end(inode);
@@ -1056,6 +1061,7 @@ EXPORT_SYMBOL_GPL(dax_pmd_fault);
int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
struct file *file = vma->vm_file;
+ int error;
/*
* We pass NO_SECTOR to dax_radix_entry() because we expect that a
@@ -1065,7 +1071,13 @@ int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
* saves us from having to make a call to get_block() here to look
* up the sector.
*/
- dax_radix_entry(file->f_mapping, vmf->pgoff, NO_SECTOR, false, true);
+ error = dax_radix_entry(file->f_mapping, vmf->pgoff, NO_SECTOR, false,
+ true);
+
+ if (error == -ENOMEM)
+ return VM_FAULT_OOM;
+ if (error)
+ return VM_FAULT_SIGBUS;
return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
diff --git a/fs/dcache.c b/fs/dcache.c
index 92d5140de851..32ceae3e6112 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -269,9 +269,6 @@ static inline int dname_external(const struct dentry *dentry)
return dentry->d_name.name != dentry->d_iname;
}
-/*
- * Make sure other CPUs see the inode attached before the type is set.
- */
static inline void __d_set_inode_and_type(struct dentry *dentry,
struct inode *inode,
unsigned type_flags)
@@ -279,28 +276,18 @@ static inline void __d_set_inode_and_type(struct dentry *dentry,
unsigned flags;
dentry->d_inode = inode;
- smp_wmb();
flags = READ_ONCE(dentry->d_flags);
flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU);
flags |= type_flags;
WRITE_ONCE(dentry->d_flags, flags);
}
-/*
- * Ideally, we want to make sure that other CPUs see the flags cleared before
- * the inode is detached, but this is really a violation of RCU principles
- * since the ordering suggests we should always set inode before flags.
- *
- * We should instead replace or discard the entire dentry - but that sucks
- * performancewise on mass deletion/rename.
- */
static inline void __d_clear_type_and_inode(struct dentry *dentry)
{
unsigned flags = READ_ONCE(dentry->d_flags);
flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU);
WRITE_ONCE(dentry->d_flags, flags);
- smp_wmb();
dentry->d_inode = NULL;
}
@@ -370,9 +357,11 @@ static void dentry_unlink_inode(struct dentry * dentry)
__releases(dentry->d_inode->i_lock)
{
struct inode *inode = dentry->d_inode;
+
+ raw_write_seqcount_begin(&dentry->d_seq);
__d_clear_type_and_inode(dentry);
hlist_del_init(&dentry->d_u.d_alias);
- dentry_rcuwalk_invalidate(dentry);
+ raw_write_seqcount_end(&dentry->d_seq);
spin_unlock(&dentry->d_lock);
spin_unlock(&inode->i_lock);
if (!inode->i_nlink)
@@ -1756,12 +1745,12 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode)
unsigned add_flags = d_flags_for_inode(inode);
spin_lock(&dentry->d_lock);
- if (inode)
- hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
+ hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
+ raw_write_seqcount_begin(&dentry->d_seq);
__d_set_inode_and_type(dentry, inode, add_flags);
- dentry_rcuwalk_invalidate(dentry);
+ raw_write_seqcount_end(&dentry->d_seq);
+ __fsnotify_d_instantiate(dentry);
spin_unlock(&dentry->d_lock);
- fsnotify_d_instantiate(dentry, inode);
}
/**
@@ -1782,91 +1771,16 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode)
void d_instantiate(struct dentry *entry, struct inode * inode)
{
BUG_ON(!hlist_unhashed(&entry->d_u.d_alias));
- if (inode)
+ if (inode) {
spin_lock(&inode->i_lock);
- __d_instantiate(entry, inode);
- if (inode)
+ __d_instantiate(entry, inode);
spin_unlock(&inode->i_lock);
+ }
security_d_instantiate(entry, inode);
}
EXPORT_SYMBOL(d_instantiate);
/**
- * d_instantiate_unique - instantiate a non-aliased dentry
- * @entry: dentry to instantiate
- * @inode: inode to attach to this dentry
- *
- * Fill in inode information in the entry. On success, it returns NULL.
- * If an unhashed alias of "entry" already exists, then we return the
- * aliased dentry instead and drop one reference to inode.
- *
- * Note that in order to avoid conflicts with rename() etc, the caller
- * had better be holding the parent directory semaphore.
- *
- * This also assumes that the inode count has been incremented
- * (or otherwise set) by the caller to indicate that it is now
- * in use by the dcache.
- */
-static struct dentry *__d_instantiate_unique(struct dentry *entry,
- struct inode *inode)
-{
- struct dentry *alias;
- int len = entry->d_name.len;
- const char *name = entry->d_name.name;
- unsigned int hash = entry->d_name.hash;
-
- if (!inode) {
- __d_instantiate(entry, NULL);
- return NULL;
- }
-
- hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
- /*
- * Don't need alias->d_lock here, because aliases with
- * d_parent == entry->d_parent are not subject to name or
- * parent changes, because the parent inode i_mutex is held.
- */
- if (alias->d_name.hash != hash)
- continue;
- if (alias->d_parent != entry->d_parent)
- continue;
- if (alias->d_name.len != len)
- continue;
- if (dentry_cmp(alias, name, len))
- continue;
- __dget(alias);
- return alias;
- }
-
- __d_instantiate(entry, inode);
- return NULL;
-}
-
-struct dentry *d_instantiate_unique(struct dentry *entry, struct inode *inode)
-{
- struct dentry *result;
-
- BUG_ON(!hlist_unhashed(&entry->d_u.d_alias));
-
- if (inode)
- spin_lock(&inode->i_lock);
- result = __d_instantiate_unique(entry, inode);
- if (inode)
- spin_unlock(&inode->i_lock);
-
- if (!result) {
- security_d_instantiate(entry, inode);
- return NULL;
- }
-
- BUG_ON(!d_unhashed(result));
- iput(inode);
- return result;
-}
-
-EXPORT_SYMBOL(d_instantiate_unique);
-
-/**
* d_instantiate_no_diralias - instantiate a non-aliased dentry
* @entry: dentry to complete
* @inode: inode to attach to this dentry
@@ -2446,6 +2360,86 @@ void d_rehash(struct dentry * entry)
}
EXPORT_SYMBOL(d_rehash);
+
+/* inode->i_lock held if inode is non-NULL */
+
+static inline void __d_add(struct dentry *dentry, struct inode *inode)
+{
+ if (inode) {
+ __d_instantiate(dentry, inode);
+ spin_unlock(&inode->i_lock);
+ }
+ security_d_instantiate(dentry, inode);
+ d_rehash(dentry);
+}
+
+/**
+ * d_add - add dentry to hash queues
+ * @entry: dentry to add
+ * @inode: The inode to attach to this dentry
+ *
+ * This adds the entry to the hash queues and initializes @inode.
+ * The entry was actually filled in earlier during d_alloc().
+ */
+
+void d_add(struct dentry *entry, struct inode *inode)
+{
+ if (inode)
+ spin_lock(&inode->i_lock);
+ __d_add(entry, inode);
+}
+EXPORT_SYMBOL(d_add);
+
+/**
+ * d_exact_alias - find and hash an exact unhashed alias
+ * @entry: dentry to add
+ * @inode: The inode to go with this dentry
+ *
+ * If an unhashed dentry with the same name/parent and desired
+ * inode already exists, hash and return it. Otherwise, return
+ * NULL.
+ *
+ * Parent directory should be locked.
+ */
+struct dentry *d_exact_alias(struct dentry *entry, struct inode *inode)
+{
+ struct dentry *alias;
+ int len = entry->d_name.len;
+ const char *name = entry->d_name.name;
+ unsigned int hash = entry->d_name.hash;
+
+ spin_lock(&inode->i_lock);
+ hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
+ /*
+ * Don't need alias->d_lock here, because aliases with
+ * d_parent == entry->d_parent are not subject to name or
+ * parent changes, because the parent inode i_mutex is held.
+ */
+ if (alias->d_name.hash != hash)
+ continue;
+ if (alias->d_parent != entry->d_parent)
+ continue;
+ if (alias->d_name.len != len)
+ continue;
+ if (dentry_cmp(alias, name, len))
+ continue;
+ spin_lock(&alias->d_lock);
+ if (!d_unhashed(alias)) {
+ spin_unlock(&alias->d_lock);
+ alias = NULL;
+ } else {
+ __dget_dlock(alias);
+ _d_rehash(alias);
+ spin_unlock(&alias->d_lock);
+ }
+ spin_unlock(&inode->i_lock);
+ return alias;
+ }
+ spin_unlock(&inode->i_lock);
+ return NULL;
+}
+EXPORT_SYMBOL(d_exact_alias);
+
/**
* dentry_update_name_case - update case insensitive dentry with a new name
* @dentry: dentry to be updated
@@ -2782,10 +2776,9 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
BUG_ON(!d_unhashed(dentry));
- if (!inode) {
- __d_instantiate(dentry, NULL);
+ if (!inode)
goto out;
- }
+
spin_lock(&inode->i_lock);
if (S_ISDIR(inode->i_mode)) {
struct dentry *new = __d_find_any_alias(inode);
@@ -2819,12 +2812,8 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
return new;
}
}
- /* already taking inode->i_lock, so d_add() by hand */
- __d_instantiate(dentry, inode);
- spin_unlock(&inode->i_lock);
out:
- security_d_instantiate(dentry, inode);
- d_rehash(dentry);
+ __d_add(dentry, inode);
return NULL;
}
EXPORT_SYMBOL(d_splice_alias);
diff --git a/fs/direct-io.c b/fs/direct-io.c
index d6a9012d42ad..476f1ecbd1f0 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -253,8 +253,13 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret,
if (ret == 0)
ret = transferred;
- if (dio->end_io && dio->result)
- dio->end_io(dio->iocb, offset, transferred, dio->private);
+ if (dio->end_io) {
+ int err;
+
+ err = dio->end_io(dio->iocb, offset, ret, dio->private);
+ if (err)
+ ret = err;
+ }
if (!(dio->flags & DIO_SKIP_DIO_COUNT))
inode_dio_end(dio->inode);
@@ -445,7 +450,8 @@ static struct bio *dio_await_one(struct dio *dio)
__set_current_state(TASK_UNINTERRUPTIBLE);
dio->waiter = current;
spin_unlock_irqrestore(&dio->bio_lock, flags);
- if (!blk_poll(bdev_get_queue(dio->bio_bdev), dio->bio_cookie))
+ if (!(dio->iocb->ki_flags & IOCB_HIPRI) ||
+ !blk_poll(bdev_get_queue(dio->bio_bdev), dio->bio_cookie))
io_schedule();
/* wake up sets us TASK_RUNNING */
spin_lock_irqsave(&dio->bio_lock, flags);
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 8e294fbbac39..1669f6291c95 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -343,24 +343,20 @@ static struct config_group *make_cluster(struct config_group *g,
struct dlm_cluster *cl = NULL;
struct dlm_spaces *sps = NULL;
struct dlm_comms *cms = NULL;
- void *gps = NULL;
cl = kzalloc(sizeof(struct dlm_cluster), GFP_NOFS);
- gps = kcalloc(3, sizeof(struct config_group *), GFP_NOFS);
sps = kzalloc(sizeof(struct dlm_spaces), GFP_NOFS);
cms = kzalloc(sizeof(struct dlm_comms), GFP_NOFS);
- if (!cl || !gps || !sps || !cms)
+ if (!cl || !sps || !cms)
goto fail;
config_group_init_type_name(&cl->group, name, &cluster_type);
config_group_init_type_name(&sps->ss_group, "spaces", &spaces_type);
config_group_init_type_name(&cms->cs_group, "comms", &comms_type);
- cl->group.default_groups = gps;
- cl->group.default_groups[0] = &sps->ss_group;
- cl->group.default_groups[1] = &cms->cs_group;
- cl->group.default_groups[2] = NULL;
+ configfs_add_default_group(&sps->ss_group, &cl->group);
+ configfs_add_default_group(&cms->cs_group, &cl->group);
cl->cl_tcp_port = dlm_config.ci_tcp_port;
cl->cl_buffer_size = dlm_config.ci_buffer_size;
@@ -383,7 +379,6 @@ static struct config_group *make_cluster(struct config_group *g,
fail:
kfree(cl);
- kfree(gps);
kfree(sps);
kfree(cms);
return ERR_PTR(-ENOMEM);
@@ -392,14 +387,8 @@ static struct config_group *make_cluster(struct config_group *g,
static void drop_cluster(struct config_group *g, struct config_item *i)
{
struct dlm_cluster *cl = config_item_to_cluster(i);
- struct config_item *tmp;
- int j;
- for (j = 0; cl->group.default_groups[j]; j++) {
- tmp = &cl->group.default_groups[j]->cg_item;
- cl->group.default_groups[j] = NULL;
- config_item_put(tmp);
- }
+ configfs_remove_default_groups(&cl->group);
space_list = NULL;
comm_list = NULL;
@@ -410,7 +399,6 @@ static void drop_cluster(struct config_group *g, struct config_item *i)
static void release_cluster(struct config_item *i)
{
struct dlm_cluster *cl = config_item_to_cluster(i);
- kfree(cl->group.default_groups);
kfree(cl);
}
@@ -418,21 +406,17 @@ static struct config_group *make_space(struct config_group *g, const char *name)
{
struct dlm_space *sp = NULL;
struct dlm_nodes *nds = NULL;
- void *gps = NULL;
sp = kzalloc(sizeof(struct dlm_space), GFP_NOFS);
- gps = kcalloc(2, sizeof(struct config_group *), GFP_NOFS);
nds = kzalloc(sizeof(struct dlm_nodes), GFP_NOFS);
- if (!sp || !gps || !nds)
+ if (!sp || !nds)
goto fail;
config_group_init_type_name(&sp->group, name, &space_type);
- config_group_init_type_name(&nds->ns_group, "nodes", &nodes_type);
- sp->group.default_groups = gps;
- sp->group.default_groups[0] = &nds->ns_group;
- sp->group.default_groups[1] = NULL;
+ config_group_init_type_name(&nds->ns_group, "nodes", &nodes_type);
+ configfs_add_default_group(&nds->ns_group, &sp->group);
INIT_LIST_HEAD(&sp->members);
mutex_init(&sp->members_lock);
@@ -441,7 +425,6 @@ static struct config_group *make_space(struct config_group *g, const char *name)
fail:
kfree(sp);
- kfree(gps);
kfree(nds);
return ERR_PTR(-ENOMEM);
}
@@ -449,24 +432,16 @@ static struct config_group *make_space(struct config_group *g, const char *name)
static void drop_space(struct config_group *g, struct config_item *i)
{
struct dlm_space *sp = config_item_to_space(i);
- struct config_item *tmp;
- int j;
/* assert list_empty(&sp->members) */
- for (j = 0; sp->group.default_groups[j]; j++) {
- tmp = &sp->group.default_groups[j]->cg_item;
- sp->group.default_groups[j] = NULL;
- config_item_put(tmp);
- }
-
+ configfs_remove_default_groups(&sp->group);
config_item_put(i);
}
static void release_space(struct config_item *i)
{
struct dlm_space *sp = config_item_to_space(i);
- kfree(sp->group.default_groups);
kfree(sp);
}
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 3a37bd3f9637..00640e70ed7a 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -124,7 +124,10 @@ struct connection {
struct connection *othercon;
struct work_struct rwork; /* Receive workqueue */
struct work_struct swork; /* Send workqueue */
- void (*orig_error_report)(struct sock *sk);
+ void (*orig_error_report)(struct sock *);
+ void (*orig_data_ready)(struct sock *);
+ void (*orig_state_change)(struct sock *);
+ void (*orig_write_space)(struct sock *);
};
#define sock2con(x) ((struct connection *)(x)->sk_user_data)
@@ -467,16 +470,24 @@ int dlm_lowcomms_connect_node(int nodeid)
static void lowcomms_error_report(struct sock *sk)
{
- struct connection *con = sock2con(sk);
+ struct connection *con;
struct sockaddr_storage saddr;
+ int buflen;
+ void (*orig_report)(struct sock *) = NULL;
- if (nodeid_to_addr(con->nodeid, &saddr, NULL, false)) {
+ read_lock_bh(&sk->sk_callback_lock);
+ con = sock2con(sk);
+ if (con == NULL)
+ goto out;
+
+ orig_report = con->orig_error_report;
+ if (con->sock == NULL ||
+ kernel_getpeername(con->sock, (struct sockaddr *)&saddr, &buflen)) {
printk_ratelimited(KERN_ERR "dlm: node %d: socket error "
"sending to node %d, port %d, "
"sk_err=%d/%d\n", dlm_our_nodeid(),
con->nodeid, dlm_config.ci_tcp_port,
sk->sk_err, sk->sk_err_soft);
- return;
} else if (saddr.ss_family == AF_INET) {
struct sockaddr_in *sin4 = (struct sockaddr_in *)&saddr;
@@ -499,22 +510,54 @@ static void lowcomms_error_report(struct sock *sk)
dlm_config.ci_tcp_port, sk->sk_err,
sk->sk_err_soft);
}
- con->orig_error_report(sk);
+out:
+ read_unlock_bh(&sk->sk_callback_lock);
+ if (orig_report)
+ orig_report(sk);
+}
+
+/* Note: sk_callback_lock must be locked before calling this function. */
+static void save_callbacks(struct connection *con, struct sock *sk)
+{
+ lock_sock(sk);
+ con->orig_data_ready = sk->sk_data_ready;
+ con->orig_state_change = sk->sk_state_change;
+ con->orig_write_space = sk->sk_write_space;
+ con->orig_error_report = sk->sk_error_report;
+ release_sock(sk);
+}
+
+static void restore_callbacks(struct connection *con, struct sock *sk)
+{
+ write_lock_bh(&sk->sk_callback_lock);
+ lock_sock(sk);
+ sk->sk_user_data = NULL;
+ sk->sk_data_ready = con->orig_data_ready;
+ sk->sk_state_change = con->orig_state_change;
+ sk->sk_write_space = con->orig_write_space;
+ sk->sk_error_report = con->orig_error_report;
+ release_sock(sk);
+ write_unlock_bh(&sk->sk_callback_lock);
}
/* Make a socket active */
static void add_sock(struct socket *sock, struct connection *con)
{
+ struct sock *sk = sock->sk;
+
+ write_lock_bh(&sk->sk_callback_lock);
con->sock = sock;
+ sk->sk_user_data = con;
+ if (!test_bit(CF_IS_OTHERCON, &con->flags))
+ save_callbacks(con, sk);
/* Install a data_ready callback */
- con->sock->sk->sk_data_ready = lowcomms_data_ready;
- con->sock->sk->sk_write_space = lowcomms_write_space;
- con->sock->sk->sk_state_change = lowcomms_state_change;
- con->sock->sk->sk_user_data = con;
- con->sock->sk->sk_allocation = GFP_NOFS;
- con->orig_error_report = con->sock->sk->sk_error_report;
- con->sock->sk->sk_error_report = lowcomms_error_report;
+ sk->sk_data_ready = lowcomms_data_ready;
+ sk->sk_write_space = lowcomms_write_space;
+ sk->sk_state_change = lowcomms_state_change;
+ sk->sk_allocation = GFP_NOFS;
+ sk->sk_error_report = lowcomms_error_report;
+ write_unlock_bh(&sk->sk_callback_lock);
}
/* Add the port number to an IPv6 or 4 sockaddr and return the address
@@ -549,6 +592,8 @@ static void close_connection(struct connection *con, bool and_other,
mutex_lock(&con->sock_mutex);
if (con->sock) {
+ if (!test_bit(CF_IS_OTHERCON, &con->flags))
+ restore_callbacks(con, con->sock->sk);
sock_release(con->sock);
con->sock = NULL;
}
@@ -1190,6 +1235,8 @@ static struct socket *tcp_create_listen_sock(struct connection *con,
if (result < 0) {
log_print("Failed to set SO_REUSEADDR on socket: %d", result);
}
+ sock->sk->sk_user_data = con;
+
con->rx_action = tcp_accept_from_sock;
con->connect_action = tcp_connect_to_sock;
@@ -1271,6 +1318,7 @@ static int sctp_listen_for_all(void)
if (result < 0)
log_print("Could not set SCTP NODELAY error %d\n", result);
+ write_lock_bh(&sock->sk->sk_callback_lock);
/* Init con struct */
sock->sk->sk_user_data = con;
con->sock = sock;
@@ -1278,6 +1326,8 @@ static int sctp_listen_for_all(void)
con->rx_action = sctp_accept_from_sock;
con->connect_action = sctp_connect_to_sock;
+ write_unlock_bh(&sock->sk->sk_callback_lock);
+
/* Bind to all addresses. */
if (sctp_bind_addrs(con, dlm_config.ci_tcp_port))
goto create_delsock;
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 80d6901493cf..64026e53722a 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -23,6 +23,8 @@
* 02111-1307, USA.
*/
+#include <crypto/hash.h>
+#include <crypto/skcipher.h>
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/pagemap.h>
@@ -30,7 +32,6 @@
#include <linux/compiler.h>
#include <linux/key.h>
#include <linux/namei.h>
-#include <linux/crypto.h>
#include <linux/file.h>
#include <linux/scatterlist.h>
#include <linux/slab.h>
@@ -74,6 +75,19 @@ void ecryptfs_from_hex(char *dst, char *src, int dst_size)
}
}
+static int ecryptfs_hash_digest(struct crypto_shash *tfm,
+ char *src, int len, char *dst)
+{
+ SHASH_DESC_ON_STACK(desc, tfm);
+ int err;
+
+ desc->tfm = tfm;
+ desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+ err = crypto_shash_digest(desc, src, len, dst);
+ shash_desc_zero(desc);
+ return err;
+}
+
/**
* ecryptfs_calculate_md5 - calculates the md5 of @src
* @dst: Pointer to 16 bytes of allocated memory
@@ -88,45 +102,26 @@ static int ecryptfs_calculate_md5(char *dst,
struct ecryptfs_crypt_stat *crypt_stat,
char *src, int len)
{
- struct scatterlist sg;
- struct hash_desc desc = {
- .tfm = crypt_stat->hash_tfm,
- .flags = CRYPTO_TFM_REQ_MAY_SLEEP
- };
+ struct crypto_shash *tfm;
int rc = 0;
mutex_lock(&crypt_stat->cs_hash_tfm_mutex);
- sg_init_one(&sg, (u8 *)src, len);
- if (!desc.tfm) {
- desc.tfm = crypto_alloc_hash(ECRYPTFS_DEFAULT_HASH, 0,
- CRYPTO_ALG_ASYNC);
- if (IS_ERR(desc.tfm)) {
- rc = PTR_ERR(desc.tfm);
+ tfm = crypt_stat->hash_tfm;
+ if (!tfm) {
+ tfm = crypto_alloc_shash(ECRYPTFS_DEFAULT_HASH, 0, 0);
+ if (IS_ERR(tfm)) {
+ rc = PTR_ERR(tfm);
ecryptfs_printk(KERN_ERR, "Error attempting to "
"allocate crypto context; rc = [%d]\n",
rc);
goto out;
}
- crypt_stat->hash_tfm = desc.tfm;
- }
- rc = crypto_hash_init(&desc);
- if (rc) {
- printk(KERN_ERR
- "%s: Error initializing crypto hash; rc = [%d]\n",
- __func__, rc);
- goto out;
+ crypt_stat->hash_tfm = tfm;
}
- rc = crypto_hash_update(&desc, &sg, len);
+ rc = ecryptfs_hash_digest(tfm, src, len, dst);
if (rc) {
printk(KERN_ERR
- "%s: Error updating crypto hash; rc = [%d]\n",
- __func__, rc);
- goto out;
- }
- rc = crypto_hash_final(&desc, dst);
- if (rc) {
- printk(KERN_ERR
- "%s: Error finalizing crypto hash; rc = [%d]\n",
+ "%s: Error computing crypto hash; rc = [%d]\n",
__func__, rc);
goto out;
}
@@ -234,10 +229,8 @@ void ecryptfs_destroy_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat)
{
struct ecryptfs_key_sig *key_sig, *key_sig_tmp;
- if (crypt_stat->tfm)
- crypto_free_ablkcipher(crypt_stat->tfm);
- if (crypt_stat->hash_tfm)
- crypto_free_hash(crypt_stat->hash_tfm);
+ crypto_free_skcipher(crypt_stat->tfm);
+ crypto_free_shash(crypt_stat->hash_tfm);
list_for_each_entry_safe(key_sig, key_sig_tmp,
&crypt_stat->keysig_list, crypt_stat_list) {
list_del(&key_sig->crypt_stat_list);
@@ -342,7 +335,7 @@ static int crypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
struct scatterlist *src_sg, int size,
unsigned char *iv, int op)
{
- struct ablkcipher_request *req = NULL;
+ struct skcipher_request *req = NULL;
struct extent_crypt_result ecr;
int rc = 0;
@@ -358,20 +351,20 @@ static int crypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
init_completion(&ecr.completion);
mutex_lock(&crypt_stat->cs_tfm_mutex);
- req = ablkcipher_request_alloc(crypt_stat->tfm, GFP_NOFS);
+ req = skcipher_request_alloc(crypt_stat->tfm, GFP_NOFS);
if (!req) {
mutex_unlock(&crypt_stat->cs_tfm_mutex);
rc = -ENOMEM;
goto out;
}
- ablkcipher_request_set_callback(req,
+ skcipher_request_set_callback(req,
CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
extent_crypt_complete, &ecr);
/* Consider doing this once, when the file is opened */
if (!(crypt_stat->flags & ECRYPTFS_KEY_SET)) {
- rc = crypto_ablkcipher_setkey(crypt_stat->tfm, crypt_stat->key,
- crypt_stat->key_size);
+ rc = crypto_skcipher_setkey(crypt_stat->tfm, crypt_stat->key,
+ crypt_stat->key_size);
if (rc) {
ecryptfs_printk(KERN_ERR,
"Error setting key; rc = [%d]\n",
@@ -383,9 +376,9 @@ static int crypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
crypt_stat->flags |= ECRYPTFS_KEY_SET;
}
mutex_unlock(&crypt_stat->cs_tfm_mutex);
- ablkcipher_request_set_crypt(req, src_sg, dst_sg, size, iv);
- rc = op == ENCRYPT ? crypto_ablkcipher_encrypt(req) :
- crypto_ablkcipher_decrypt(req);
+ skcipher_request_set_crypt(req, src_sg, dst_sg, size, iv);
+ rc = op == ENCRYPT ? crypto_skcipher_encrypt(req) :
+ crypto_skcipher_decrypt(req);
if (rc == -EINPROGRESS || rc == -EBUSY) {
struct extent_crypt_result *ecr = req->base.data;
@@ -394,7 +387,7 @@ static int crypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
reinit_completion(&ecr->completion);
}
out:
- ablkcipher_request_free(req);
+ skcipher_request_free(req);
return rc;
}
@@ -622,7 +615,7 @@ int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat)
crypt_stat->cipher, "cbc");
if (rc)
goto out_unlock;
- crypt_stat->tfm = crypto_alloc_ablkcipher(full_alg_name, 0, 0);
+ crypt_stat->tfm = crypto_alloc_skcipher(full_alg_name, 0, 0);
if (IS_ERR(crypt_stat->tfm)) {
rc = PTR_ERR(crypt_stat->tfm);
crypt_stat->tfm = NULL;
@@ -631,7 +624,7 @@ int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat)
full_alg_name);
goto out_free;
}
- crypto_ablkcipher_set_flags(crypt_stat->tfm, CRYPTO_TFM_REQ_WEAK_KEY);
+ crypto_skcipher_set_flags(crypt_stat->tfm, CRYPTO_TFM_REQ_WEAK_KEY);
rc = 0;
out_free:
kfree(full_alg_name);
@@ -1499,16 +1492,14 @@ out:
*/
static int
ecryptfs_encrypt_filename(struct ecryptfs_filename *filename,
- struct ecryptfs_crypt_stat *crypt_stat,
struct ecryptfs_mount_crypt_stat *mount_crypt_stat)
{
int rc = 0;
filename->encrypted_filename = NULL;
filename->encrypted_filename_size = 0;
- if ((crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCFN_USE_MOUNT_FNEK))
- || (mount_crypt_stat && (mount_crypt_stat->flags
- & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK))) {
+ if (mount_crypt_stat && (mount_crypt_stat->flags
+ & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK)) {
size_t packet_size;
size_t remaining_bytes;
@@ -1591,7 +1582,7 @@ out:
* event, regardless of whether this function succeeds for fails.
*/
static int
-ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm,
+ecryptfs_process_key_cipher(struct crypto_skcipher **key_tfm,
char *cipher_name, size_t *key_size)
{
char dummy_key[ECRYPTFS_MAX_KEY_BYTES];
@@ -1609,21 +1600,18 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm,
"ecb");
if (rc)
goto out;
- *key_tfm = crypto_alloc_blkcipher(full_alg_name, 0, CRYPTO_ALG_ASYNC);
+ *key_tfm = crypto_alloc_skcipher(full_alg_name, 0, CRYPTO_ALG_ASYNC);
if (IS_ERR(*key_tfm)) {
rc = PTR_ERR(*key_tfm);
printk(KERN_ERR "Unable to allocate crypto cipher with name "
"[%s]; rc = [%d]\n", full_alg_name, rc);
goto out;
}
- crypto_blkcipher_set_flags(*key_tfm, CRYPTO_TFM_REQ_WEAK_KEY);
- if (*key_size == 0) {
- struct blkcipher_alg *alg = crypto_blkcipher_alg(*key_tfm);
-
- *key_size = alg->max_keysize;
- }
+ crypto_skcipher_set_flags(*key_tfm, CRYPTO_TFM_REQ_WEAK_KEY);
+ if (*key_size == 0)
+ *key_size = crypto_skcipher_default_keysize(*key_tfm);
get_random_bytes(dummy_key, *key_size);
- rc = crypto_blkcipher_setkey(*key_tfm, dummy_key, *key_size);
+ rc = crypto_skcipher_setkey(*key_tfm, dummy_key, *key_size);
if (rc) {
printk(KERN_ERR "Error attempting to set key of size [%zd] for "
"cipher [%s]; rc = [%d]\n", *key_size, full_alg_name,
@@ -1660,8 +1648,7 @@ int ecryptfs_destroy_crypto(void)
list_for_each_entry_safe(key_tfm, key_tfm_tmp, &key_tfm_list,
key_tfm_list) {
list_del(&key_tfm->key_tfm_list);
- if (key_tfm->key_tfm)
- crypto_free_blkcipher(key_tfm->key_tfm);
+ crypto_free_skcipher(key_tfm->key_tfm);
kmem_cache_free(ecryptfs_key_tfm_cache, key_tfm);
}
mutex_unlock(&key_tfm_list_mutex);
@@ -1747,7 +1734,7 @@ int ecryptfs_tfm_exists(char *cipher_name, struct ecryptfs_key_tfm **key_tfm)
* Searches for cached item first, and creates new if not found.
* Returns 0 on success, non-zero if adding new cipher failed
*/
-int ecryptfs_get_tfm_and_mutex_for_cipher_name(struct crypto_blkcipher **tfm,
+int ecryptfs_get_tfm_and_mutex_for_cipher_name(struct crypto_skcipher **tfm,
struct mutex **tfm_mutex,
char *cipher_name)
{
@@ -1944,7 +1931,6 @@ out:
int ecryptfs_encrypt_and_encode_filename(
char **encoded_name,
size_t *encoded_name_size,
- struct ecryptfs_crypt_stat *crypt_stat,
struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
const char *name, size_t name_size)
{
@@ -1953,9 +1939,8 @@ int ecryptfs_encrypt_and_encode_filename(
(*encoded_name) = NULL;
(*encoded_name_size) = 0;
- if ((crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCRYPT_FILENAMES))
- || (mount_crypt_stat && (mount_crypt_stat->flags
- & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES))) {
+ if (mount_crypt_stat && (mount_crypt_stat->flags
+ & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)) {
struct ecryptfs_filename *filename;
filename = kzalloc(sizeof(*filename), GFP_KERNEL);
@@ -1968,8 +1953,7 @@ int ecryptfs_encrypt_and_encode_filename(
}
filename->filename = (char *)name;
filename->filename_size = name_size;
- rc = ecryptfs_encrypt_filename(filename, crypt_stat,
- mount_crypt_stat);
+ rc = ecryptfs_encrypt_filename(filename, mount_crypt_stat);
if (rc) {
printk(KERN_ERR "%s: Error attempting to encrypt "
"filename; rc = [%d]\n", __func__, rc);
@@ -1980,11 +1964,9 @@ int ecryptfs_encrypt_and_encode_filename(
NULL, &encoded_name_no_prefix_size,
filename->encrypted_filename,
filename->encrypted_filename_size);
- if ((crypt_stat && (crypt_stat->flags
- & ECRYPTFS_ENCFN_USE_MOUNT_FNEK))
- || (mount_crypt_stat
+ if (mount_crypt_stat
&& (mount_crypt_stat->flags
- & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK)))
+ & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK))
(*encoded_name_size) =
(ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE
+ encoded_name_no_prefix_size);
@@ -2002,11 +1984,9 @@ int ecryptfs_encrypt_and_encode_filename(
kfree(filename);
goto out;
}
- if ((crypt_stat && (crypt_stat->flags
- & ECRYPTFS_ENCFN_USE_MOUNT_FNEK))
- || (mount_crypt_stat
+ if (mount_crypt_stat
&& (mount_crypt_stat->flags
- & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK))) {
+ & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK)) {
memcpy((*encoded_name),
ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX,
ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE);
@@ -2120,7 +2100,7 @@ out:
int ecryptfs_set_f_namelen(long *namelen, long lower_namelen,
struct ecryptfs_mount_crypt_stat *mount_crypt_stat)
{
- struct blkcipher_desc desc;
+ struct crypto_skcipher *tfm;
struct mutex *tfm_mutex;
size_t cipher_blocksize;
int rc;
@@ -2130,7 +2110,7 @@ int ecryptfs_set_f_namelen(long *namelen, long lower_namelen,
return 0;
}
- rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&desc.tfm, &tfm_mutex,
+ rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&tfm, &tfm_mutex,
mount_crypt_stat->global_default_fn_cipher_name);
if (unlikely(rc)) {
(*namelen) = 0;
@@ -2138,7 +2118,7 @@ int ecryptfs_set_f_namelen(long *namelen, long lower_namelen,
}
mutex_lock(tfm_mutex);
- cipher_blocksize = crypto_blkcipher_blocksize(desc.tfm);
+ cipher_blocksize = crypto_skcipher_blocksize(tfm);
mutex_unlock(tfm_mutex);
/* Return an exact amount for the common cases */
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 7b39260c7bba..d123fbaa28e0 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -28,6 +28,7 @@
#ifndef ECRYPTFS_KERNEL_H
#define ECRYPTFS_KERNEL_H
+#include <crypto/skcipher.h>
#include <keys/user-type.h>
#include <keys/encrypted-type.h>
#include <linux/fs.h>
@@ -38,7 +39,6 @@
#include <linux/nsproxy.h>
#include <linux/backing-dev.h>
#include <linux/ecryptfs.h>
-#include <linux/crypto.h>
#define ECRYPTFS_DEFAULT_IV_BYTES 16
#define ECRYPTFS_DEFAULT_EXTENT_SIZE 4096
@@ -233,9 +233,9 @@ struct ecryptfs_crypt_stat {
size_t extent_shift;
unsigned int extent_mask;
struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
- struct crypto_ablkcipher *tfm;
- struct crypto_hash *hash_tfm; /* Crypto context for generating
- * the initialization vectors */
+ struct crypto_skcipher *tfm;
+ struct crypto_shash *hash_tfm; /* Crypto context for generating
+ * the initialization vectors */
unsigned char cipher[ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1];
unsigned char key[ECRYPTFS_MAX_KEY_BYTES];
unsigned char root_iv[ECRYPTFS_MAX_IV_BYTES];
@@ -309,7 +309,7 @@ struct ecryptfs_global_auth_tok {
* keeps a list of crypto API contexts around to use when needed.
*/
struct ecryptfs_key_tfm {
- struct crypto_blkcipher *key_tfm;
+ struct crypto_skcipher *key_tfm;
size_t key_size;
struct mutex key_tfm_mutex;
struct list_head key_tfm_list;
@@ -569,7 +569,6 @@ int ecryptfs_fill_zeros(struct file *file, loff_t new_length);
int ecryptfs_encrypt_and_encode_filename(
char **encoded_name,
size_t *encoded_name_size,
- struct ecryptfs_crypt_stat *crypt_stat,
struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
const char *name, size_t name_size);
struct dentry *ecryptfs_lower_dentry(struct dentry *this_dentry);
@@ -659,7 +658,7 @@ ecryptfs_add_new_key_tfm(struct ecryptfs_key_tfm **key_tfm, char *cipher_name,
int ecryptfs_init_crypto(void);
int ecryptfs_destroy_crypto(void);
int ecryptfs_tfm_exists(char *cipher_name, struct ecryptfs_key_tfm **key_tfm);
-int ecryptfs_get_tfm_and_mutex_for_cipher_name(struct crypto_blkcipher **tfm,
+int ecryptfs_get_tfm_and_mutex_for_cipher_name(struct crypto_skcipher **tfm,
struct mutex **tfm_mutex,
char *cipher_name);
int ecryptfs_keyring_auth_tok_for_sig(struct key **auth_tok_key,
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 4e685ac1024d..121114e9a464 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -29,7 +29,6 @@
#include <linux/dcache.h>
#include <linux/namei.h>
#include <linux/mount.h>
-#include <linux/crypto.h>
#include <linux/fs_stack.h>
#include <linux/slab.h>
#include <linux/xattr.h>
@@ -397,11 +396,9 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
int rc = 0;
lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent);
- inode_lock(d_inode(lower_dir_dentry));
- lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name,
+ lower_dentry = lookup_one_len_unlocked(ecryptfs_dentry->d_name.name,
lower_dir_dentry,
ecryptfs_dentry->d_name.len);
- inode_unlock(d_inode(lower_dir_dentry));
if (IS_ERR(lower_dentry)) {
rc = PTR_ERR(lower_dentry);
ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
@@ -419,18 +416,16 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
dput(lower_dentry);
rc = ecryptfs_encrypt_and_encode_filename(
&encrypted_and_encoded_name, &encrypted_and_encoded_name_size,
- NULL, mount_crypt_stat, ecryptfs_dentry->d_name.name,
+ mount_crypt_stat, ecryptfs_dentry->d_name.name,
ecryptfs_dentry->d_name.len);
if (rc) {
printk(KERN_ERR "%s: Error attempting to encrypt and encode "
"filename; rc = [%d]\n", __func__, rc);
goto out;
}
- inode_lock(d_inode(lower_dir_dentry));
- lower_dentry = lookup_one_len(encrypted_and_encoded_name,
+ lower_dentry = lookup_one_len_unlocked(encrypted_and_encoded_name,
lower_dir_dentry,
encrypted_and_encoded_name_size);
- inode_unlock(d_inode(lower_dir_dentry));
if (IS_ERR(lower_dentry)) {
rc = PTR_ERR(lower_dentry);
ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
@@ -502,7 +497,6 @@ static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry,
dir->i_sb)->mount_crypt_stat;
rc = ecryptfs_encrypt_and_encode_filename(&encoded_symname,
&encoded_symlen,
- NULL,
mount_crypt_stat, symname,
strlen(symname));
if (rc)
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 6bd67e2011f0..9893d1538122 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -25,11 +25,12 @@
* 02111-1307, USA.
*/
+#include <crypto/hash.h>
+#include <crypto/skcipher.h>
#include <linux/string.h>
#include <linux/pagemap.h>
#include <linux/key.h>
#include <linux/random.h>
-#include <linux/crypto.h>
#include <linux/scatterlist.h>
#include <linux/slab.h>
#include "ecryptfs_kernel.h"
@@ -601,12 +602,13 @@ struct ecryptfs_write_tag_70_packet_silly_stack {
struct ecryptfs_auth_tok *auth_tok;
struct scatterlist src_sg[2];
struct scatterlist dst_sg[2];
- struct blkcipher_desc desc;
+ struct crypto_skcipher *skcipher_tfm;
+ struct skcipher_request *skcipher_req;
char iv[ECRYPTFS_MAX_IV_BYTES];
char hash[ECRYPTFS_TAG_70_DIGEST_SIZE];
char tmp_hash[ECRYPTFS_TAG_70_DIGEST_SIZE];
- struct hash_desc hash_desc;
- struct scatterlist hash_sg;
+ struct crypto_shash *hash_tfm;
+ struct shash_desc *hash_desc;
};
/**
@@ -629,14 +631,12 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
struct key *auth_tok_key = NULL;
int rc = 0;
- s = kmalloc(sizeof(*s), GFP_KERNEL);
+ s = kzalloc(sizeof(*s), GFP_KERNEL);
if (!s) {
printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc "
"[%zd] bytes of kernel memory\n", __func__, sizeof(*s));
- rc = -ENOMEM;
- goto out;
+ return -ENOMEM;
}
- s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
(*packet_size) = 0;
rc = ecryptfs_find_auth_tok_for_sig(
&auth_tok_key,
@@ -649,7 +649,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
goto out;
}
rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(
- &s->desc.tfm,
+ &s->skcipher_tfm,
&s->tfm_mutex, mount_crypt_stat->global_default_fn_cipher_name);
if (unlikely(rc)) {
printk(KERN_ERR "Internal error whilst attempting to get "
@@ -658,7 +658,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
goto out;
}
mutex_lock(s->tfm_mutex);
- s->block_size = crypto_blkcipher_blocksize(s->desc.tfm);
+ s->block_size = crypto_skcipher_blocksize(s->skcipher_tfm);
/* Plus one for the \0 separator between the random prefix
* and the plaintext filename */
s->num_rand_bytes = (ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES + 1);
@@ -691,6 +691,19 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
rc = -EINVAL;
goto out_unlock;
}
+
+ s->skcipher_req = skcipher_request_alloc(s->skcipher_tfm, GFP_KERNEL);
+ if (!s->skcipher_req) {
+ printk(KERN_ERR "%s: Out of kernel memory whilst attempting to "
+ "skcipher_request_alloc for %s\n", __func__,
+ crypto_skcipher_driver_name(s->skcipher_tfm));
+ rc = -ENOMEM;
+ goto out_unlock;
+ }
+
+ skcipher_request_set_callback(s->skcipher_req,
+ CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL);
+
s->block_aligned_filename = kzalloc(s->block_aligned_filename_size,
GFP_KERNEL);
if (!s->block_aligned_filename) {
@@ -700,7 +713,6 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
rc = -ENOMEM;
goto out_unlock;
}
- s->i = 0;
dest[s->i++] = ECRYPTFS_TAG_70_PACKET_TYPE;
rc = ecryptfs_write_packet_length(&dest[s->i],
(ECRYPTFS_SIG_SIZE
@@ -738,40 +750,36 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
"password tokens\n", __func__);
goto out_free_unlock;
}
- sg_init_one(
- &s->hash_sg,
- (u8 *)s->auth_tok->token.password.session_key_encryption_key,
- s->auth_tok->token.password.session_key_encryption_key_bytes);
- s->hash_desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
- s->hash_desc.tfm = crypto_alloc_hash(ECRYPTFS_TAG_70_DIGEST, 0,
- CRYPTO_ALG_ASYNC);
- if (IS_ERR(s->hash_desc.tfm)) {
- rc = PTR_ERR(s->hash_desc.tfm);
+ s->hash_tfm = crypto_alloc_shash(ECRYPTFS_TAG_70_DIGEST, 0, 0);
+ if (IS_ERR(s->hash_tfm)) {
+ rc = PTR_ERR(s->hash_tfm);
printk(KERN_ERR "%s: Error attempting to "
"allocate hash crypto context; rc = [%d]\n",
__func__, rc);
goto out_free_unlock;
}
- rc = crypto_hash_init(&s->hash_desc);
- if (rc) {
- printk(KERN_ERR
- "%s: Error initializing crypto hash; rc = [%d]\n",
- __func__, rc);
- goto out_release_free_unlock;
- }
- rc = crypto_hash_update(
- &s->hash_desc, &s->hash_sg,
- s->auth_tok->token.password.session_key_encryption_key_bytes);
- if (rc) {
- printk(KERN_ERR
- "%s: Error updating crypto hash; rc = [%d]\n",
- __func__, rc);
+
+ s->hash_desc = kmalloc(sizeof(*s->hash_desc) +
+ crypto_shash_descsize(s->hash_tfm), GFP_KERNEL);
+ if (!s->hash_desc) {
+ printk(KERN_ERR "%s: Out of kernel memory whilst attempting to "
+ "kmalloc [%zd] bytes\n", __func__,
+ sizeof(*s->hash_desc) +
+ crypto_shash_descsize(s->hash_tfm));
+ rc = -ENOMEM;
goto out_release_free_unlock;
}
- rc = crypto_hash_final(&s->hash_desc, s->hash);
+
+ s->hash_desc->tfm = s->hash_tfm;
+ s->hash_desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+
+ rc = crypto_shash_digest(s->hash_desc,
+ (u8 *)s->auth_tok->token.password.session_key_encryption_key,
+ s->auth_tok->token.password.session_key_encryption_key_bytes,
+ s->hash);
if (rc) {
printk(KERN_ERR
- "%s: Error finalizing crypto hash; rc = [%d]\n",
+ "%s: Error computing crypto hash; rc = [%d]\n",
__func__, rc);
goto out_release_free_unlock;
}
@@ -780,27 +788,12 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
s->hash[(s->j % ECRYPTFS_TAG_70_DIGEST_SIZE)];
if ((s->j % ECRYPTFS_TAG_70_DIGEST_SIZE)
== (ECRYPTFS_TAG_70_DIGEST_SIZE - 1)) {
- sg_init_one(&s->hash_sg, (u8 *)s->hash,
- ECRYPTFS_TAG_70_DIGEST_SIZE);
- rc = crypto_hash_init(&s->hash_desc);
- if (rc) {
- printk(KERN_ERR
- "%s: Error initializing crypto hash; "
- "rc = [%d]\n", __func__, rc);
- goto out_release_free_unlock;
- }
- rc = crypto_hash_update(&s->hash_desc, &s->hash_sg,
- ECRYPTFS_TAG_70_DIGEST_SIZE);
+ rc = crypto_shash_digest(s->hash_desc, (u8 *)s->hash,
+ ECRYPTFS_TAG_70_DIGEST_SIZE,
+ s->tmp_hash);
if (rc) {
printk(KERN_ERR
- "%s: Error updating crypto hash; "
- "rc = [%d]\n", __func__, rc);
- goto out_release_free_unlock;
- }
- rc = crypto_hash_final(&s->hash_desc, s->tmp_hash);
- if (rc) {
- printk(KERN_ERR
- "%s: Error finalizing crypto hash; "
+ "%s: Error computing crypto hash; "
"rc = [%d]\n", __func__, rc);
goto out_release_free_unlock;
}
@@ -834,10 +827,8 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
* of the IV here, so we just use 0's for the IV. Note the
* constraint that ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES
* >= ECRYPTFS_MAX_IV_BYTES. */
- memset(s->iv, 0, ECRYPTFS_MAX_IV_BYTES);
- s->desc.info = s->iv;
- rc = crypto_blkcipher_setkey(
- s->desc.tfm,
+ rc = crypto_skcipher_setkey(
+ s->skcipher_tfm,
s->auth_tok->token.password.session_key_encryption_key,
mount_crypt_stat->global_default_fn_cipher_key_bytes);
if (rc < 0) {
@@ -850,8 +841,9 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
mount_crypt_stat->global_default_fn_cipher_key_bytes);
goto out_release_free_unlock;
}
- rc = crypto_blkcipher_encrypt_iv(&s->desc, s->dst_sg, s->src_sg,
- s->block_aligned_filename_size);
+ skcipher_request_set_crypt(s->skcipher_req, s->src_sg, s->dst_sg,
+ s->block_aligned_filename_size, s->iv);
+ rc = crypto_skcipher_encrypt(s->skcipher_req);
if (rc) {
printk(KERN_ERR "%s: Error attempting to encrypt filename; "
"rc = [%d]\n", __func__, rc);
@@ -861,7 +853,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
(*packet_size) = s->i;
(*remaining_bytes) -= (*packet_size);
out_release_free_unlock:
- crypto_free_hash(s->hash_desc.tfm);
+ crypto_free_shash(s->hash_tfm);
out_free_unlock:
kzfree(s->block_aligned_filename);
out_unlock:
@@ -871,6 +863,8 @@ out:
up_write(&(auth_tok_key->sem));
key_put(auth_tok_key);
}
+ skcipher_request_free(s->skcipher_req);
+ kzfree(s->hash_desc);
kfree(s);
return rc;
}
@@ -888,7 +882,8 @@ struct ecryptfs_parse_tag_70_packet_silly_stack {
struct ecryptfs_auth_tok *auth_tok;
struct scatterlist src_sg[2];
struct scatterlist dst_sg[2];
- struct blkcipher_desc desc;
+ struct crypto_skcipher *skcipher_tfm;
+ struct skcipher_request *skcipher_req;
char fnek_sig_hex[ECRYPTFS_SIG_SIZE_HEX + 1];
char iv[ECRYPTFS_MAX_IV_BYTES];
char cipher_string[ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1];
@@ -922,14 +917,12 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
(*packet_size) = 0;
(*filename_size) = 0;
(*filename) = NULL;
- s = kmalloc(sizeof(*s), GFP_KERNEL);
+ s = kzalloc(sizeof(*s), GFP_KERNEL);
if (!s) {
printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc "
"[%zd] bytes of kernel memory\n", __func__, sizeof(*s));
- rc = -ENOMEM;
- goto out;
+ return -ENOMEM;
}
- s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
if (max_packet_size < ECRYPTFS_TAG_70_MIN_METADATA_SIZE) {
printk(KERN_WARNING "%s: max_packet_size is [%zd]; it must be "
"at least [%d]\n", __func__, max_packet_size,
@@ -992,7 +985,7 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
rc);
goto out;
}
- rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&s->desc.tfm,
+ rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&s->skcipher_tfm,
&s->tfm_mutex,
s->cipher_string);
if (unlikely(rc)) {
@@ -1030,12 +1023,23 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
__func__, rc, s->block_aligned_filename_size);
goto out_free_unlock;
}
+
+ s->skcipher_req = skcipher_request_alloc(s->skcipher_tfm, GFP_KERNEL);
+ if (!s->skcipher_req) {
+ printk(KERN_ERR "%s: Out of kernel memory whilst attempting to "
+ "skcipher_request_alloc for %s\n", __func__,
+ crypto_skcipher_driver_name(s->skcipher_tfm));
+ rc = -ENOMEM;
+ goto out_free_unlock;
+ }
+
+ skcipher_request_set_callback(s->skcipher_req,
+ CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL);
+
/* The characters in the first block effectively do the job of
* the IV here, so we just use 0's for the IV. Note the
* constraint that ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES
* >= ECRYPTFS_MAX_IV_BYTES. */
- memset(s->iv, 0, ECRYPTFS_MAX_IV_BYTES);
- s->desc.info = s->iv;
/* TODO: Support other key modules than passphrase for
* filename encryption */
if (s->auth_tok->token_type != ECRYPTFS_PASSWORD) {
@@ -1044,8 +1048,8 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
"password tokens\n", __func__);
goto out_free_unlock;
}
- rc = crypto_blkcipher_setkey(
- s->desc.tfm,
+ rc = crypto_skcipher_setkey(
+ s->skcipher_tfm,
s->auth_tok->token.password.session_key_encryption_key,
mount_crypt_stat->global_default_fn_cipher_key_bytes);
if (rc < 0) {
@@ -1058,14 +1062,14 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
mount_crypt_stat->global_default_fn_cipher_key_bytes);
goto out_free_unlock;
}
- rc = crypto_blkcipher_decrypt_iv(&s->desc, s->dst_sg, s->src_sg,
- s->block_aligned_filename_size);
+ skcipher_request_set_crypt(s->skcipher_req, s->src_sg, s->dst_sg,
+ s->block_aligned_filename_size, s->iv);
+ rc = crypto_skcipher_decrypt(s->skcipher_req);
if (rc) {
printk(KERN_ERR "%s: Error attempting to decrypt filename; "
"rc = [%d]\n", __func__, rc);
goto out_free_unlock;
}
- s->i = 0;
while (s->decrypted_filename[s->i] != '\0'
&& s->i < s->block_aligned_filename_size)
s->i++;
@@ -1108,6 +1112,7 @@ out:
up_write(&(auth_tok_key->sem));
key_put(auth_tok_key);
}
+ skcipher_request_free(s->skcipher_req);
kfree(s);
return rc;
}
@@ -1667,9 +1672,8 @@ decrypt_passphrase_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok,
struct scatterlist dst_sg[2];
struct scatterlist src_sg[2];
struct mutex *tfm_mutex;
- struct blkcipher_desc desc = {
- .flags = CRYPTO_TFM_REQ_MAY_SLEEP
- };
+ struct crypto_skcipher *tfm;
+ struct skcipher_request *req = NULL;
int rc = 0;
if (unlikely(ecryptfs_verbosity > 0)) {
@@ -1680,7 +1684,7 @@ decrypt_passphrase_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok,
auth_tok->token.password.session_key_encryption_key,
auth_tok->token.password.session_key_encryption_key_bytes);
}
- rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&desc.tfm, &tfm_mutex,
+ rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&tfm, &tfm_mutex,
crypt_stat->cipher);
if (unlikely(rc)) {
printk(KERN_ERR "Internal error whilst attempting to get "
@@ -1711,8 +1715,20 @@ decrypt_passphrase_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok,
goto out;
}
mutex_lock(tfm_mutex);
- rc = crypto_blkcipher_setkey(
- desc.tfm, auth_tok->token.password.session_key_encryption_key,
+ req = skcipher_request_alloc(tfm, GFP_KERNEL);
+ if (!req) {
+ mutex_unlock(tfm_mutex);
+ printk(KERN_ERR "%s: Out of kernel memory whilst attempting to "
+ "skcipher_request_alloc for %s\n", __func__,
+ crypto_skcipher_driver_name(tfm));
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP,
+ NULL, NULL);
+ rc = crypto_skcipher_setkey(
+ tfm, auth_tok->token.password.session_key_encryption_key,
crypt_stat->key_size);
if (unlikely(rc < 0)) {
mutex_unlock(tfm_mutex);
@@ -1720,8 +1736,10 @@ decrypt_passphrase_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok,
rc = -EINVAL;
goto out;
}
- rc = crypto_blkcipher_decrypt(&desc, dst_sg, src_sg,
- auth_tok->session_key.encrypted_key_size);
+ skcipher_request_set_crypt(req, src_sg, dst_sg,
+ auth_tok->session_key.encrypted_key_size,
+ NULL);
+ rc = crypto_skcipher_decrypt(req);
mutex_unlock(tfm_mutex);
if (unlikely(rc)) {
printk(KERN_ERR "Error decrypting; rc = [%d]\n", rc);
@@ -1738,6 +1756,7 @@ decrypt_passphrase_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok,
crypt_stat->key_size);
}
out:
+ skcipher_request_free(req);
return rc;
}
@@ -2191,16 +2210,14 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
size_t max_packet_size;
struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
crypt_stat->mount_crypt_stat;
- struct blkcipher_desc desc = {
- .tfm = NULL,
- .flags = CRYPTO_TFM_REQ_MAY_SLEEP
- };
+ struct crypto_skcipher *tfm;
+ struct skcipher_request *req;
int rc = 0;
(*packet_size) = 0;
ecryptfs_from_hex(key_rec->sig, auth_tok->token.password.signature,
ECRYPTFS_SIG_SIZE);
- rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&desc.tfm, &tfm_mutex,
+ rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&tfm, &tfm_mutex,
crypt_stat->cipher);
if (unlikely(rc)) {
printk(KERN_ERR "Internal error whilst attempting to get "
@@ -2209,12 +2226,11 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
goto out;
}
if (mount_crypt_stat->global_default_cipher_key_size == 0) {
- struct blkcipher_alg *alg = crypto_blkcipher_alg(desc.tfm);
-
printk(KERN_WARNING "No key size specified at mount; "
- "defaulting to [%d]\n", alg->max_keysize);
+ "defaulting to [%d]\n",
+ crypto_skcipher_default_keysize(tfm));
mount_crypt_stat->global_default_cipher_key_size =
- alg->max_keysize;
+ crypto_skcipher_default_keysize(tfm);
}
if (crypt_stat->key_size == 0)
crypt_stat->key_size =
@@ -2284,20 +2300,36 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
goto out;
}
mutex_lock(tfm_mutex);
- rc = crypto_blkcipher_setkey(desc.tfm, session_key_encryption_key,
- crypt_stat->key_size);
+ rc = crypto_skcipher_setkey(tfm, session_key_encryption_key,
+ crypt_stat->key_size);
if (rc < 0) {
mutex_unlock(tfm_mutex);
ecryptfs_printk(KERN_ERR, "Error setting key for crypto "
"context; rc = [%d]\n", rc);
goto out;
}
+
+ req = skcipher_request_alloc(tfm, GFP_KERNEL);
+ if (!req) {
+ mutex_unlock(tfm_mutex);
+ ecryptfs_printk(KERN_ERR, "Out of kernel memory whilst "
+ "attempting to skcipher_request_alloc for "
+ "%s\n", crypto_skcipher_driver_name(tfm));
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP,
+ NULL, NULL);
+
rc = 0;
ecryptfs_printk(KERN_DEBUG, "Encrypting [%zd] bytes of the key\n",
crypt_stat->key_size);
- rc = crypto_blkcipher_encrypt(&desc, dst_sg, src_sg,
- (*key_rec).enc_key_size);
+ skcipher_request_set_crypt(req, src_sg, dst_sg,
+ (*key_rec).enc_key_size, NULL);
+ rc = crypto_skcipher_encrypt(req);
mutex_unlock(tfm_mutex);
+ skcipher_request_free(req);
if (rc) {
printk(KERN_ERR "Error encrypting; rc = [%d]\n", rc);
goto out;
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index e25b6b06bacf..8b0b4a73116d 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -29,7 +29,6 @@
#include <linux/module.h>
#include <linux/namei.h>
#include <linux/skbuff.h>
-#include <linux/crypto.h>
#include <linux/mount.h>
#include <linux/pagemap.h>
#include <linux/key.h>
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index c6ced4cbf0cf..1f5865263b3e 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -30,7 +30,6 @@
#include <linux/page-flags.h>
#include <linux/mount.h>
#include <linux/file.h>
-#include <linux/crypto.h>
#include <linux/scatterlist.h>
#include <linux/slab.h>
#include <asm/unaligned.h>
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index afa1b81c3418..77a486d3a51b 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -29,7 +29,6 @@
#include <linux/slab.h>
#include <linux/seq_file.h>
#include <linux/file.h>
-#include <linux/crypto.h>
#include <linux/statfs.h>
#include <linux/magic.h>
#include "ecryptfs_kernel.h"
diff --git a/fs/eventfd.c b/fs/eventfd.c
index ed70cf9fdc7b..1231cd1999d8 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -121,8 +121,46 @@ static unsigned int eventfd_poll(struct file *file, poll_table *wait)
u64 count;
poll_wait(file, &ctx->wqh, wait);
- smp_rmb();
- count = ctx->count;
+
+ /*
+ * All writes to ctx->count occur within ctx->wqh.lock. This read
+ * can be done outside ctx->wqh.lock because we know that poll_wait
+ * takes that lock (through add_wait_queue) if our caller will sleep.
+ *
+ * The read _can_ therefore seep into add_wait_queue's critical
+ * section, but cannot move above it! add_wait_queue's spin_lock acts
+ * as an acquire barrier and ensures that the read be ordered properly
+ * against the writes. The following CAN happen and is safe:
+ *
+ * poll write
+ * ----------------- ------------
+ * lock ctx->wqh.lock (in poll_wait)
+ * count = ctx->count
+ * __add_wait_queue
+ * unlock ctx->wqh.lock
+ * lock ctx->qwh.lock
+ * ctx->count += n
+ * if (waitqueue_active)
+ * wake_up_locked_poll
+ * unlock ctx->qwh.lock
+ * eventfd_poll returns 0
+ *
+ * but the following, which would miss a wakeup, cannot happen:
+ *
+ * poll write
+ * ----------------- ------------
+ * count = ctx->count (INVALID!)
+ * lock ctx->qwh.lock
+ * ctx->count += n
+ * **waitqueue_active is false**
+ * **no wake_up_locked_poll!**
+ * unlock ctx->qwh.lock
+ * lock ctx->wqh.lock (in poll_wait)
+ * __add_wait_queue
+ * unlock ctx->wqh.lock
+ * eventfd_poll returns 0
+ */
+ count = READ_ONCE(ctx->count);
if (count > 0)
events |= POLLIN;
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index cde60741cad2..8a74a2a52e0f 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1616,7 +1616,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
{
int res = 0, eavail, timed_out = 0;
unsigned long flags;
- long slack = 0;
+ u64 slack = 0;
wait_queue_t wait;
ktime_t expires, *to = NULL;
diff --git a/fs/exec.c b/fs/exec.c
index dcd4ac7d3f1e..c4010b8207a1 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -56,6 +56,7 @@
#include <linux/pipe_fs_i.h>
#include <linux/oom.h>
#include <linux/compat.h>
+#include <linux/vmalloc.h>
#include <asm/uaccess.h>
#include <asm/mmu_context.h>
@@ -198,8 +199,12 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
return NULL;
}
#endif
- ret = get_user_pages(current, bprm->mm, pos,
- 1, write, 1, &page, NULL);
+ /*
+ * We are doing an exec(). 'current' is the process
+ * doing the exec and bprm->mm is the new process's mm.
+ */
+ ret = get_user_pages_remote(current, bprm->mm, pos, 1, write,
+ 1, &page, NULL);
if (ret <= 0)
return NULL;
@@ -831,6 +836,97 @@ int kernel_read(struct file *file, loff_t offset,
EXPORT_SYMBOL(kernel_read);
+int kernel_read_file(struct file *file, void **buf, loff_t *size,
+ loff_t max_size, enum kernel_read_file_id id)
+{
+ loff_t i_size, pos;
+ ssize_t bytes = 0;
+ int ret;
+
+ if (!S_ISREG(file_inode(file)->i_mode) || max_size < 0)
+ return -EINVAL;
+
+ ret = security_kernel_read_file(file, id);
+ if (ret)
+ return ret;
+
+ i_size = i_size_read(file_inode(file));
+ if (max_size > 0 && i_size > max_size)
+ return -EFBIG;
+ if (i_size <= 0)
+ return -EINVAL;
+
+ *buf = vmalloc(i_size);
+ if (!*buf)
+ return -ENOMEM;
+
+ pos = 0;
+ while (pos < i_size) {
+ bytes = kernel_read(file, pos, (char *)(*buf) + pos,
+ i_size - pos);
+ if (bytes < 0) {
+ ret = bytes;
+ goto out;
+ }
+
+ if (bytes == 0)
+ break;
+ pos += bytes;
+ }
+
+ if (pos != i_size) {
+ ret = -EIO;
+ goto out;
+ }
+
+ ret = security_kernel_post_read_file(file, *buf, i_size, id);
+ if (!ret)
+ *size = pos;
+
+out:
+ if (ret < 0) {
+ vfree(*buf);
+ *buf = NULL;
+ }
+ return ret;
+}
+EXPORT_SYMBOL_GPL(kernel_read_file);
+
+int kernel_read_file_from_path(char *path, void **buf, loff_t *size,
+ loff_t max_size, enum kernel_read_file_id id)
+{
+ struct file *file;
+ int ret;
+
+ if (!path || !*path)
+ return -EINVAL;
+
+ file = filp_open(path, O_RDONLY, 0);
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+
+ ret = kernel_read_file(file, buf, size, max_size, id);
+ fput(file);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(kernel_read_file_from_path);
+
+int kernel_read_file_from_fd(int fd, void **buf, loff_t *size, loff_t max_size,
+ enum kernel_read_file_id id)
+{
+ struct fd f = fdget(fd);
+ int ret = -EBADF;
+
+ if (!f.file)
+ goto out;
+
+ ret = kernel_read_file(f.file, buf, size, max_size, id);
+out:
+ fdput(f);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(kernel_read_file_from_fd);
+
ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len)
{
ssize_t res = vfs_read(file, (void __user *)addr, len, &pos);
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 4c69c94cafd8..170939f379d7 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -61,6 +61,8 @@ struct ext2_block_alloc_info {
#define rsv_start rsv_window._rsv_start
#define rsv_end rsv_window._rsv_end
+struct mb_cache;
+
/*
* second extended-fs super-block data in memory
*/
@@ -111,6 +113,7 @@ struct ext2_sb_info {
* of the mount options.
*/
spinlock_t s_lock;
+ struct mb_cache *s_mb_cache;
};
static inline spinlock_t *
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 2a188413a2b0..b78caf25f746 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -131,7 +131,10 @@ static void ext2_put_super (struct super_block * sb)
dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
- ext2_xattr_put_super(sb);
+ if (sbi->s_mb_cache) {
+ ext2_xattr_destroy_cache(sbi->s_mb_cache);
+ sbi->s_mb_cache = NULL;
+ }
if (!(sb->s_flags & MS_RDONLY)) {
struct ext2_super_block *es = sbi->s_es;
@@ -1104,6 +1107,14 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
ext2_msg(sb, KERN_ERR, "error: insufficient memory");
goto failed_mount3;
}
+
+#ifdef CONFIG_EXT2_FS_XATTR
+ sbi->s_mb_cache = ext2_xattr_create_cache();
+ if (!sbi->s_mb_cache) {
+ ext2_msg(sb, KERN_ERR, "Failed to create an mb_cache");
+ goto failed_mount3;
+ }
+#endif
/*
* set up enough so that it can read an inode
*/
@@ -1149,6 +1160,8 @@ cantfind_ext2:
sb->s_id);
goto failed_mount;
failed_mount3:
+ if (sbi->s_mb_cache)
+ ext2_xattr_destroy_cache(sbi->s_mb_cache);
percpu_counter_destroy(&sbi->s_freeblocks_counter);
percpu_counter_destroy(&sbi->s_freeinodes_counter);
percpu_counter_destroy(&sbi->s_dirs_counter);
@@ -1555,20 +1568,17 @@ MODULE_ALIAS_FS("ext2");
static int __init init_ext2_fs(void)
{
- int err = init_ext2_xattr();
- if (err)
- return err;
+ int err;
+
err = init_inodecache();
if (err)
- goto out1;
+ return err;
err = register_filesystem(&ext2_fs_type);
if (err)
goto out;
return 0;
out:
destroy_inodecache();
-out1:
- exit_ext2_xattr();
return err;
}
@@ -1576,7 +1586,6 @@ static void __exit exit_ext2_fs(void)
{
unregister_filesystem(&ext2_fs_type);
destroy_inodecache();
- exit_ext2_xattr();
}
MODULE_AUTHOR("Remy Card and others");
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index f57a7aba32eb..1a5e3bff0b63 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -90,14 +90,12 @@
static int ext2_xattr_set2(struct inode *, struct buffer_head *,
struct ext2_xattr_header *);
-static int ext2_xattr_cache_insert(struct buffer_head *);
+static int ext2_xattr_cache_insert(struct mb_cache *, struct buffer_head *);
static struct buffer_head *ext2_xattr_cache_find(struct inode *,
struct ext2_xattr_header *);
static void ext2_xattr_rehash(struct ext2_xattr_header *,
struct ext2_xattr_entry *);
-static struct mb_cache *ext2_xattr_cache;
-
static const struct xattr_handler *ext2_xattr_handler_map[] = {
[EXT2_XATTR_INDEX_USER] = &ext2_xattr_user_handler,
#ifdef CONFIG_EXT2_FS_POSIX_ACL
@@ -152,6 +150,7 @@ ext2_xattr_get(struct inode *inode, int name_index, const char *name,
size_t name_len, size;
char *end;
int error;
+ struct mb_cache *ext2_mb_cache = EXT2_SB(inode->i_sb)->s_mb_cache;
ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld",
name_index, name, buffer, (long)buffer_size);
@@ -196,7 +195,7 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_get",
goto found;
entry = next;
}
- if (ext2_xattr_cache_insert(bh))
+ if (ext2_xattr_cache_insert(ext2_mb_cache, bh))
ea_idebug(inode, "cache insert failed");
error = -ENODATA;
goto cleanup;
@@ -209,7 +208,7 @@ found:
le16_to_cpu(entry->e_value_offs) + size > inode->i_sb->s_blocksize)
goto bad_block;
- if (ext2_xattr_cache_insert(bh))
+ if (ext2_xattr_cache_insert(ext2_mb_cache, bh))
ea_idebug(inode, "cache insert failed");
if (buffer) {
error = -ERANGE;
@@ -247,6 +246,7 @@ ext2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
char *end;
size_t rest = buffer_size;
int error;
+ struct mb_cache *ext2_mb_cache = EXT2_SB(inode->i_sb)->s_mb_cache;
ea_idebug(inode, "buffer=%p, buffer_size=%ld",
buffer, (long)buffer_size);
@@ -281,7 +281,7 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_list",
goto bad_block;
entry = next;
}
- if (ext2_xattr_cache_insert(bh))
+ if (ext2_xattr_cache_insert(ext2_mb_cache, bh))
ea_idebug(inode, "cache insert failed");
/* list the attribute names */
@@ -483,22 +483,23 @@ bad_block: ext2_error(sb, "ext2_xattr_set",
/* Here we know that we can set the new attribute. */
if (header) {
- struct mb_cache_entry *ce;
-
/* assert(header == HDR(bh)); */
- ce = mb_cache_entry_get(ext2_xattr_cache, bh->b_bdev,
- bh->b_blocknr);
lock_buffer(bh);
if (header->h_refcount == cpu_to_le32(1)) {
+ __u32 hash = le32_to_cpu(header->h_hash);
+
ea_bdebug(bh, "modifying in-place");
- if (ce)
- mb_cache_entry_free(ce);
+ /*
+ * This must happen under buffer lock for
+ * ext2_xattr_set2() to reliably detect modified block
+ */
+ mb_cache_entry_delete_block(EXT2_SB(sb)->s_mb_cache,
+ hash, bh->b_blocknr);
+
/* keep the buffer locked while modifying it. */
} else {
int offset;
- if (ce)
- mb_cache_entry_release(ce);
unlock_buffer(bh);
ea_bdebug(bh, "cloning");
header = kmalloc(bh->b_size, GFP_KERNEL);
@@ -626,6 +627,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
struct super_block *sb = inode->i_sb;
struct buffer_head *new_bh = NULL;
int error;
+ struct mb_cache *ext2_mb_cache = EXT2_SB(sb)->s_mb_cache;
if (header) {
new_bh = ext2_xattr_cache_find(inode, header);
@@ -653,7 +655,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
don't need to change the reference count. */
new_bh = old_bh;
get_bh(new_bh);
- ext2_xattr_cache_insert(new_bh);
+ ext2_xattr_cache_insert(ext2_mb_cache, new_bh);
} else {
/* We need to allocate a new block */
ext2_fsblk_t goal = ext2_group_first_block_no(sb,
@@ -674,7 +676,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
memcpy(new_bh->b_data, header, new_bh->b_size);
set_buffer_uptodate(new_bh);
unlock_buffer(new_bh);
- ext2_xattr_cache_insert(new_bh);
+ ext2_xattr_cache_insert(ext2_mb_cache, new_bh);
ext2_xattr_update_super_block(sb);
}
@@ -707,19 +709,21 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
error = 0;
if (old_bh && old_bh != new_bh) {
- struct mb_cache_entry *ce;
-
/*
* If there was an old block and we are no longer using it,
* release the old block.
*/
- ce = mb_cache_entry_get(ext2_xattr_cache, old_bh->b_bdev,
- old_bh->b_blocknr);
lock_buffer(old_bh);
if (HDR(old_bh)->h_refcount == cpu_to_le32(1)) {
+ __u32 hash = le32_to_cpu(HDR(old_bh)->h_hash);
+
+ /*
+ * This must happen under buffer lock for
+ * ext2_xattr_set2() to reliably detect freed block
+ */
+ mb_cache_entry_delete_block(ext2_mb_cache,
+ hash, old_bh->b_blocknr);
/* Free the old block. */
- if (ce)
- mb_cache_entry_free(ce);
ea_bdebug(old_bh, "freeing");
ext2_free_blocks(inode, old_bh->b_blocknr, 1);
mark_inode_dirty(inode);
@@ -730,8 +734,6 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
} else {
/* Decrement the refcount only. */
le32_add_cpu(&HDR(old_bh)->h_refcount, -1);
- if (ce)
- mb_cache_entry_release(ce);
dquot_free_block_nodirty(inode, 1);
mark_inode_dirty(inode);
mark_buffer_dirty(old_bh);
@@ -757,7 +759,6 @@ void
ext2_xattr_delete_inode(struct inode *inode)
{
struct buffer_head *bh = NULL;
- struct mb_cache_entry *ce;
down_write(&EXT2_I(inode)->xattr_sem);
if (!EXT2_I(inode)->i_file_acl)
@@ -777,19 +778,22 @@ ext2_xattr_delete_inode(struct inode *inode)
EXT2_I(inode)->i_file_acl);
goto cleanup;
}
- ce = mb_cache_entry_get(ext2_xattr_cache, bh->b_bdev, bh->b_blocknr);
lock_buffer(bh);
if (HDR(bh)->h_refcount == cpu_to_le32(1)) {
- if (ce)
- mb_cache_entry_free(ce);
+ __u32 hash = le32_to_cpu(HDR(bh)->h_hash);
+
+ /*
+ * This must happen under buffer lock for ext2_xattr_set2() to
+ * reliably detect freed block
+ */
+ mb_cache_entry_delete_block(EXT2_SB(inode->i_sb)->s_mb_cache,
+ hash, bh->b_blocknr);
ext2_free_blocks(inode, EXT2_I(inode)->i_file_acl, 1);
get_bh(bh);
bforget(bh);
unlock_buffer(bh);
} else {
le32_add_cpu(&HDR(bh)->h_refcount, -1);
- if (ce)
- mb_cache_entry_release(ce);
ea_bdebug(bh, "refcount now=%d",
le32_to_cpu(HDR(bh)->h_refcount));
unlock_buffer(bh);
@@ -806,18 +810,6 @@ cleanup:
}
/*
- * ext2_xattr_put_super()
- *
- * This is called when a file system is unmounted.
- */
-void
-ext2_xattr_put_super(struct super_block *sb)
-{
- mb_cache_shrink(sb->s_bdev);
-}
-
-
-/*
* ext2_xattr_cache_insert()
*
* Create a new entry in the extended attribute cache, and insert
@@ -826,28 +818,20 @@ ext2_xattr_put_super(struct super_block *sb)
* Returns 0, or a negative error number on failure.
*/
static int
-ext2_xattr_cache_insert(struct buffer_head *bh)
+ext2_xattr_cache_insert(struct mb_cache *cache, struct buffer_head *bh)
{
__u32 hash = le32_to_cpu(HDR(bh)->h_hash);
- struct mb_cache_entry *ce;
int error;
- ce = mb_cache_entry_alloc(ext2_xattr_cache, GFP_NOFS);
- if (!ce)
- return -ENOMEM;
- error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash);
+ error = mb_cache_entry_create(cache, GFP_NOFS, hash, bh->b_blocknr, 1);
if (error) {
- mb_cache_entry_free(ce);
if (error == -EBUSY) {
ea_bdebug(bh, "already in cache (%d cache entries)",
atomic_read(&ext2_xattr_cache->c_entry_count));
error = 0;
}
- } else {
- ea_bdebug(bh, "inserting [%x] (%d cache entries)", (int)hash,
- atomic_read(&ext2_xattr_cache->c_entry_count));
- mb_cache_entry_release(ce);
- }
+ } else
+ ea_bdebug(bh, "inserting [%x]", (int)hash);
return error;
}
@@ -904,22 +888,16 @@ ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header)
{
__u32 hash = le32_to_cpu(header->h_hash);
struct mb_cache_entry *ce;
+ struct mb_cache *ext2_mb_cache = EXT2_SB(inode->i_sb)->s_mb_cache;
if (!header->h_hash)
return NULL; /* never share */
ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
again:
- ce = mb_cache_entry_find_first(ext2_xattr_cache, inode->i_sb->s_bdev,
- hash);
+ ce = mb_cache_entry_find_first(ext2_mb_cache, hash);
while (ce) {
struct buffer_head *bh;
- if (IS_ERR(ce)) {
- if (PTR_ERR(ce) == -EAGAIN)
- goto again;
- break;
- }
-
bh = sb_bread(inode->i_sb, ce->e_block);
if (!bh) {
ext2_error(inode->i_sb, "ext2_xattr_cache_find",
@@ -927,7 +905,21 @@ again:
inode->i_ino, (unsigned long) ce->e_block);
} else {
lock_buffer(bh);
- if (le32_to_cpu(HDR(bh)->h_refcount) >
+ /*
+ * We have to be careful about races with freeing or
+ * rehashing of xattr block. Once we hold buffer lock
+ * xattr block's state is stable so we can check
+ * whether the block got freed / rehashed or not.
+ * Since we unhash mbcache entry under buffer lock when
+ * freeing / rehashing xattr block, checking whether
+ * entry is still hashed is reliable.
+ */
+ if (hlist_bl_unhashed(&ce->e_hash_list)) {
+ mb_cache_entry_put(ext2_mb_cache, ce);
+ unlock_buffer(bh);
+ brelse(bh);
+ goto again;
+ } else if (le32_to_cpu(HDR(bh)->h_refcount) >
EXT2_XATTR_REFCOUNT_MAX) {
ea_idebug(inode, "block %ld refcount %d>%d",
(unsigned long) ce->e_block,
@@ -936,13 +928,14 @@ again:
} else if (!ext2_xattr_cmp(header, HDR(bh))) {
ea_bdebug(bh, "b_count=%d",
atomic_read(&(bh->b_count)));
- mb_cache_entry_release(ce);
+ mb_cache_entry_touch(ext2_mb_cache, ce);
+ mb_cache_entry_put(ext2_mb_cache, ce);
return bh;
}
unlock_buffer(bh);
brelse(bh);
}
- ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash);
+ ce = mb_cache_entry_find_next(ext2_mb_cache, ce);
}
return NULL;
}
@@ -1015,17 +1008,15 @@ static void ext2_xattr_rehash(struct ext2_xattr_header *header,
#undef BLOCK_HASH_SHIFT
-int __init
-init_ext2_xattr(void)
+#define HASH_BUCKET_BITS 10
+
+struct mb_cache *ext2_xattr_create_cache(void)
{
- ext2_xattr_cache = mb_cache_create("ext2_xattr", 6);
- if (!ext2_xattr_cache)
- return -ENOMEM;
- return 0;
+ return mb_cache_create(HASH_BUCKET_BITS);
}
-void
-exit_ext2_xattr(void)
+void ext2_xattr_destroy_cache(struct mb_cache *cache)
{
- mb_cache_destroy(ext2_xattr_cache);
+ if (cache)
+ mb_cache_destroy(cache);
}
diff --git a/fs/ext2/xattr.h b/fs/ext2/xattr.h
index 60edf298644e..6f82ab1b00ca 100644
--- a/fs/ext2/xattr.h
+++ b/fs/ext2/xattr.h
@@ -53,6 +53,8 @@ struct ext2_xattr_entry {
#define EXT2_XATTR_SIZE(size) \
(((size) + EXT2_XATTR_ROUND) & ~EXT2_XATTR_ROUND)
+struct mb_cache;
+
# ifdef CONFIG_EXT2_FS_XATTR
extern const struct xattr_handler ext2_xattr_user_handler;
@@ -65,10 +67,9 @@ extern int ext2_xattr_get(struct inode *, int, const char *, void *, size_t);
extern int ext2_xattr_set(struct inode *, int, const char *, const void *, size_t, int);
extern void ext2_xattr_delete_inode(struct inode *);
-extern void ext2_xattr_put_super(struct super_block *);
-extern int init_ext2_xattr(void);
-extern void exit_ext2_xattr(void);
+extern struct mb_cache *ext2_xattr_create_cache(void);
+extern void ext2_xattr_destroy_cache(struct mb_cache *cache);
extern const struct xattr_handler *ext2_xattr_handlers[];
@@ -93,19 +94,7 @@ ext2_xattr_delete_inode(struct inode *inode)
{
}
-static inline void
-ext2_xattr_put_super(struct super_block *sb)
-{
-}
-
-static inline int
-init_ext2_xattr(void)
-{
- return 0;
-}
-
-static inline void
-exit_ext2_xattr(void)
+static inline void ext2_xattr_destroy_cache(struct mb_cache *cache)
{
}
diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c
index 38f7562489bb..edc053a81914 100644
--- a/fs/ext4/crypto.c
+++ b/fs/ext4/crypto.c
@@ -18,11 +18,9 @@
* Special Publication 800-38E and IEEE P1619/D16.
*/
-#include <crypto/hash.h>
-#include <crypto/sha.h>
+#include <crypto/skcipher.h>
#include <keys/user-type.h>
#include <keys/encrypted-type.h>
-#include <linux/crypto.h>
#include <linux/ecryptfs.h>
#include <linux/gfp.h>
#include <linux/kernel.h>
@@ -261,21 +259,21 @@ static int ext4_page_crypto(struct inode *inode,
{
u8 xts_tweak[EXT4_XTS_TWEAK_SIZE];
- struct ablkcipher_request *req = NULL;
+ struct skcipher_request *req = NULL;
DECLARE_EXT4_COMPLETION_RESULT(ecr);
struct scatterlist dst, src;
struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info;
- struct crypto_ablkcipher *tfm = ci->ci_ctfm;
+ struct crypto_skcipher *tfm = ci->ci_ctfm;
int res = 0;
- req = ablkcipher_request_alloc(tfm, GFP_NOFS);
+ req = skcipher_request_alloc(tfm, GFP_NOFS);
if (!req) {
printk_ratelimited(KERN_ERR
"%s: crypto_request_alloc() failed\n",
__func__);
return -ENOMEM;
}
- ablkcipher_request_set_callback(
+ skcipher_request_set_callback(
req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
ext4_crypt_complete, &ecr);
@@ -288,21 +286,21 @@ static int ext4_page_crypto(struct inode *inode,
sg_set_page(&dst, dest_page, PAGE_CACHE_SIZE, 0);
sg_init_table(&src, 1);
sg_set_page(&src, src_page, PAGE_CACHE_SIZE, 0);
- ablkcipher_request_set_crypt(req, &src, &dst, PAGE_CACHE_SIZE,
- xts_tweak);
+ skcipher_request_set_crypt(req, &src, &dst, PAGE_CACHE_SIZE,
+ xts_tweak);
if (rw == EXT4_DECRYPT)
- res = crypto_ablkcipher_decrypt(req);
+ res = crypto_skcipher_decrypt(req);
else
- res = crypto_ablkcipher_encrypt(req);
+ res = crypto_skcipher_encrypt(req);
if (res == -EINPROGRESS || res == -EBUSY) {
wait_for_completion(&ecr.completion);
res = ecr.res;
}
- ablkcipher_request_free(req);
+ skcipher_request_free(req);
if (res) {
printk_ratelimited(
KERN_ERR
- "%s: crypto_ablkcipher_encrypt() returned %d\n",
+ "%s: crypto_skcipher_encrypt() returned %d\n",
__func__, res);
return res;
}
diff --git a/fs/ext4/crypto_fname.c b/fs/ext4/crypto_fname.c
index 2fbef8a14760..1a2f360405db 100644
--- a/fs/ext4/crypto_fname.c
+++ b/fs/ext4/crypto_fname.c
@@ -11,11 +11,9 @@
*
*/
-#include <crypto/hash.h>
-#include <crypto/sha.h>
+#include <crypto/skcipher.h>
#include <keys/encrypted-type.h>
#include <keys/user-type.h>
-#include <linux/crypto.h>
#include <linux/gfp.h>
#include <linux/kernel.h>
#include <linux/key.h>
@@ -65,10 +63,10 @@ static int ext4_fname_encrypt(struct inode *inode,
struct ext4_str *oname)
{
u32 ciphertext_len;
- struct ablkcipher_request *req = NULL;
+ struct skcipher_request *req = NULL;
DECLARE_EXT4_COMPLETION_RESULT(ecr);
struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info;
- struct crypto_ablkcipher *tfm = ci->ci_ctfm;
+ struct crypto_skcipher *tfm = ci->ci_ctfm;
int res = 0;
char iv[EXT4_CRYPTO_BLOCK_SIZE];
struct scatterlist src_sg, dst_sg;
@@ -95,14 +93,14 @@ static int ext4_fname_encrypt(struct inode *inode,
}
/* Allocate request */
- req = ablkcipher_request_alloc(tfm, GFP_NOFS);
+ req = skcipher_request_alloc(tfm, GFP_NOFS);
if (!req) {
printk_ratelimited(
KERN_ERR "%s: crypto_request_alloc() failed\n", __func__);
kfree(alloc_buf);
return -ENOMEM;
}
- ablkcipher_request_set_callback(req,
+ skcipher_request_set_callback(req,
CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
ext4_dir_crypt_complete, &ecr);
@@ -117,14 +115,14 @@ static int ext4_fname_encrypt(struct inode *inode,
/* Create encryption request */
sg_init_one(&src_sg, workbuf, ciphertext_len);
sg_init_one(&dst_sg, oname->name, ciphertext_len);
- ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv);
- res = crypto_ablkcipher_encrypt(req);
+ skcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv);
+ res = crypto_skcipher_encrypt(req);
if (res == -EINPROGRESS || res == -EBUSY) {
wait_for_completion(&ecr.completion);
res = ecr.res;
}
kfree(alloc_buf);
- ablkcipher_request_free(req);
+ skcipher_request_free(req);
if (res < 0) {
printk_ratelimited(
KERN_ERR "%s: Error (error code %d)\n", __func__, res);
@@ -145,11 +143,11 @@ static int ext4_fname_decrypt(struct inode *inode,
struct ext4_str *oname)
{
struct ext4_str tmp_in[2], tmp_out[1];
- struct ablkcipher_request *req = NULL;
+ struct skcipher_request *req = NULL;
DECLARE_EXT4_COMPLETION_RESULT(ecr);
struct scatterlist src_sg, dst_sg;
struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info;
- struct crypto_ablkcipher *tfm = ci->ci_ctfm;
+ struct crypto_skcipher *tfm = ci->ci_ctfm;
int res = 0;
char iv[EXT4_CRYPTO_BLOCK_SIZE];
unsigned lim = max_name_len(inode);
@@ -162,13 +160,13 @@ static int ext4_fname_decrypt(struct inode *inode,
tmp_out[0].name = oname->name;
/* Allocate request */
- req = ablkcipher_request_alloc(tfm, GFP_NOFS);
+ req = skcipher_request_alloc(tfm, GFP_NOFS);
if (!req) {
printk_ratelimited(
KERN_ERR "%s: crypto_request_alloc() failed\n", __func__);
return -ENOMEM;
}
- ablkcipher_request_set_callback(req,
+ skcipher_request_set_callback(req,
CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
ext4_dir_crypt_complete, &ecr);
@@ -178,13 +176,13 @@ static int ext4_fname_decrypt(struct inode *inode,
/* Create encryption request */
sg_init_one(&src_sg, iname->name, iname->len);
sg_init_one(&dst_sg, oname->name, oname->len);
- ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv);
- res = crypto_ablkcipher_decrypt(req);
+ skcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv);
+ res = crypto_skcipher_decrypt(req);
if (res == -EINPROGRESS || res == -EBUSY) {
wait_for_completion(&ecr.completion);
res = ecr.res;
}
- ablkcipher_request_free(req);
+ skcipher_request_free(req);
if (res < 0) {
printk_ratelimited(
KERN_ERR "%s: Error in ext4_fname_encrypt (error code %d)\n",
diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c
index 9a16d1e75a49..0129d688d1f7 100644
--- a/fs/ext4/crypto_key.c
+++ b/fs/ext4/crypto_key.c
@@ -8,6 +8,7 @@
* Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015.
*/
+#include <crypto/skcipher.h>
#include <keys/encrypted-type.h>
#include <keys/user-type.h>
#include <linux/random.h>
@@ -41,45 +42,42 @@ static int ext4_derive_key_aes(char deriving_key[EXT4_AES_128_ECB_KEY_SIZE],
char derived_key[EXT4_AES_256_XTS_KEY_SIZE])
{
int res = 0;
- struct ablkcipher_request *req = NULL;
+ struct skcipher_request *req = NULL;
DECLARE_EXT4_COMPLETION_RESULT(ecr);
struct scatterlist src_sg, dst_sg;
- struct crypto_ablkcipher *tfm = crypto_alloc_ablkcipher("ecb(aes)", 0,
- 0);
+ struct crypto_skcipher *tfm = crypto_alloc_skcipher("ecb(aes)", 0, 0);
if (IS_ERR(tfm)) {
res = PTR_ERR(tfm);
tfm = NULL;
goto out;
}
- crypto_ablkcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY);
- req = ablkcipher_request_alloc(tfm, GFP_NOFS);
+ crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY);
+ req = skcipher_request_alloc(tfm, GFP_NOFS);
if (!req) {
res = -ENOMEM;
goto out;
}
- ablkcipher_request_set_callback(req,
+ skcipher_request_set_callback(req,
CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
derive_crypt_complete, &ecr);
- res = crypto_ablkcipher_setkey(tfm, deriving_key,
- EXT4_AES_128_ECB_KEY_SIZE);
+ res = crypto_skcipher_setkey(tfm, deriving_key,
+ EXT4_AES_128_ECB_KEY_SIZE);
if (res < 0)
goto out;
sg_init_one(&src_sg, source_key, EXT4_AES_256_XTS_KEY_SIZE);
sg_init_one(&dst_sg, derived_key, EXT4_AES_256_XTS_KEY_SIZE);
- ablkcipher_request_set_crypt(req, &src_sg, &dst_sg,
- EXT4_AES_256_XTS_KEY_SIZE, NULL);
- res = crypto_ablkcipher_encrypt(req);
+ skcipher_request_set_crypt(req, &src_sg, &dst_sg,
+ EXT4_AES_256_XTS_KEY_SIZE, NULL);
+ res = crypto_skcipher_encrypt(req);
if (res == -EINPROGRESS || res == -EBUSY) {
wait_for_completion(&ecr.completion);
res = ecr.res;
}
out:
- if (req)
- ablkcipher_request_free(req);
- if (tfm)
- crypto_free_ablkcipher(tfm);
+ skcipher_request_free(req);
+ crypto_free_skcipher(tfm);
return res;
}
@@ -90,7 +88,7 @@ void ext4_free_crypt_info(struct ext4_crypt_info *ci)
if (ci->ci_keyring_key)
key_put(ci->ci_keyring_key);
- crypto_free_ablkcipher(ci->ci_ctfm);
+ crypto_free_skcipher(ci->ci_ctfm);
kmem_cache_free(ext4_crypt_info_cachep, ci);
}
@@ -122,7 +120,7 @@ int _ext4_get_encryption_info(struct inode *inode)
struct ext4_encryption_context ctx;
const struct user_key_payload *ukp;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- struct crypto_ablkcipher *ctfm;
+ struct crypto_skcipher *ctfm;
const char *cipher_str;
char raw_key[EXT4_MAX_KEY_SIZE];
char mode;
@@ -237,7 +235,7 @@ retry:
if (res)
goto out;
got_key:
- ctfm = crypto_alloc_ablkcipher(cipher_str, 0, 0);
+ ctfm = crypto_alloc_skcipher(cipher_str, 0, 0);
if (!ctfm || IS_ERR(ctfm)) {
res = ctfm ? PTR_ERR(ctfm) : -ENOMEM;
printk(KERN_DEBUG
@@ -246,11 +244,11 @@ got_key:
goto out;
}
crypt_info->ci_ctfm = ctfm;
- crypto_ablkcipher_clear_flags(ctfm, ~0);
- crypto_tfm_set_flags(crypto_ablkcipher_tfm(ctfm),
+ crypto_skcipher_clear_flags(ctfm, ~0);
+ crypto_tfm_set_flags(crypto_skcipher_tfm(ctfm),
CRYPTO_TFM_REQ_WEAK_KEY);
- res = crypto_ablkcipher_setkey(ctfm, raw_key,
- ext4_encryption_key_size(mode));
+ res = crypto_skcipher_setkey(ctfm, raw_key,
+ ext4_encryption_key_size(mode));
if (res)
goto out;
memzero_explicit(raw_key, sizeof(raw_key));
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 33f5e2a50cf8..50ba27cbed03 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -285,7 +285,7 @@ errout:
static inline int is_32bit_api(void)
{
#ifdef CONFIG_COMPAT
- return is_compat_task();
+ return in_compat_syscall();
#else
return (BITS_PER_LONG == 32);
#endif
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 157b458a69d4..c04743519865 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -42,6 +42,18 @@
*/
/*
+ * with AGGRESSIVE_CHECK allocator runs consistency checks over
+ * structures. these checks slow things down a lot
+ */
+#define AGGRESSIVE_CHECK__
+
+/*
+ * with DOUBLE_CHECK defined mballoc creates persistent in-core
+ * bitmaps, maintains and uses them to check for double allocations
+ */
+#define DOUBLE_CHECK__
+
+/*
* Define EXT4FS_DEBUG to produce debug messages
*/
#undef EXT4FS_DEBUG
@@ -182,9 +194,9 @@ typedef struct ext4_io_end {
struct bio *bio; /* Linked list of completed
* bios covering the extent */
unsigned int flag; /* unwritten or not */
+ atomic_t count; /* reference counter */
loff_t offset; /* offset in the file */
ssize_t size; /* size of the extent */
- atomic_t count; /* reference counter */
} ext4_io_end_t;
struct ext4_io_submit {
@@ -1024,13 +1036,8 @@ struct ext4_inode_info {
* transaction reserved
*/
struct list_head i_rsv_conversion_list;
- /*
- * Completed IOs that need unwritten extents handling and don't have
- * transaction reserved
- */
- atomic_t i_ioend_count; /* Number of outstanding io_end structs */
- atomic_t i_unwritten; /* Nr. of inflight conversions pending */
struct work_struct i_rsv_conversion_work;
+ atomic_t i_unwritten; /* Nr. of inflight conversions pending */
spinlock_t i_block_reservation_lock;
@@ -1504,25 +1511,6 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count));
}
-static inline void ext4_set_io_unwritten_flag(struct inode *inode,
- struct ext4_io_end *io_end)
-{
- if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
- io_end->flag |= EXT4_IO_END_UNWRITTEN;
- atomic_inc(&EXT4_I(inode)->i_unwritten);
- }
-}
-
-static inline ext4_io_end_t *ext4_inode_aio(struct inode *inode)
-{
- return inode->i_private;
-}
-
-static inline void ext4_inode_aio_set(struct inode *inode, ext4_io_end_t *io)
-{
- inode->i_private = io;
-}
-
/*
* Inode dynamic state flags
*/
@@ -2506,12 +2494,14 @@ extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
int ext4_inode_is_fast_symlink(struct inode *inode);
struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int);
struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);
-int ext4_get_block_write(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create);
+int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create);
int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);
int ext4_get_block(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create);
+ struct buffer_head *bh_result, int create);
+int ext4_dio_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create);
int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
struct buffer_head *bh, int create);
int ext4_walk_page_buffers(handle_t *handle,
@@ -2559,6 +2549,9 @@ extern void ext4_da_update_reserve_space(struct inode *inode,
int used, int quota_claim);
extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk,
ext4_fsblk_t pblk, ext4_lblk_t len);
+extern int ext4_get_next_extent(struct inode *inode, ext4_lblk_t lblk,
+ unsigned int map_len,
+ struct extent_status *result);
/* indirect.c */
extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
@@ -3285,15 +3278,33 @@ static inline void ext4_inode_resume_unlocked_dio(struct inode *inode)
#define EXT4_WQ_HASH_SZ 37
#define ext4_ioend_wq(v) (&ext4__ioend_wq[((unsigned long)(v)) %\
EXT4_WQ_HASH_SZ])
-#define ext4_aio_mutex(v) (&ext4__aio_mutex[((unsigned long)(v)) %\
- EXT4_WQ_HASH_SZ])
extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
-extern struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
#define EXT4_RESIZING 0
extern int ext4_resize_begin(struct super_block *sb);
extern void ext4_resize_end(struct super_block *sb);
+static inline void ext4_set_io_unwritten_flag(struct inode *inode,
+ struct ext4_io_end *io_end)
+{
+ if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
+ io_end->flag |= EXT4_IO_END_UNWRITTEN;
+ atomic_inc(&EXT4_I(inode)->i_unwritten);
+ }
+}
+
+static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
+{
+ struct inode *inode = io_end->inode;
+
+ if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
+ io_end->flag &= ~EXT4_IO_END_UNWRITTEN;
+ /* Wake up anyone waiting on unwritten extent conversion */
+ if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
+ wake_up_all(ext4_ioend_wq(inode));
+ }
+}
+
#endif /* __KERNEL__ */
#define EFSBADCRC EBADMSG /* Bad CRC detected */
diff --git a/fs/ext4/ext4_crypto.h b/fs/ext4/ext4_crypto.h
index ac7d4e813796..1f73c29717e1 100644
--- a/fs/ext4/ext4_crypto.h
+++ b/fs/ext4/ext4_crypto.h
@@ -77,7 +77,7 @@ struct ext4_crypt_info {
char ci_data_mode;
char ci_filename_mode;
char ci_flags;
- struct crypto_ablkcipher *ci_ctfm;
+ struct crypto_skcipher *ci_ctfm;
struct key *ci_keyring_key;
char ci_master_key[EXT4_KEY_DESCRIPTOR_SIZE];
};
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 3c9381547094..8ecf84b8f5a1 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -11,7 +11,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public Licens
+ * You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
*/
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 3753ceb0b0dd..95bf4679ac54 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -15,7 +15,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public Licens
+ * You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
*/
@@ -1736,6 +1736,12 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
*/
if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN)
return 0;
+ /*
+ * The check for IO to unwritten extent is somewhat racy as we
+ * increment i_unwritten / set EXT4_STATE_DIO_UNWRITTEN only after
+ * dropping i_data_sem. But reserved blocks should save us in that
+ * case.
+ */
if (ext4_ext_is_unwritten(ex1) &&
(ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) ||
atomic_read(&EXT4_I(inode)->i_unwritten) ||
@@ -2293,59 +2299,69 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
}
/*
- * ext4_ext_put_gap_in_cache:
- * calculate boundaries of the gap that the requested block fits into
- * and cache this gap
+ * ext4_ext_determine_hole - determine hole around given block
+ * @inode: inode we lookup in
+ * @path: path in extent tree to @lblk
+ * @lblk: pointer to logical block around which we want to determine hole
+ *
+ * Determine hole length (and start if easily possible) around given logical
+ * block. We don't try too hard to find the beginning of the hole but @path
+ * actually points to extent before @lblk, we provide it.
+ *
+ * The function returns the length of a hole starting at @lblk. We update @lblk
+ * to the beginning of the hole if we managed to find it.
*/
-static void
-ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
- ext4_lblk_t block)
+static ext4_lblk_t ext4_ext_determine_hole(struct inode *inode,
+ struct ext4_ext_path *path,
+ ext4_lblk_t *lblk)
{
int depth = ext_depth(inode);
- ext4_lblk_t len;
- ext4_lblk_t lblock;
struct ext4_extent *ex;
- struct extent_status es;
+ ext4_lblk_t len;
ex = path[depth].p_ext;
if (ex == NULL) {
/* there is no extent yet, so gap is [0;-] */
- lblock = 0;
+ *lblk = 0;
len = EXT_MAX_BLOCKS;
- ext_debug("cache gap(whole file):");
- } else if (block < le32_to_cpu(ex->ee_block)) {
- lblock = block;
- len = le32_to_cpu(ex->ee_block) - block;
- ext_debug("cache gap(before): %u [%u:%u]",
- block,
- le32_to_cpu(ex->ee_block),
- ext4_ext_get_actual_len(ex));
- } else if (block >= le32_to_cpu(ex->ee_block)
+ } else if (*lblk < le32_to_cpu(ex->ee_block)) {
+ len = le32_to_cpu(ex->ee_block) - *lblk;
+ } else if (*lblk >= le32_to_cpu(ex->ee_block)
+ ext4_ext_get_actual_len(ex)) {
ext4_lblk_t next;
- lblock = le32_to_cpu(ex->ee_block)
- + ext4_ext_get_actual_len(ex);
+ *lblk = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
next = ext4_ext_next_allocated_block(path);
- ext_debug("cache gap(after): [%u:%u] %u",
- le32_to_cpu(ex->ee_block),
- ext4_ext_get_actual_len(ex),
- block);
- BUG_ON(next == lblock);
- len = next - lblock;
+ BUG_ON(next == *lblk);
+ len = next - *lblk;
} else {
BUG();
}
+ return len;
+}
- ext4_es_find_delayed_extent_range(inode, lblock, lblock + len - 1, &es);
+/*
+ * ext4_ext_put_gap_in_cache:
+ * calculate boundaries of the gap that the requested block fits into
+ * and cache this gap
+ */
+static void
+ext4_ext_put_gap_in_cache(struct inode *inode, ext4_lblk_t hole_start,
+ ext4_lblk_t hole_len)
+{
+ struct extent_status es;
+
+ ext4_es_find_delayed_extent_range(inode, hole_start,
+ hole_start + hole_len - 1, &es);
if (es.es_len) {
/* There's delayed extent containing lblock? */
- if (es.es_lblk <= lblock)
+ if (es.es_lblk <= hole_start)
return;
- len = min(es.es_lblk - lblock, len);
+ hole_len = min(es.es_lblk - hole_start, hole_len);
}
- ext_debug(" -> %u:%u\n", lblock, len);
- ext4_es_insert_extent(inode, lblock, len, ~0, EXTENT_STATUS_HOLE);
+ ext_debug(" -> %u:%u\n", hole_start, hole_len);
+ ext4_es_insert_extent(inode, hole_start, hole_len, ~0,
+ EXTENT_STATUS_HOLE);
}
/*
@@ -3927,7 +3943,7 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
static int
convert_initialized_extent(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map,
- struct ext4_ext_path **ppath, int flags,
+ struct ext4_ext_path **ppath,
unsigned int allocated)
{
struct ext4_ext_path *path = *ppath;
@@ -4007,7 +4023,6 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
struct ext4_ext_path *path = *ppath;
int ret = 0;
int err = 0;
- ext4_io_end_t *io = ext4_inode_aio(inode);
ext_debug("ext4_ext_handle_unwritten_extents: inode %lu, logical "
"block %llu, max_blocks %u, flags %x, allocated %u\n",
@@ -4030,15 +4045,6 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
flags | EXT4_GET_BLOCKS_CONVERT);
if (ret <= 0)
goto out;
- /*
- * Flag the inode(non aio case) or end_io struct (aio case)
- * that this IO needs to conversion to written when IO is
- * completed
- */
- if (io)
- ext4_set_io_unwritten_flag(inode, io);
- else
- ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
map->m_flags |= EXT4_MAP_UNWRITTEN;
goto out;
}
@@ -4283,9 +4289,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
unsigned int allocated = 0, offset = 0;
unsigned int allocated_clusters = 0;
struct ext4_allocation_request ar;
- ext4_io_end_t *io = ext4_inode_aio(inode);
ext4_lblk_t cluster_offset;
- int set_unwritten = 0;
bool map_from_cluster = false;
ext_debug("blocks %u/%u requested for inode %lu\n",
@@ -4347,7 +4351,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) {
allocated = convert_initialized_extent(
handle, inode, map, &path,
- flags, allocated);
+ allocated);
goto out2;
} else if (!ext4_ext_is_unwritten(ex))
goto out;
@@ -4368,11 +4372,22 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
* we couldn't try to create block if create flag is zero
*/
if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
+ ext4_lblk_t hole_start, hole_len;
+
+ hole_start = map->m_lblk;
+ hole_len = ext4_ext_determine_hole(inode, path, &hole_start);
/*
* put just found gap into cache to speed up
* subsequent requests
*/
- ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
+ ext4_ext_put_gap_in_cache(inode, hole_start, hole_len);
+
+ /* Update hole_len to reflect hole size after map->m_lblk */
+ if (hole_start != map->m_lblk)
+ hole_len -= map->m_lblk - hole_start;
+ map->m_pblk = 0;
+ map->m_len = min_t(unsigned int, map->m_len, hole_len);
+
goto out2;
}
@@ -4482,15 +4497,6 @@ got_allocated_blocks:
if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT){
ext4_ext_mark_unwritten(&newex);
map->m_flags |= EXT4_MAP_UNWRITTEN;
- /*
- * io_end structure was created for every IO write to an
- * unwritten extent. To avoid unnecessary conversion,
- * here we flag the IO that really needs the conversion.
- * For non asycn direct IO case, flag the inode state
- * that we need to perform conversion when IO is done.
- */
- if (flags & EXT4_GET_BLOCKS_PRE_IO)
- set_unwritten = 1;
}
err = 0;
@@ -4501,14 +4507,6 @@ got_allocated_blocks:
err = ext4_ext_insert_extent(handle, inode, &path,
&newex, flags);
- if (!err && set_unwritten) {
- if (io)
- ext4_set_io_unwritten_flag(inode, io);
- else
- ext4_set_inode_state(inode,
- EXT4_STATE_DIO_UNWRITTEN);
- }
-
if (err && free_on_err) {
int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ?
EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0;
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index ac748b3af1c1..e38b987ac7f5 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -823,8 +823,8 @@ out:
es->es_lblk = es1->es_lblk;
es->es_len = es1->es_len;
es->es_pblk = es1->es_pblk;
- if (!ext4_es_is_referenced(es))
- ext4_es_set_referenced(es);
+ if (!ext4_es_is_referenced(es1))
+ ext4_es_set_referenced(es1);
stats->es_stats_cache_hits++;
} else {
stats->es_stats_cache_misses++;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 4cd318f31cbe..6659e216385e 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -93,31 +93,29 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(iocb->ki_filp);
- struct mutex *aio_mutex = NULL;
struct blk_plug plug;
int o_direct = iocb->ki_flags & IOCB_DIRECT;
+ int unaligned_aio = 0;
int overwrite = 0;
ssize_t ret;
+ inode_lock(inode);
+ ret = generic_write_checks(iocb, from);
+ if (ret <= 0)
+ goto out;
+
/*
- * Unaligned direct AIO must be serialized; see comment above
- * In the case of O_APPEND, assume that we must always serialize
+ * Unaligned direct AIO must be serialized among each other as zeroing
+ * of partial blocks of two competing unaligned AIOs can result in data
+ * corruption.
*/
- if (o_direct &&
- ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
+ if (o_direct && ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
!is_sync_kiocb(iocb) &&
- (iocb->ki_flags & IOCB_APPEND ||
- ext4_unaligned_aio(inode, from, iocb->ki_pos))) {
- aio_mutex = ext4_aio_mutex(inode);
- mutex_lock(aio_mutex);
+ ext4_unaligned_aio(inode, from, iocb->ki_pos)) {
+ unaligned_aio = 1;
ext4_unwritten_wait(inode);
}
- inode_lock(inode);
- ret = generic_write_checks(iocb, from);
- if (ret <= 0)
- goto out;
-
/*
* If we have encountered a bitmap-format file, the size limit
* is smaller than s_maxbytes, which is for extent-mapped files.
@@ -139,7 +137,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
blk_start_plug(&plug);
/* check whether we do a DIO overwrite or not */
- if (ext4_should_dioread_nolock(inode) && !aio_mutex &&
+ if (ext4_should_dioread_nolock(inode) && !unaligned_aio &&
!file->f_mapping->nrpages && pos + length <= i_size_read(inode)) {
struct ext4_map_blocks map;
unsigned int blkbits = inode->i_blkbits;
@@ -181,14 +179,10 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (o_direct)
blk_finish_plug(&plug);
- if (aio_mutex)
- mutex_unlock(aio_mutex);
return ret;
out:
inode_unlock(inode);
- if (aio_mutex)
- mutex_unlock(aio_mutex);
return ret;
}
@@ -417,7 +411,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
*/
static int ext4_find_unwritten_pgoff(struct inode *inode,
int whence,
- struct ext4_map_blocks *map,
+ ext4_lblk_t end_blk,
loff_t *offset)
{
struct pagevec pvec;
@@ -432,7 +426,7 @@ static int ext4_find_unwritten_pgoff(struct inode *inode,
blkbits = inode->i_sb->s_blocksize_bits;
startoff = *offset;
lastoff = startoff;
- endoff = (loff_t)(map->m_lblk + map->m_len) << blkbits;
+ endoff = (loff_t)end_blk << blkbits;
index = startoff >> PAGE_CACHE_SHIFT;
end = endoff >> PAGE_CACHE_SHIFT;
@@ -550,12 +544,11 @@ out:
static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
{
struct inode *inode = file->f_mapping->host;
- struct ext4_map_blocks map;
struct extent_status es;
ext4_lblk_t start, last, end;
loff_t dataoff, isize;
int blkbits;
- int ret = 0;
+ int ret;
inode_lock(inode);
@@ -572,41 +565,32 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
dataoff = offset;
do {
- map.m_lblk = last;
- map.m_len = end - last + 1;
- ret = ext4_map_blocks(NULL, inode, &map, 0);
- if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
- if (last != start)
- dataoff = (loff_t)last << blkbits;
- break;
+ ret = ext4_get_next_extent(inode, last, end - last + 1, &es);
+ if (ret <= 0) {
+ /* No extent found -> no data */
+ if (ret == 0)
+ ret = -ENXIO;
+ inode_unlock(inode);
+ return ret;
}
- /*
- * If there is a delay extent at this offset,
- * it will be as a data.
- */
- ext4_es_find_delayed_extent_range(inode, last, last, &es);
- if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
- if (last != start)
- dataoff = (loff_t)last << blkbits;
+ last = es.es_lblk;
+ if (last != start)
+ dataoff = (loff_t)last << blkbits;
+ if (!ext4_es_is_unwritten(&es))
break;
- }
/*
* If there is a unwritten extent at this offset,
* it will be as a data or a hole according to page
* cache that has data or not.
*/
- if (map.m_flags & EXT4_MAP_UNWRITTEN) {
- int unwritten;
- unwritten = ext4_find_unwritten_pgoff(inode, SEEK_DATA,
- &map, &dataoff);
- if (unwritten)
- break;
- }
-
- last++;
+ if (ext4_find_unwritten_pgoff(inode, SEEK_DATA,
+ es.es_lblk + es.es_len, &dataoff))
+ break;
+ last += es.es_len;
dataoff = (loff_t)last << blkbits;
+ cond_resched();
} while (last <= end);
inode_unlock(inode);
@@ -623,12 +607,11 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
{
struct inode *inode = file->f_mapping->host;
- struct ext4_map_blocks map;
struct extent_status es;
ext4_lblk_t start, last, end;
loff_t holeoff, isize;
int blkbits;
- int ret = 0;
+ int ret;
inode_lock(inode);
@@ -645,44 +628,30 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
holeoff = offset;
do {
- map.m_lblk = last;
- map.m_len = end - last + 1;
- ret = ext4_map_blocks(NULL, inode, &map, 0);
- if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
- last += ret;
- holeoff = (loff_t)last << blkbits;
- continue;
+ ret = ext4_get_next_extent(inode, last, end - last + 1, &es);
+ if (ret < 0) {
+ inode_unlock(inode);
+ return ret;
}
-
- /*
- * If there is a delay extent at this offset,
- * we will skip this extent.
- */
- ext4_es_find_delayed_extent_range(inode, last, last, &es);
- if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
- last = es.es_lblk + es.es_len;
- holeoff = (loff_t)last << blkbits;
- continue;
+ /* Found a hole? */
+ if (ret == 0 || es.es_lblk > last) {
+ if (last != start)
+ holeoff = (loff_t)last << blkbits;
+ break;
}
-
/*
* If there is a unwritten extent at this offset,
* it will be as a data or a hole according to page
* cache that has data or not.
*/
- if (map.m_flags & EXT4_MAP_UNWRITTEN) {
- int unwritten;
- unwritten = ext4_find_unwritten_pgoff(inode, SEEK_HOLE,
- &map, &holeoff);
- if (!unwritten) {
- last += ret;
- holeoff = (loff_t)last << blkbits;
- continue;
- }
- }
+ if (ext4_es_is_unwritten(&es) &&
+ ext4_find_unwritten_pgoff(inode, SEEK_HOLE,
+ last + es.es_len, &holeoff))
+ break;
- /* find a hole */
- break;
+ last += es.es_len;
+ holeoff = (loff_t)last << blkbits;
+ cond_resched();
} while (last <= end);
inode_unlock(inode);
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index acc0ad56bf2f..237b877d316d 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -787,7 +787,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
sbi = EXT4_SB(sb);
/*
- * Initalize owners and quota early so that we don't have to account
+ * Initialize owners and quota early so that we don't have to account
* for quota initialization worst case in standard inode creating
* transaction
*/
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 355ef9c36c87..3027fa681de5 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -555,8 +555,23 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
goto got_it;
}
- /* Next simple case - plain lookup or failed read of indirect block */
- if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO)
+ /* Next simple case - plain lookup failed */
+ if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
+ unsigned epb = inode->i_sb->s_blocksize / sizeof(u32);
+ int i;
+
+ /* Count number blocks in a subtree under 'partial' */
+ count = 1;
+ for (i = 0; partial + i != chain + depth - 1; i++)
+ count *= epb;
+ /* Fill in size of a hole we found */
+ map->m_pblk = 0;
+ map->m_len = min_t(unsigned int, map->m_len, count);
+ goto cleanup;
+ }
+
+ /* Failed read of indirect block */
+ if (err == -EIO)
goto cleanup;
/*
@@ -693,21 +708,21 @@ retry:
}
if (IS_DAX(inode))
ret = dax_do_io(iocb, inode, iter, offset,
- ext4_get_block, NULL, 0);
+ ext4_dio_get_block, NULL, 0);
else
ret = __blockdev_direct_IO(iocb, inode,
inode->i_sb->s_bdev, iter,
- offset, ext4_get_block, NULL,
- NULL, 0);
+ offset, ext4_dio_get_block,
+ NULL, NULL, 0);
inode_dio_end(inode);
} else {
locked:
if (IS_DAX(inode))
ret = dax_do_io(iocb, inode, iter, offset,
- ext4_get_block, NULL, DIO_LOCKING);
+ ext4_dio_get_block, NULL, DIO_LOCKING);
else
ret = blockdev_direct_IO(iocb, inode, iter, offset,
- ext4_get_block);
+ ext4_dio_get_block);
if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) {
loff_t isize = i_size_read(inode);
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index dfe3b9bafc0d..7cbdd3752ba5 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -581,9 +581,10 @@ retry:
if (ret)
goto out;
- if (ext4_should_dioread_nolock(inode))
- ret = __block_write_begin(page, from, to, ext4_get_block_write);
- else
+ if (ext4_should_dioread_nolock(inode)) {
+ ret = __block_write_begin(page, from, to,
+ ext4_get_block_unwritten);
+ } else
ret = __block_write_begin(page, from, to, ext4_get_block);
if (!ret && ext4_should_journal_data(inode)) {
@@ -1696,7 +1697,6 @@ int ext4_delete_inline_entry(handle_t *handle,
if (err)
goto out;
- BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
err = ext4_mark_inode_dirty(handle, dir);
if (unlikely(err))
goto out;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index aee960b1af34..dab84a2530ff 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -216,7 +216,6 @@ void ext4_evict_inode(struct inode *inode)
}
truncate_inode_pages_final(&inode->i_data);
- WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
goto no_delete;
}
@@ -228,8 +227,6 @@ void ext4_evict_inode(struct inode *inode)
ext4_begin_ordered_truncate(inode, 0);
truncate_inode_pages_final(&inode->i_data);
- WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
-
/*
* Protect us against freezing - iput() caller didn't have to have any
* protection against it
@@ -458,13 +455,13 @@ static void ext4_map_blocks_es_recheck(handle_t *handle,
* Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping
* based files
*
- * On success, it returns the number of blocks being mapped or allocated.
- * if create==0 and the blocks are pre-allocated and unwritten block,
- * the result buffer head is unmapped. If the create ==1, it will make sure
- * the buffer head is mapped.
+ * On success, it returns the number of blocks being mapped or allocated. if
+ * create==0 and the blocks are pre-allocated and unwritten, the resulting @map
+ * is marked as unwritten. If the create == 1, it will mark @map as mapped.
*
* It returns 0 if plain look up failed (blocks have not been allocated), in
- * that case, buffer head is unmapped
+ * that case, @map is returned as unmapped but we still do fill map->m_len to
+ * indicate the length of a hole starting at map->m_lblk.
*
* It returns the error in case of allocation failure.
*/
@@ -507,6 +504,11 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
retval = map->m_len;
map->m_len = retval;
} else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) {
+ map->m_pblk = 0;
+ retval = es.es_len - (map->m_lblk - es.es_lblk);
+ if (retval > map->m_len)
+ retval = map->m_len;
+ map->m_len = retval;
retval = 0;
} else {
BUG_ON(1);
@@ -714,16 +716,11 @@ static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags)
cmpxchg(&bh->b_state, old_state, new_state) != old_state));
}
-/* Maximum number of blocks we map for direct IO at once. */
-#define DIO_MAX_BLOCKS 4096
-
static int _ext4_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh, int flags)
{
- handle_t *handle = ext4_journal_current_handle();
struct ext4_map_blocks map;
- int ret = 0, started = 0;
- int dio_credits;
+ int ret = 0;
if (ext4_has_inline_data(inode))
return -ERANGE;
@@ -731,33 +728,14 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
map.m_lblk = iblock;
map.m_len = bh->b_size >> inode->i_blkbits;
- if (flags && !handle) {
- /* Direct IO write... */
- if (map.m_len > DIO_MAX_BLOCKS)
- map.m_len = DIO_MAX_BLOCKS;
- dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
- handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
- dio_credits);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- return ret;
- }
- started = 1;
- }
-
- ret = ext4_map_blocks(handle, inode, &map, flags);
+ ret = ext4_map_blocks(ext4_journal_current_handle(), inode, &map,
+ flags);
if (ret > 0) {
- ext4_io_end_t *io_end = ext4_inode_aio(inode);
-
map_bh(bh, inode->i_sb, map.m_pblk);
ext4_update_bh_state(bh, map.m_flags);
- if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
- set_buffer_defer_completion(bh);
bh->b_size = inode->i_sb->s_blocksize * map.m_len;
ret = 0;
}
- if (started)
- ext4_journal_stop(handle);
return ret;
}
@@ -769,6 +747,155 @@ int ext4_get_block(struct inode *inode, sector_t iblock,
}
/*
+ * Get block function used when preparing for buffered write if we require
+ * creating an unwritten extent if blocks haven't been allocated. The extent
+ * will be converted to written after the IO is complete.
+ */
+int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ ext4_debug("ext4_get_block_unwritten: inode %lu, create flag %d\n",
+ inode->i_ino, create);
+ return _ext4_get_block(inode, iblock, bh_result,
+ EXT4_GET_BLOCKS_IO_CREATE_EXT);
+}
+
+/* Maximum number of blocks we map for direct IO at once. */
+#define DIO_MAX_BLOCKS 4096
+
+static handle_t *start_dio_trans(struct inode *inode,
+ struct buffer_head *bh_result)
+{
+ int dio_credits;
+
+ /* Trim mapping request to maximum we can map at once for DIO */
+ if (bh_result->b_size >> inode->i_blkbits > DIO_MAX_BLOCKS)
+ bh_result->b_size = DIO_MAX_BLOCKS << inode->i_blkbits;
+ dio_credits = ext4_chunk_trans_blocks(inode,
+ bh_result->b_size >> inode->i_blkbits);
+ return ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits);
+}
+
+/* Get block function for DIO reads and writes to inodes without extents */
+int ext4_dio_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh, int create)
+{
+ handle_t *handle;
+ int ret;
+
+ /* We don't expect handle for direct IO */
+ WARN_ON_ONCE(ext4_journal_current_handle());
+
+ if (create) {
+ handle = start_dio_trans(inode, bh);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+ }
+ ret = _ext4_get_block(inode, iblock, bh,
+ create ? EXT4_GET_BLOCKS_CREATE : 0);
+ if (create)
+ ext4_journal_stop(handle);
+ return ret;
+}
+
+/*
+ * Get block function for AIO DIO writes when we create unwritten extent if
+ * blocks are not allocated yet. The extent will be converted to written
+ * after IO is complete.
+ */
+static int ext4_dio_get_block_unwritten_async(struct inode *inode,
+ sector_t iblock, struct buffer_head *bh_result, int create)
+{
+ handle_t *handle;
+ int ret;
+
+ /* We don't expect handle for direct IO */
+ WARN_ON_ONCE(ext4_journal_current_handle());
+
+ handle = start_dio_trans(inode, bh_result);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+ ret = _ext4_get_block(inode, iblock, bh_result,
+ EXT4_GET_BLOCKS_IO_CREATE_EXT);
+ ext4_journal_stop(handle);
+
+ /*
+ * When doing DIO using unwritten extents, we need io_end to convert
+ * unwritten extents to written on IO completion. We allocate io_end
+ * once we spot unwritten extent and store it in b_private. Generic
+ * DIO code keeps b_private set and furthermore passes the value to
+ * our completion callback in 'private' argument.
+ */
+ if (!ret && buffer_unwritten(bh_result)) {
+ if (!bh_result->b_private) {
+ ext4_io_end_t *io_end;
+
+ io_end = ext4_init_io_end(inode, GFP_KERNEL);
+ if (!io_end)
+ return -ENOMEM;
+ bh_result->b_private = io_end;
+ ext4_set_io_unwritten_flag(inode, io_end);
+ }
+ set_buffer_defer_completion(bh_result);
+ }
+
+ return ret;
+}
+
+/*
+ * Get block function for non-AIO DIO writes when we create unwritten extent if
+ * blocks are not allocated yet. The extent will be converted to written
+ * after IO is complete from ext4_ext_direct_IO() function.
+ */
+static int ext4_dio_get_block_unwritten_sync(struct inode *inode,
+ sector_t iblock, struct buffer_head *bh_result, int create)
+{
+ handle_t *handle;
+ int ret;
+
+ /* We don't expect handle for direct IO */
+ WARN_ON_ONCE(ext4_journal_current_handle());
+
+ handle = start_dio_trans(inode, bh_result);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+ ret = _ext4_get_block(inode, iblock, bh_result,
+ EXT4_GET_BLOCKS_IO_CREATE_EXT);
+ ext4_journal_stop(handle);
+
+ /*
+ * Mark inode as having pending DIO writes to unwritten extents.
+ * ext4_ext_direct_IO() checks this flag and converts extents to
+ * written.
+ */
+ if (!ret && buffer_unwritten(bh_result))
+ ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
+
+ return ret;
+}
+
+static int ext4_dio_get_block_overwrite(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ int ret;
+
+ ext4_debug("ext4_dio_get_block_overwrite: inode %lu, create flag %d\n",
+ inode->i_ino, create);
+ /* We don't expect handle for direct IO */
+ WARN_ON_ONCE(ext4_journal_current_handle());
+
+ ret = _ext4_get_block(inode, iblock, bh_result, 0);
+ /*
+ * Blocks should have been preallocated! ext4_file_write_iter() checks
+ * that.
+ */
+ WARN_ON_ONCE(!buffer_mapped(bh_result) || buffer_unwritten(bh_result));
+
+ return ret;
+}
+
+
+/*
* `handle' can be NULL if create is zero
*/
struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
@@ -1079,13 +1206,14 @@ retry_journal:
#ifdef CONFIG_EXT4_FS_ENCRYPTION
if (ext4_should_dioread_nolock(inode))
ret = ext4_block_write_begin(page, pos, len,
- ext4_get_block_write);
+ ext4_get_block_unwritten);
else
ret = ext4_block_write_begin(page, pos, len,
ext4_get_block);
#else
if (ext4_should_dioread_nolock(inode))
- ret = __block_write_begin(page, pos, len, ext4_get_block_write);
+ ret = __block_write_begin(page, pos, len,
+ ext4_get_block_unwritten);
else
ret = __block_write_begin(page, pos, len, ext4_get_block);
#endif
@@ -3088,37 +3216,6 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
return try_to_free_buffers(page);
}
-/*
- * ext4_get_block used when preparing for a DIO write or buffer write.
- * We allocate an uinitialized extent if blocks haven't been allocated.
- * The extent will be converted to initialized after the IO is complete.
- */
-int ext4_get_block_write(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create)
-{
- ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
- inode->i_ino, create);
- return _ext4_get_block(inode, iblock, bh_result,
- EXT4_GET_BLOCKS_IO_CREATE_EXT);
-}
-
-static int ext4_get_block_overwrite(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create)
-{
- int ret;
-
- ext4_debug("ext4_get_block_overwrite: inode %lu, create flag %d\n",
- inode->i_ino, create);
- ret = _ext4_get_block(inode, iblock, bh_result, 0);
- /*
- * Blocks should have been preallocated! ext4_file_write_iter() checks
- * that.
- */
- WARN_ON_ONCE(!buffer_mapped(bh_result));
-
- return ret;
-}
-
#ifdef CONFIG_FS_DAX
int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
@@ -3179,13 +3276,12 @@ out:
WARN_ON_ONCE(ret == 0 && create);
if (ret > 0) {
map_bh(bh_result, inode->i_sb, map.m_pblk);
- bh_result->b_state = (bh_result->b_state & ~EXT4_MAP_FLAGS) |
- map.m_flags;
/*
* At least for now we have to clear BH_New so that DAX code
* doesn't attempt to zero blocks again in a racy way.
*/
- bh_result->b_state &= ~(1 << BH_New);
+ map.m_flags &= ~EXT4_MAP_NEW;
+ ext4_update_bh_state(bh_result, map.m_flags);
bh_result->b_size = map.m_len << inode->i_blkbits;
ret = 0;
}
@@ -3193,24 +3289,32 @@ out:
}
#endif
-static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
+static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
ssize_t size, void *private)
{
- ext4_io_end_t *io_end = iocb->private;
+ ext4_io_end_t *io_end = private;
/* if not async direct IO just return */
if (!io_end)
- return;
+ return 0;
ext_debug("ext4_end_io_dio(): io_end 0x%p "
"for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
- iocb->private, io_end->inode->i_ino, iocb, offset,
- size);
+ io_end, io_end->inode->i_ino, iocb, offset, size);
- iocb->private = NULL;
+ /*
+ * Error during AIO DIO. We cannot convert unwritten extents as the
+ * data was not written. Just clear the unwritten flag and drop io_end.
+ */
+ if (size <= 0) {
+ ext4_clear_io_unwritten_flag(io_end);
+ size = 0;
+ }
io_end->offset = offset;
io_end->size = size;
ext4_put_io_end(io_end);
+
+ return 0;
}
/*
@@ -3243,7 +3347,6 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
get_block_t *get_block_func = NULL;
int dio_flags = 0;
loff_t final_size = offset + count;
- ext4_io_end_t *io_end = NULL;
/* Use the old path for reads and writes beyond i_size. */
if (iov_iter_rw(iter) != WRITE || final_size > inode->i_size)
@@ -3268,16 +3371,17 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
/*
* We could direct write to holes and fallocate.
*
- * Allocated blocks to fill the hole are marked as
- * unwritten to prevent parallel buffered read to expose
- * the stale data before DIO complete the data IO.
+ * Allocated blocks to fill the hole are marked as unwritten to prevent
+ * parallel buffered read to expose the stale data before DIO complete
+ * the data IO.
*
- * As to previously fallocated extents, ext4 get_block will
- * just simply mark the buffer mapped but still keep the
- * extents unwritten.
+ * As to previously fallocated extents, ext4 get_block will just simply
+ * mark the buffer mapped but still keep the extents unwritten.
*
- * For non AIO case, we will convert those unwritten extents
- * to written after return back from blockdev_direct_IO.
+ * For non AIO case, we will convert those unwritten extents to written
+ * after return back from blockdev_direct_IO. That way we save us from
+ * allocating io_end structure and also the overhead of offloading
+ * the extent convertion to a workqueue.
*
* For async DIO, the conversion needs to be deferred when the
* IO is completed. The ext4 end_io callback function will be
@@ -3285,30 +3389,13 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
* case, we allocate an io_end structure to hook to the iocb.
*/
iocb->private = NULL;
- if (overwrite) {
- get_block_func = ext4_get_block_overwrite;
+ if (overwrite)
+ get_block_func = ext4_dio_get_block_overwrite;
+ else if (is_sync_kiocb(iocb)) {
+ get_block_func = ext4_dio_get_block_unwritten_sync;
+ dio_flags = DIO_LOCKING;
} else {
- ext4_inode_aio_set(inode, NULL);
- if (!is_sync_kiocb(iocb)) {
- io_end = ext4_init_io_end(inode, GFP_NOFS);
- if (!io_end) {
- ret = -ENOMEM;
- goto retake_lock;
- }
- /*
- * Grab reference for DIO. Will be dropped in
- * ext4_end_io_dio()
- */
- iocb->private = ext4_get_io_end(io_end);
- /*
- * we save the io structure for current async direct
- * IO, so that later ext4_map_blocks() could flag the
- * io structure whether there is a unwritten extents
- * needs to be converted when IO is completed.
- */
- ext4_inode_aio_set(inode, io_end);
- }
- get_block_func = ext4_get_block_write;
+ get_block_func = ext4_dio_get_block_unwritten_async;
dio_flags = DIO_LOCKING;
}
#ifdef CONFIG_EXT4_FS_ENCRYPTION
@@ -3323,27 +3410,6 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
get_block_func,
ext4_end_io_dio, NULL, dio_flags);
- /*
- * Put our reference to io_end. This can free the io_end structure e.g.
- * in sync IO case or in case of error. It can even perform extent
- * conversion if all bios we submitted finished before we got here.
- * Note that in that case iocb->private can be already set to NULL
- * here.
- */
- if (io_end) {
- ext4_inode_aio_set(inode, NULL);
- ext4_put_io_end(io_end);
- /*
- * When no IO was submitted ext4_end_io_dio() was not
- * called so we have to put iocb's reference.
- */
- if (ret <= 0 && ret != -EIOCBQUEUED && iocb->private) {
- WARN_ON(iocb->private != io_end);
- WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
- ext4_put_io_end(io_end);
- iocb->private = NULL;
- }
- }
if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
EXT4_STATE_DIO_UNWRITTEN)) {
int err;
@@ -3358,7 +3424,6 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
}
-retake_lock:
if (iov_iter_rw(iter) == WRITE)
inode_dio_end(inode);
/* take i_mutex locking again if we do a ovewrite dio */
@@ -5261,6 +5326,8 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
might_sleep();
trace_ext4_mark_inode_dirty(inode, _RET_IP_);
err = ext4_reserve_inode_write(handle, inode, &iloc);
+ if (err)
+ return err;
if (ext4_handle_valid(handle) &&
EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
!ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) {
@@ -5291,9 +5358,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
}
}
}
- if (!err)
- err = ext4_mark_iloc_dirty(handle, inode, &iloc);
- return err;
+ return ext4_mark_iloc_dirty(handle, inode, &iloc);
}
/*
@@ -5502,7 +5567,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
unlock_page(page);
/* OK, we need to fill the hole... */
if (ext4_should_dioread_nolock(inode))
- get_block = ext4_get_block_write;
+ get_block = ext4_get_block_unwritten;
else
get_block = ext4_get_block;
retry_alloc:
@@ -5545,3 +5610,70 @@ int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
return err;
}
+
+/*
+ * Find the first extent at or after @lblk in an inode that is not a hole.
+ * Search for @map_len blocks at most. The extent is returned in @result.
+ *
+ * The function returns 1 if we found an extent. The function returns 0 in
+ * case there is no extent at or after @lblk and in that case also sets
+ * @result->es_len to 0. In case of error, the error code is returned.
+ */
+int ext4_get_next_extent(struct inode *inode, ext4_lblk_t lblk,
+ unsigned int map_len, struct extent_status *result)
+{
+ struct ext4_map_blocks map;
+ struct extent_status es = {};
+ int ret;
+
+ map.m_lblk = lblk;
+ map.m_len = map_len;
+
+ /*
+ * For non-extent based files this loop may iterate several times since
+ * we do not determine full hole size.
+ */
+ while (map.m_len > 0) {
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+ if (ret < 0)
+ return ret;
+ /* There's extent covering m_lblk? Just return it. */
+ if (ret > 0) {
+ int status;
+
+ ext4_es_store_pblock(result, map.m_pblk);
+ result->es_lblk = map.m_lblk;
+ result->es_len = map.m_len;
+ if (map.m_flags & EXT4_MAP_UNWRITTEN)
+ status = EXTENT_STATUS_UNWRITTEN;
+ else
+ status = EXTENT_STATUS_WRITTEN;
+ ext4_es_store_status(result, status);
+ return 1;
+ }
+ ext4_es_find_delayed_extent_range(inode, map.m_lblk,
+ map.m_lblk + map.m_len - 1,
+ &es);
+ /* Is delalloc data before next block in extent tree? */
+ if (es.es_len && es.es_lblk < map.m_lblk + map.m_len) {
+ ext4_lblk_t offset = 0;
+
+ if (es.es_lblk < lblk)
+ offset = lblk - es.es_lblk;
+ result->es_lblk = es.es_lblk + offset;
+ ext4_es_store_pblock(result,
+ ext4_es_pblock(&es) + offset);
+ result->es_len = es.es_len - offset;
+ ext4_es_store_status(result, ext4_es_status(&es));
+
+ return 1;
+ }
+ /* There's a hole at m_lblk, advance us after it */
+ map.m_lblk += map.m_len;
+ map_len -= map.m_len;
+ map.m_len = map_len;
+ cond_resched();
+ }
+ result->es_len = 0;
+ return 0;
+}
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 4424b7bf8ac6..50e05df28f66 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -11,7 +11,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public Licens
+ * You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
*/
@@ -815,7 +815,7 @@ static void mb_regenerate_buddy(struct ext4_buddy *e4b)
* for this page; do not hold this lock when calling this routine!
*/
-static int ext4_mb_init_cache(struct page *page, char *incore)
+static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
{
ext4_group_t ngroups;
int blocksize;
@@ -848,7 +848,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
/* allocate buffer_heads to read bitmaps */
if (groups_per_page > 1) {
i = sizeof(struct buffer_head *) * groups_per_page;
- bh = kzalloc(i, GFP_NOFS);
+ bh = kzalloc(i, gfp);
if (bh == NULL) {
err = -ENOMEM;
goto out;
@@ -983,7 +983,7 @@ out:
* are on the same page e4b->bd_buddy_page is NULL and return value is 0.
*/
static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
- ext4_group_t group, struct ext4_buddy *e4b)
+ ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp)
{
struct inode *inode = EXT4_SB(sb)->s_buddy_cache;
int block, pnum, poff;
@@ -1002,7 +1002,7 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
block = group * 2;
pnum = block / blocks_per_page;
poff = block % blocks_per_page;
- page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+ page = find_or_create_page(inode->i_mapping, pnum, gfp);
if (!page)
return -ENOMEM;
BUG_ON(page->mapping != inode->i_mapping);
@@ -1016,7 +1016,7 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
block++;
pnum = block / blocks_per_page;
- page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+ page = find_or_create_page(inode->i_mapping, pnum, gfp);
if (!page)
return -ENOMEM;
BUG_ON(page->mapping != inode->i_mapping);
@@ -1042,7 +1042,7 @@ static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b)
* calling this routine!
*/
static noinline_for_stack
-int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
+int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp)
{
struct ext4_group_info *this_grp;
@@ -1062,7 +1062,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
* The call to ext4_mb_get_buddy_page_lock will mark the
* page accessed.
*/
- ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b);
+ ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b, gfp);
if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) {
/*
* somebody initialized the group
@@ -1072,7 +1072,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
}
page = e4b.bd_bitmap_page;
- ret = ext4_mb_init_cache(page, NULL);
+ ret = ext4_mb_init_cache(page, NULL, gfp);
if (ret)
goto err;
if (!PageUptodate(page)) {
@@ -1091,7 +1091,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
}
/* init buddy cache */
page = e4b.bd_buddy_page;
- ret = ext4_mb_init_cache(page, e4b.bd_bitmap);
+ ret = ext4_mb_init_cache(page, e4b.bd_bitmap, gfp);
if (ret)
goto err;
if (!PageUptodate(page)) {
@@ -1109,8 +1109,8 @@ err:
* calling this routine!
*/
static noinline_for_stack int
-ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
- struct ext4_buddy *e4b)
+ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
+ struct ext4_buddy *e4b, gfp_t gfp)
{
int blocks_per_page;
int block;
@@ -1140,7 +1140,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
* we need full data about the group
* to make a good selection
*/
- ret = ext4_mb_init_group(sb, group);
+ ret = ext4_mb_init_group(sb, group, gfp);
if (ret)
return ret;
}
@@ -1168,11 +1168,11 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
* wait for it to initialize.
*/
page_cache_release(page);
- page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+ page = find_or_create_page(inode->i_mapping, pnum, gfp);
if (page) {
BUG_ON(page->mapping != inode->i_mapping);
if (!PageUptodate(page)) {
- ret = ext4_mb_init_cache(page, NULL);
+ ret = ext4_mb_init_cache(page, NULL, gfp);
if (ret) {
unlock_page(page);
goto err;
@@ -1204,11 +1204,12 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
if (page == NULL || !PageUptodate(page)) {
if (page)
page_cache_release(page);
- page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+ page = find_or_create_page(inode->i_mapping, pnum, gfp);
if (page) {
BUG_ON(page->mapping != inode->i_mapping);
if (!PageUptodate(page)) {
- ret = ext4_mb_init_cache(page, e4b->bd_bitmap);
+ ret = ext4_mb_init_cache(page, e4b->bd_bitmap,
+ gfp);
if (ret) {
unlock_page(page);
goto err;
@@ -1247,6 +1248,12 @@ err:
return ret;
}
+static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
+ struct ext4_buddy *e4b)
+{
+ return ext4_mb_load_buddy_gfp(sb, group, e4b, GFP_NOFS);
+}
+
static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
{
if (e4b->bd_bitmap_page)
@@ -2045,7 +2052,7 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
/* We only do this if the grp has never been initialized */
if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
- int ret = ext4_mb_init_group(ac->ac_sb, group);
+ int ret = ext4_mb_init_group(ac->ac_sb, group, GFP_NOFS);
if (ret)
return ret;
}
@@ -4695,16 +4702,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
}
/*
- * We need to make sure we don't reuse the freed block until
- * after the transaction is committed, which we can do by
- * treating the block as metadata, below. We make an
- * exception if the inode is to be written in writeback mode
- * since writeback mode has weak data consistency guarantees.
- */
- if (!ext4_should_writeback_data(inode))
- flags |= EXT4_FREE_BLOCKS_METADATA;
-
- /*
* If the extent to be freed does not begin on a cluster
* boundary, we need to deal with partial clusters at the
* beginning and end of the extent. Normally we will free
@@ -4738,14 +4735,13 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
int i;
+ int is_metadata = flags & EXT4_FREE_BLOCKS_METADATA;
for (i = 0; i < count; i++) {
cond_resched();
- bh = sb_find_get_block(inode->i_sb, block + i);
- if (!bh)
- continue;
- ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
- inode, bh, block + i);
+ if (is_metadata)
+ bh = sb_find_get_block(inode->i_sb, block + i);
+ ext4_forget(handle, is_metadata, inode, bh, block + i);
}
}
@@ -4815,16 +4811,23 @@ do_more:
#endif
trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters);
- err = ext4_mb_load_buddy(sb, block_group, &e4b);
+ /* __GFP_NOFAIL: retry infinitely, ignore TIF_MEMDIE and memcg limit. */
+ err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b,
+ GFP_NOFS|__GFP_NOFAIL);
if (err)
goto error_return;
- if ((flags & EXT4_FREE_BLOCKS_METADATA) && ext4_handle_valid(handle)) {
+ /*
+ * We need to make sure we don't reuse the freed block until after the
+ * transaction is committed. We make an exception if the inode is to be
+ * written in writeback mode since writeback mode has weak data
+ * consistency guarantees.
+ */
+ if (ext4_handle_valid(handle) &&
+ ((flags & EXT4_FREE_BLOCKS_METADATA) ||
+ !ext4_should_writeback_data(inode))) {
struct ext4_free_data *new_entry;
/*
- * blocks being freed are metadata. these blocks shouldn't
- * be used until this transaction is committed
- *
* We use __GFP_NOFAIL because ext4_free_blocks() is not allowed
* to fail.
*/
@@ -5217,7 +5220,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
grp = ext4_get_group_info(sb, group);
/* We only do this if the grp has never been initialized */
if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
- ret = ext4_mb_init_group(sb, group);
+ ret = ext4_mb_init_group(sb, group, GFP_NOFS);
if (ret)
break;
}
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index d634e183b4d4..3ef1df6ae9ec 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -23,18 +23,6 @@
#include "ext4.h"
/*
- * with AGGRESSIVE_CHECK allocator runs consistency checks over
- * structures. these checks slow things down a lot
- */
-#define AGGRESSIVE_CHECK__
-
-/*
- * with DOUBLE_CHECK defined mballoc creates persistent in-core
- * bitmaps, maintains and uses them to check for double allocations
- */
-#define DOUBLE_CHECK__
-
-/*
*/
#ifdef CONFIG_EXT4_DEBUG
extern ushort ext4_mballoc_debug;
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index a4651894cc33..364ea4d4a943 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -361,7 +361,7 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
* blocks.
*
* While converting to extents we need not
- * update the orignal inode i_blocks for extent blocks
+ * update the original inode i_blocks for extent blocks
* via quota APIs. The quota update happened via tmp_inode already.
*/
spin_lock(&inode->i_lock);
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index 0a512aa81bf7..24445275d330 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -91,21 +91,22 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
submit_bh(READ_SYNC | REQ_META | REQ_PRIO, *bh);
wait_on_buffer(*bh);
if (!buffer_uptodate(*bh)) {
- brelse(*bh);
- *bh = NULL;
ret = -EIO;
goto warn_exit;
}
-
mmp = (struct mmp_struct *)((*bh)->b_data);
- if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC)
+ if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC) {
ret = -EFSCORRUPTED;
- else if (!ext4_mmp_csum_verify(sb, mmp))
+ goto warn_exit;
+ }
+ if (!ext4_mmp_csum_verify(sb, mmp)) {
ret = -EFSBADCRC;
- else
- return 0;
-
+ goto warn_exit;
+ }
+ return 0;
warn_exit:
+ brelse(*bh);
+ *bh = NULL;
ext4_warning(sb, "Error %d while reading MMP block %llu",
ret, mmp_block);
return ret;
@@ -181,15 +182,13 @@ static int kmmpd(void *data)
EXT4_FEATURE_INCOMPAT_MMP)) {
ext4_warning(sb, "kmmpd being stopped since MMP feature"
" has been disabled.");
- EXT4_SB(sb)->s_mmp_tsk = NULL;
- goto failed;
+ goto exit_thread;
}
if (sb->s_flags & MS_RDONLY) {
ext4_warning(sb, "kmmpd being stopped since filesystem "
"has been remounted as readonly.");
- EXT4_SB(sb)->s_mmp_tsk = NULL;
- goto failed;
+ goto exit_thread;
}
diff = jiffies - last_update_time;
@@ -211,9 +210,7 @@ static int kmmpd(void *data)
if (retval) {
ext4_error(sb, "error reading MMP data: %d",
retval);
-
- EXT4_SB(sb)->s_mmp_tsk = NULL;
- goto failed;
+ goto exit_thread;
}
mmp_check = (struct mmp_struct *)(bh_check->b_data);
@@ -225,7 +222,9 @@ static int kmmpd(void *data)
"The filesystem seems to have been"
" multiply mounted.");
ext4_error(sb, "abort");
- goto failed;
+ put_bh(bh_check);
+ retval = -EBUSY;
+ goto exit_thread;
}
put_bh(bh_check);
}
@@ -248,7 +247,8 @@ static int kmmpd(void *data)
retval = write_mmp_block(sb, bh);
-failed:
+exit_thread:
+ EXT4_SB(sb)->s_mmp_tsk = NULL;
kfree(data);
brelse(bh);
return retval;
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index e032a0423e35..4098acc701c3 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -390,6 +390,7 @@ data_copy:
*err = ext4_get_block(orig_inode, orig_blk_offset + i, bh, 0);
if (*err < 0)
break;
+ bh = bh->b_this_page;
}
if (!*err)
*err = block_commit_write(pagep[0], from, from + replaced_size);
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 090b3498638e..d77d15f4b674 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -128,9 +128,6 @@ static void ext4_release_io_end(ext4_io_end_t *io_end)
BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
WARN_ON(io_end->handle);
- if (atomic_dec_and_test(&EXT4_I(io_end->inode)->i_ioend_count))
- wake_up_all(ext4_ioend_wq(io_end->inode));
-
for (bio = io_end->bio; bio; bio = next_bio) {
next_bio = bio->bi_private;
ext4_finish_bio(bio);
@@ -139,16 +136,6 @@ static void ext4_release_io_end(ext4_io_end_t *io_end)
kmem_cache_free(io_end_cachep, io_end);
}
-static void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
-{
- struct inode *inode = io_end->inode;
-
- io_end->flag &= ~EXT4_IO_END_UNWRITTEN;
- /* Wake up anyone waiting on unwritten extent conversion */
- if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
- wake_up_all(ext4_ioend_wq(inode));
-}
-
/*
* Check a range of space and convert unwritten extents to written. Note that
* we are protected from truncate touching same part of extent tree by the
@@ -265,7 +252,6 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
{
ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags);
if (io) {
- atomic_inc(&EXT4_I(inode)->i_ioend_count);
io->inode = inode;
INIT_LIST_HEAD(&io->list);
atomic_set(&io->count, 1);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 3ed01ec011d7..539297515896 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -55,7 +55,6 @@
static struct ext4_lazy_init *ext4_li_info;
static struct mutex ext4_li_mtx;
-static int ext4_mballoc_ready;
static struct ratelimit_state ext4_mount_msg_ratelimit;
static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
@@ -844,7 +843,6 @@ static void ext4_put_super(struct super_block *sb)
ext4_release_system_zone(sb);
ext4_mb_release(sb);
ext4_ext_release(sb);
- ext4_xattr_put_super(sb);
if (!(sb->s_flags & MS_RDONLY)) {
ext4_clear_feature_journal_needs_recovery(sb);
@@ -944,7 +942,6 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
spin_lock_init(&ei->i_completed_io_lock);
ei->i_sync_tid = 0;
ei->i_datasync_tid = 0;
- atomic_set(&ei->i_ioend_count, 0);
atomic_set(&ei->i_unwritten, 0);
INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
#ifdef CONFIG_EXT4_FS_ENCRYPTION
@@ -1132,6 +1129,7 @@ static const struct dquot_operations ext4_quota_operations = {
.alloc_dquot = dquot_alloc,
.destroy_dquot = dquot_destroy,
.get_projid = ext4_get_projid,
+ .get_next_id = dquot_get_next_id,
};
static const struct quotactl_ops ext4_qctl_operations = {
@@ -1141,7 +1139,8 @@ static const struct quotactl_ops ext4_qctl_operations = {
.get_state = dquot_get_state,
.set_info = dquot_set_dqinfo,
.get_dqblk = dquot_get_dqblk,
- .set_dqblk = dquot_set_dqblk
+ .set_dqblk = dquot_set_dqblk,
+ .get_nextdqblk = dquot_get_next_dqblk,
};
#endif
@@ -1425,9 +1424,9 @@ static const struct mount_opts {
{Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR},
{Opt_err_cont, EXT4_MOUNT_ERRORS_CONT, MOPT_SET | MOPT_CLEAR_ERR},
{Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT,
- MOPT_NO_EXT2 | MOPT_SET},
+ MOPT_NO_EXT2},
{Opt_data_err_ignore, EXT4_MOUNT_DATA_ERR_ABORT,
- MOPT_NO_EXT2 | MOPT_CLEAR},
+ MOPT_NO_EXT2},
{Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET},
{Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR},
{Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET},
@@ -1705,6 +1704,10 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
ext4_msg(sb, KERN_INFO, "dax option not supported");
return -1;
#endif
+ } else if (token == Opt_data_err_abort) {
+ sbi->s_mount_opt |= m->mount_opt;
+ } else if (token == Opt_data_err_ignore) {
+ sbi->s_mount_opt &= ~m->mount_opt;
} else {
if (!args->from)
arg = 1;
@@ -1914,6 +1917,8 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult);
if (nodefs || sbi->s_max_dir_size_kb)
SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb);
+ if (test_opt(sb, DATA_ERR_ABORT))
+ SEQ_OPTS_PUTS("data_err=abort");
ext4_show_quota_options(seq, sb);
return 0;
@@ -3796,12 +3801,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_journal->j_commit_callback = ext4_journal_commit_callback;
no_journal:
- if (ext4_mballoc_ready) {
- sbi->s_mb_cache = ext4_xattr_create_cache(sb->s_id);
- if (!sbi->s_mb_cache) {
- ext4_msg(sb, KERN_ERR, "Failed to create an mb_cache");
- goto failed_mount_wq;
- }
+ sbi->s_mb_cache = ext4_xattr_create_cache();
+ if (!sbi->s_mb_cache) {
+ ext4_msg(sb, KERN_ERR, "Failed to create an mb_cache");
+ goto failed_mount_wq;
}
if ((DUMMY_ENCRYPTION_ENABLED(sbi) || ext4_has_feature_encrypt(sb)) &&
@@ -4027,6 +4030,10 @@ failed_mount4:
if (EXT4_SB(sb)->rsv_conversion_wq)
destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
failed_mount_wq:
+ if (sbi->s_mb_cache) {
+ ext4_xattr_destroy_cache(sbi->s_mb_cache);
+ sbi->s_mb_cache = NULL;
+ }
if (sbi->s_journal) {
jbd2_journal_destroy(sbi->s_journal);
sbi->s_journal = NULL;
@@ -5321,7 +5328,6 @@ MODULE_ALIAS_FS("ext4");
/* Shared across all ext4 file systems */
wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
-struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
static int __init ext4_init_fs(void)
{
@@ -5334,10 +5340,8 @@ static int __init ext4_init_fs(void)
/* Build-time check for flags consistency */
ext4_check_flag_values();
- for (i = 0; i < EXT4_WQ_HASH_SZ; i++) {
- mutex_init(&ext4__aio_mutex[i]);
+ for (i = 0; i < EXT4_WQ_HASH_SZ; i++)
init_waitqueue_head(&ext4__ioend_wq[i]);
- }
err = ext4_init_es();
if (err)
@@ -5358,8 +5362,6 @@ static int __init ext4_init_fs(void)
err = ext4_init_mballoc();
if (err)
goto out2;
- else
- ext4_mballoc_ready = 1;
err = init_inodecache();
if (err)
goto out1;
@@ -5375,7 +5377,6 @@ out:
unregister_as_ext3();
destroy_inodecache();
out1:
- ext4_mballoc_ready = 0;
ext4_exit_mballoc();
out2:
ext4_exit_sysfs();
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index a95151e875bd..0441e055c8e8 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -545,30 +545,44 @@ static void
ext4_xattr_release_block(handle_t *handle, struct inode *inode,
struct buffer_head *bh)
{
- struct mb_cache_entry *ce = NULL;
- int error = 0;
struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
+ u32 hash, ref;
+ int error = 0;
- ce = mb_cache_entry_get(ext4_mb_cache, bh->b_bdev, bh->b_blocknr);
BUFFER_TRACE(bh, "get_write_access");
error = ext4_journal_get_write_access(handle, bh);
if (error)
goto out;
lock_buffer(bh);
- if (BHDR(bh)->h_refcount == cpu_to_le32(1)) {
+ hash = le32_to_cpu(BHDR(bh)->h_hash);
+ ref = le32_to_cpu(BHDR(bh)->h_refcount);
+ if (ref == 1) {
ea_bdebug(bh, "refcount now=0; freeing");
- if (ce)
- mb_cache_entry_free(ce);
+ /*
+ * This must happen under buffer lock for
+ * ext4_xattr_block_set() to reliably detect freed block
+ */
+ mb_cache_entry_delete_block(ext4_mb_cache, hash, bh->b_blocknr);
get_bh(bh);
unlock_buffer(bh);
ext4_free_blocks(handle, inode, bh, 0, 1,
EXT4_FREE_BLOCKS_METADATA |
EXT4_FREE_BLOCKS_FORGET);
} else {
- le32_add_cpu(&BHDR(bh)->h_refcount, -1);
- if (ce)
- mb_cache_entry_release(ce);
+ ref--;
+ BHDR(bh)->h_refcount = cpu_to_le32(ref);
+ if (ref == EXT4_XATTR_REFCOUNT_MAX - 1) {
+ struct mb_cache_entry *ce;
+
+ ce = mb_cache_entry_get(ext4_mb_cache, hash,
+ bh->b_blocknr);
+ if (ce) {
+ ce->e_reusable = 1;
+ mb_cache_entry_put(ext4_mb_cache, ce);
+ }
+ }
+
/*
* Beware of this ugliness: Releasing of xattr block references
* from different inodes can race and so we have to protect
@@ -790,8 +804,6 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
if (i->value && i->value_len > sb->s_blocksize)
return -ENOSPC;
if (s->base) {
- ce = mb_cache_entry_get(ext4_mb_cache, bs->bh->b_bdev,
- bs->bh->b_blocknr);
BUFFER_TRACE(bs->bh, "get_write_access");
error = ext4_journal_get_write_access(handle, bs->bh);
if (error)
@@ -799,10 +811,15 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
lock_buffer(bs->bh);
if (header(s->base)->h_refcount == cpu_to_le32(1)) {
- if (ce) {
- mb_cache_entry_free(ce);
- ce = NULL;
- }
+ __u32 hash = le32_to_cpu(BHDR(bs->bh)->h_hash);
+
+ /*
+ * This must happen under buffer lock for
+ * ext4_xattr_block_set() to reliably detect modified
+ * block
+ */
+ mb_cache_entry_delete_block(ext4_mb_cache, hash,
+ bs->bh->b_blocknr);
ea_bdebug(bs->bh, "modifying in-place");
error = ext4_xattr_set_entry(i, s);
if (!error) {
@@ -826,10 +843,6 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
int offset = (char *)s->here - bs->bh->b_data;
unlock_buffer(bs->bh);
- if (ce) {
- mb_cache_entry_release(ce);
- ce = NULL;
- }
ea_bdebug(bs->bh, "cloning");
s->base = kmalloc(bs->bh->b_size, GFP_NOFS);
error = -ENOMEM;
@@ -872,6 +885,8 @@ inserted:
if (new_bh == bs->bh)
ea_bdebug(new_bh, "keeping");
else {
+ u32 ref;
+
/* The old block is released after updating
the inode. */
error = dquot_alloc_block(inode,
@@ -884,9 +899,40 @@ inserted:
if (error)
goto cleanup_dquot;
lock_buffer(new_bh);
- le32_add_cpu(&BHDR(new_bh)->h_refcount, 1);
+ /*
+ * We have to be careful about races with
+ * freeing, rehashing or adding references to
+ * xattr block. Once we hold buffer lock xattr
+ * block's state is stable so we can check
+ * whether the block got freed / rehashed or
+ * not. Since we unhash mbcache entry under
+ * buffer lock when freeing / rehashing xattr
+ * block, checking whether entry is still
+ * hashed is reliable. Same rules hold for
+ * e_reusable handling.
+ */
+ if (hlist_bl_unhashed(&ce->e_hash_list) ||
+ !ce->e_reusable) {
+ /*
+ * Undo everything and check mbcache
+ * again.
+ */
+ unlock_buffer(new_bh);
+ dquot_free_block(inode,
+ EXT4_C2B(EXT4_SB(sb),
+ 1));
+ brelse(new_bh);
+ mb_cache_entry_put(ext4_mb_cache, ce);
+ ce = NULL;
+ new_bh = NULL;
+ goto inserted;
+ }
+ ref = le32_to_cpu(BHDR(new_bh)->h_refcount) + 1;
+ BHDR(new_bh)->h_refcount = cpu_to_le32(ref);
+ if (ref >= EXT4_XATTR_REFCOUNT_MAX)
+ ce->e_reusable = 0;
ea_bdebug(new_bh, "reusing; refcount now=%d",
- le32_to_cpu(BHDR(new_bh)->h_refcount));
+ ref);
unlock_buffer(new_bh);
error = ext4_handle_dirty_xattr_block(handle,
inode,
@@ -894,7 +940,8 @@ inserted:
if (error)
goto cleanup_dquot;
}
- mb_cache_entry_release(ce);
+ mb_cache_entry_touch(ext4_mb_cache, ce);
+ mb_cache_entry_put(ext4_mb_cache, ce);
ce = NULL;
} else if (bs->bh && s->base == bs->bh->b_data) {
/* We were modifying this block in-place. */
@@ -959,7 +1006,7 @@ getblk_failed:
cleanup:
if (ce)
- mb_cache_entry_release(ce);
+ mb_cache_entry_put(ext4_mb_cache, ce);
brelse(new_bh);
if (!(bs->bh && s->base == bs->bh->b_data))
kfree(s->base);
@@ -1070,6 +1117,17 @@ static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
return 0;
}
+static int ext4_xattr_value_same(struct ext4_xattr_search *s,
+ struct ext4_xattr_info *i)
+{
+ void *value;
+
+ if (le32_to_cpu(s->here->e_value_size) != i->value_len)
+ return 0;
+ value = ((void *)s->base) + le16_to_cpu(s->here->e_value_offs);
+ return !memcmp(value, i->value, i->value_len);
+}
+
/*
* ext4_xattr_set_handle()
*
@@ -1146,6 +1204,13 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
else if (!bs.s.not_found)
error = ext4_xattr_block_set(handle, inode, &i, &bs);
} else {
+ error = 0;
+ /* Xattr value did not change? Save us some work and bail out */
+ if (!is.s.not_found && ext4_xattr_value_same(&is.s, &i))
+ goto cleanup;
+ if (!bs.s.not_found && ext4_xattr_value_same(&bs.s, &i))
+ goto cleanup;
+
error = ext4_xattr_ibody_set(handle, inode, &i, &is);
if (!error && !bs.s.not_found) {
i.value = NULL;
@@ -1512,17 +1577,6 @@ cleanup:
}
/*
- * ext4_xattr_put_super()
- *
- * This is called when a file system is unmounted.
- */
-void
-ext4_xattr_put_super(struct super_block *sb)
-{
- mb_cache_shrink(sb->s_bdev);
-}
-
-/*
* ext4_xattr_cache_insert()
*
* Create a new entry in the extended attribute cache, and insert
@@ -1533,26 +1587,19 @@ ext4_xattr_put_super(struct super_block *sb)
static void
ext4_xattr_cache_insert(struct mb_cache *ext4_mb_cache, struct buffer_head *bh)
{
- __u32 hash = le32_to_cpu(BHDR(bh)->h_hash);
- struct mb_cache_entry *ce;
+ struct ext4_xattr_header *header = BHDR(bh);
+ __u32 hash = le32_to_cpu(header->h_hash);
+ int reusable = le32_to_cpu(header->h_refcount) <
+ EXT4_XATTR_REFCOUNT_MAX;
int error;
- ce = mb_cache_entry_alloc(ext4_mb_cache, GFP_NOFS);
- if (!ce) {
- ea_bdebug(bh, "out of memory");
- return;
- }
- error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash);
+ error = mb_cache_entry_create(ext4_mb_cache, GFP_NOFS, hash,
+ bh->b_blocknr, reusable);
if (error) {
- mb_cache_entry_free(ce);
- if (error == -EBUSY) {
+ if (error == -EBUSY)
ea_bdebug(bh, "already in cache");
- error = 0;
- }
- } else {
+ } else
ea_bdebug(bh, "inserting [%x]", (int)hash);
- mb_cache_entry_release(ce);
- }
}
/*
@@ -1614,33 +1661,20 @@ ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header,
if (!header->h_hash)
return NULL; /* never share */
ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
-again:
- ce = mb_cache_entry_find_first(ext4_mb_cache, inode->i_sb->s_bdev,
- hash);
+ ce = mb_cache_entry_find_first(ext4_mb_cache, hash);
while (ce) {
struct buffer_head *bh;
- if (IS_ERR(ce)) {
- if (PTR_ERR(ce) == -EAGAIN)
- goto again;
- break;
- }
bh = sb_bread(inode->i_sb, ce->e_block);
if (!bh) {
EXT4_ERROR_INODE(inode, "block %lu read error",
(unsigned long) ce->e_block);
- } else if (le32_to_cpu(BHDR(bh)->h_refcount) >=
- EXT4_XATTR_REFCOUNT_MAX) {
- ea_idebug(inode, "block %lu refcount %d>=%d",
- (unsigned long) ce->e_block,
- le32_to_cpu(BHDR(bh)->h_refcount),
- EXT4_XATTR_REFCOUNT_MAX);
} else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) {
*pce = ce;
return bh;
}
brelse(bh);
- ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash);
+ ce = mb_cache_entry_find_next(ext4_mb_cache, ce);
}
return NULL;
}
@@ -1716,9 +1750,9 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *header,
#define HASH_BUCKET_BITS 10
struct mb_cache *
-ext4_xattr_create_cache(char *name)
+ext4_xattr_create_cache(void)
{
- return mb_cache_create(name, HASH_BUCKET_BITS);
+ return mb_cache_create(HASH_BUCKET_BITS);
}
void ext4_xattr_destroy_cache(struct mb_cache *cache)
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index ddc0957760ba..69dd3e6566e0 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -108,7 +108,6 @@ extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_
extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int);
extern void ext4_xattr_delete_inode(handle_t *, struct inode *);
-extern void ext4_xattr_put_super(struct super_block *);
extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
struct ext4_inode *raw_inode, handle_t *handle);
@@ -124,7 +123,7 @@ extern int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
struct ext4_xattr_info *i,
struct ext4_xattr_ibody_find *is);
-extern struct mb_cache *ext4_xattr_create_cache(char *name);
+extern struct mb_cache *ext4_xattr_create_cache(void);
extern void ext4_xattr_destroy_cache(struct mb_cache *);
#ifdef CONFIG_EXT4_FS_SECURITY
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
index b0a9dc929f88..1f8982a957f1 100644
--- a/fs/f2fs/Kconfig
+++ b/fs/f2fs/Kconfig
@@ -1,6 +1,8 @@
config F2FS_FS
tristate "F2FS filesystem support"
depends on BLOCK
+ select CRYPTO
+ select CRYPTO_CRC32
help
F2FS is based on Log-structured File System (LFS), which supports
versatile "flash-friendly" features. The design has been focused on
@@ -76,15 +78,7 @@ config F2FS_FS_ENCRYPTION
bool "F2FS Encryption"
depends on F2FS_FS
depends on F2FS_FS_XATTR
- select CRYPTO_AES
- select CRYPTO_CBC
- select CRYPTO_ECB
- select CRYPTO_XTS
- select CRYPTO_CTS
- select CRYPTO_CTR
- select CRYPTO_SHA256
- select KEYS
- select ENCRYPTED_KEYS
+ select FS_ENCRYPTION
help
Enable encryption of f2fs files and directories. This
feature is similar to ecryptfs, but it is more memory
diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile
index 08e101ed914c..ca949ea7c02f 100644
--- a/fs/f2fs/Makefile
+++ b/fs/f2fs/Makefile
@@ -7,5 +7,3 @@ f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o
f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o
f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o
f2fs-$(CONFIG_F2FS_IO_TRACE) += trace.o
-f2fs-$(CONFIG_F2FS_FS_ENCRYPTION) += crypto_policy.o crypto.o \
- crypto_key.o crypto_fname.o
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 3842af954cd5..0955312e5ca0 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -39,7 +39,7 @@ repeat:
cond_resched();
goto repeat;
}
- f2fs_wait_on_page_writeback(page, META);
+ f2fs_wait_on_page_writeback(page, META, true);
SetPageUptodate(page);
return page;
}
@@ -56,7 +56,8 @@ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index,
.sbi = sbi,
.type = META,
.rw = READ_SYNC | REQ_META | REQ_PRIO,
- .blk_addr = index,
+ .old_blkaddr = index,
+ .new_blkaddr = index,
.encrypted_page = NULL,
};
@@ -143,7 +144,6 @@ bool is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type)
int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages,
int type, bool sync)
{
- block_t prev_blk_addr = 0;
struct page *page;
block_t blkno = start;
struct f2fs_io_info fio = {
@@ -152,10 +152,12 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages,
.rw = sync ? (READ_SYNC | REQ_META | REQ_PRIO) : READA,
.encrypted_page = NULL,
};
+ struct blk_plug plug;
if (unlikely(type == META_POR))
fio.rw &= ~REQ_META;
+ blk_start_plug(&plug);
for (; nrpages-- > 0; blkno++) {
if (!is_valid_blkaddr(sbi, blkno, type))
@@ -167,27 +169,24 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages,
NAT_BLOCK_OFFSET(NM_I(sbi)->max_nid)))
blkno = 0;
/* get nat block addr */
- fio.blk_addr = current_nat_addr(sbi,
+ fio.new_blkaddr = current_nat_addr(sbi,
blkno * NAT_ENTRY_PER_BLOCK);
break;
case META_SIT:
/* get sit block addr */
- fio.blk_addr = current_sit_addr(sbi,
+ fio.new_blkaddr = current_sit_addr(sbi,
blkno * SIT_ENTRY_PER_BLOCK);
- if (blkno != start && prev_blk_addr + 1 != fio.blk_addr)
- goto out;
- prev_blk_addr = fio.blk_addr;
break;
case META_SSA:
case META_CP:
case META_POR:
- fio.blk_addr = blkno;
+ fio.new_blkaddr = blkno;
break;
default:
BUG();
}
- page = grab_cache_page(META_MAPPING(sbi), fio.blk_addr);
+ page = grab_cache_page(META_MAPPING(sbi), fio.new_blkaddr);
if (!page)
continue;
if (PageUptodate(page)) {
@@ -196,11 +195,13 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages,
}
fio.page = page;
+ fio.old_blkaddr = fio.new_blkaddr;
f2fs_submit_page_mbio(&fio);
f2fs_put_page(page, 0);
}
out:
f2fs_submit_merged_bio(sbi, META, READ);
+ blk_finish_plug(&plug);
return blkno - start;
}
@@ -232,13 +233,17 @@ static int f2fs_write_meta_page(struct page *page,
if (unlikely(f2fs_cp_error(sbi)))
goto redirty_out;
- f2fs_wait_on_page_writeback(page, META);
write_meta_page(sbi, page);
dec_page_count(sbi, F2FS_DIRTY_META);
+
+ if (wbc->for_reclaim)
+ f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, META, WRITE);
+
unlock_page(page);
- if (wbc->for_reclaim || unlikely(f2fs_cp_error(sbi)))
+ if (unlikely(f2fs_cp_error(sbi)))
f2fs_submit_merged_bio(sbi, META, WRITE);
+
return 0;
redirty_out:
@@ -252,13 +257,13 @@ static int f2fs_write_meta_pages(struct address_space *mapping,
struct f2fs_sb_info *sbi = F2FS_M_SB(mapping);
long diff, written;
- trace_f2fs_writepages(mapping->host, wbc, META);
-
/* collect a number of dirty meta pages and write together */
if (wbc->for_kupdate ||
get_pages(sbi, F2FS_DIRTY_META) < nr_pages_to_skip(sbi, META))
goto skip_write;
+ trace_f2fs_writepages(mapping->host, wbc, META);
+
/* if mounting is failed, skip writing node pages */
mutex_lock(&sbi->cp_mutex);
diff = nr_pages_to_write(sbi, META, wbc);
@@ -269,6 +274,7 @@ static int f2fs_write_meta_pages(struct address_space *mapping,
skip_write:
wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_META);
+ trace_f2fs_writepages(mapping->host, wbc, META);
return 0;
}
@@ -276,15 +282,18 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
long nr_to_write)
{
struct address_space *mapping = META_MAPPING(sbi);
- pgoff_t index = 0, end = LONG_MAX, prev = LONG_MAX;
+ pgoff_t index = 0, end = ULONG_MAX, prev = ULONG_MAX;
struct pagevec pvec;
long nwritten = 0;
struct writeback_control wbc = {
.for_reclaim = 0,
};
+ struct blk_plug plug;
pagevec_init(&pvec, 0);
+ blk_start_plug(&plug);
+
while (index <= end) {
int i, nr_pages;
nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
@@ -296,7 +305,7 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
- if (prev == LONG_MAX)
+ if (prev == ULONG_MAX)
prev = page->index - 1;
if (nr_to_write != LONG_MAX && page->index != prev + 1) {
pagevec_release(&pvec);
@@ -315,6 +324,9 @@ continue_unlock:
goto continue_unlock;
}
+ f2fs_wait_on_page_writeback(page, META, true);
+
+ BUG_ON(PageWriteback(page));
if (!clear_page_dirty_for_io(page))
goto continue_unlock;
@@ -334,6 +346,8 @@ stop:
if (nwritten)
f2fs_submit_merged_bio(sbi, type, WRITE);
+ blk_finish_plug(&plug);
+
return nwritten;
}
@@ -621,7 +635,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
goto invalid_cp1;
crc = le32_to_cpu(*((__le32 *)((unsigned char *)cp_block + crc_offset)));
- if (!f2fs_crc_valid(crc, cp_block, crc_offset))
+ if (!f2fs_crc_valid(sbi, crc, cp_block, crc_offset))
goto invalid_cp1;
pre_version = cur_cp_version(cp_block);
@@ -636,7 +650,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
goto invalid_cp2;
crc = le32_to_cpu(*((__le32 *)((unsigned char *)cp_block + crc_offset)));
- if (!f2fs_crc_valid(crc, cp_block, crc_offset))
+ if (!f2fs_crc_valid(sbi, crc, cp_block, crc_offset))
goto invalid_cp2;
cur_version = cur_cp_version(cp_block);
@@ -696,6 +710,10 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi)
cp_block = (struct f2fs_checkpoint *)page_address(cur_page);
memcpy(sbi->ckpt, cp_block, blk_size);
+ /* Sanity checking of checkpoint */
+ if (sanity_check_ckpt(sbi))
+ goto fail_no_cp;
+
if (cp_blks <= 1)
goto done;
@@ -902,7 +920,7 @@ static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi)
if (!get_pages(sbi, F2FS_WRITEBACK))
break;
- io_schedule();
+ io_schedule_timeout(5*HZ);
}
finish_wait(&sbi->cp_wait, &wait);
}
@@ -921,6 +939,9 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
int cp_payload_blks = __cp_payload(sbi);
block_t discard_blk = NEXT_FREE_BLKADDR(sbi, curseg);
bool invalidate = false;
+ struct super_block *sb = sbi->sb;
+ struct curseg_info *seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE);
+ u64 kbytes_written;
/*
* This avoids to conduct wrong roll-forward operations and uses
@@ -1008,7 +1029,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
get_sit_bitmap(sbi, __bitmap_ptr(sbi, SIT_BITMAP));
get_nat_bitmap(sbi, __bitmap_ptr(sbi, NAT_BITMAP));
- crc32 = f2fs_crc32(ckpt, le32_to_cpu(ckpt->checksum_offset));
+ crc32 = f2fs_crc32(sbi, ckpt, le32_to_cpu(ckpt->checksum_offset));
*((__le32 *)((unsigned char *)ckpt +
le32_to_cpu(ckpt->checksum_offset)))
= cpu_to_le32(crc32);
@@ -1034,6 +1055,14 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
write_data_summaries(sbi, start_blk);
start_blk += data_sum_blocks;
+
+ /* Record write statistics in the hot node summary */
+ kbytes_written = sbi->kbytes_written;
+ if (sb->s_bdev->bd_part)
+ kbytes_written += BD_PART_WRITTEN(sbi);
+
+ seg_i->journal->info.kbytes_written = cpu_to_le64(kbytes_written);
+
if (__remain_node_summaries(cpc->reason)) {
write_node_summaries(sbi, start_blk);
start_blk += NR_CURSEG_NODE_TYPE;
@@ -1048,8 +1077,8 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
if (unlikely(f2fs_cp_error(sbi)))
return -EIO;
- filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LONG_MAX);
- filemap_fdatawait_range(META_MAPPING(sbi), 0, LONG_MAX);
+ filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LLONG_MAX);
+ filemap_fdatawait_range(META_MAPPING(sbi), 0, LLONG_MAX);
/* update user_block_counts */
sbi->last_valid_block_count = sbi->total_valid_block_count;
@@ -1112,9 +1141,7 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish block_ops");
- f2fs_submit_merged_bio(sbi, DATA, WRITE);
- f2fs_submit_merged_bio(sbi, NODE, WRITE);
- f2fs_submit_merged_bio(sbi, META, WRITE);
+ f2fs_flush_merged_bios(sbi);
/*
* update checkpoint pack index
diff --git a/fs/f2fs/crypto.c b/fs/f2fs/crypto.c
deleted file mode 100644
index 4a62ef14e932..000000000000
--- a/fs/f2fs/crypto.c
+++ /dev/null
@@ -1,491 +0,0 @@
-/*
- * linux/fs/f2fs/crypto.c
- *
- * Copied from linux/fs/ext4/crypto.c
- *
- * Copyright (C) 2015, Google, Inc.
- * Copyright (C) 2015, Motorola Mobility
- *
- * This contains encryption functions for f2fs
- *
- * Written by Michael Halcrow, 2014.
- *
- * Filename encryption additions
- * Uday Savagaonkar, 2014
- * Encryption policy handling additions
- * Ildar Muslukhov, 2014
- * Remove ext4_encrypted_zeroout(),
- * add f2fs_restore_and_release_control_page()
- * Jaegeuk Kim, 2015.
- *
- * This has not yet undergone a rigorous security audit.
- *
- * The usage of AES-XTS should conform to recommendations in NIST
- * Special Publication 800-38E and IEEE P1619/D16.
- */
-#include <crypto/hash.h>
-#include <crypto/sha.h>
-#include <keys/user-type.h>
-#include <keys/encrypted-type.h>
-#include <linux/crypto.h>
-#include <linux/ecryptfs.h>
-#include <linux/gfp.h>
-#include <linux/kernel.h>
-#include <linux/key.h>
-#include <linux/list.h>
-#include <linux/mempool.h>
-#include <linux/module.h>
-#include <linux/mutex.h>
-#include <linux/random.h>
-#include <linux/scatterlist.h>
-#include <linux/spinlock_types.h>
-#include <linux/f2fs_fs.h>
-#include <linux/ratelimit.h>
-#include <linux/bio.h>
-
-#include "f2fs.h"
-#include "xattr.h"
-
-/* Encryption added and removed here! (L: */
-
-static unsigned int num_prealloc_crypto_pages = 32;
-static unsigned int num_prealloc_crypto_ctxs = 128;
-
-module_param(num_prealloc_crypto_pages, uint, 0444);
-MODULE_PARM_DESC(num_prealloc_crypto_pages,
- "Number of crypto pages to preallocate");
-module_param(num_prealloc_crypto_ctxs, uint, 0444);
-MODULE_PARM_DESC(num_prealloc_crypto_ctxs,
- "Number of crypto contexts to preallocate");
-
-static mempool_t *f2fs_bounce_page_pool;
-
-static LIST_HEAD(f2fs_free_crypto_ctxs);
-static DEFINE_SPINLOCK(f2fs_crypto_ctx_lock);
-
-static struct workqueue_struct *f2fs_read_workqueue;
-static DEFINE_MUTEX(crypto_init);
-
-static struct kmem_cache *f2fs_crypto_ctx_cachep;
-struct kmem_cache *f2fs_crypt_info_cachep;
-
-/**
- * f2fs_release_crypto_ctx() - Releases an encryption context
- * @ctx: The encryption context to release.
- *
- * If the encryption context was allocated from the pre-allocated pool, returns
- * it to that pool. Else, frees it.
- *
- * If there's a bounce page in the context, this frees that.
- */
-void f2fs_release_crypto_ctx(struct f2fs_crypto_ctx *ctx)
-{
- unsigned long flags;
-
- if (ctx->flags & F2FS_WRITE_PATH_FL && ctx->w.bounce_page) {
- mempool_free(ctx->w.bounce_page, f2fs_bounce_page_pool);
- ctx->w.bounce_page = NULL;
- }
- ctx->w.control_page = NULL;
- if (ctx->flags & F2FS_CTX_REQUIRES_FREE_ENCRYPT_FL) {
- kmem_cache_free(f2fs_crypto_ctx_cachep, ctx);
- } else {
- spin_lock_irqsave(&f2fs_crypto_ctx_lock, flags);
- list_add(&ctx->free_list, &f2fs_free_crypto_ctxs);
- spin_unlock_irqrestore(&f2fs_crypto_ctx_lock, flags);
- }
-}
-
-/**
- * f2fs_get_crypto_ctx() - Gets an encryption context
- * @inode: The inode for which we are doing the crypto
- *
- * Allocates and initializes an encryption context.
- *
- * Return: An allocated and initialized encryption context on success; error
- * value or NULL otherwise.
- */
-struct f2fs_crypto_ctx *f2fs_get_crypto_ctx(struct inode *inode)
-{
- struct f2fs_crypto_ctx *ctx = NULL;
- unsigned long flags;
- struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info;
-
- if (ci == NULL)
- return ERR_PTR(-ENOKEY);
-
- /*
- * We first try getting the ctx from a free list because in
- * the common case the ctx will have an allocated and
- * initialized crypto tfm, so it's probably a worthwhile
- * optimization. For the bounce page, we first try getting it
- * from the kernel allocator because that's just about as fast
- * as getting it from a list and because a cache of free pages
- * should generally be a "last resort" option for a filesystem
- * to be able to do its job.
- */
- spin_lock_irqsave(&f2fs_crypto_ctx_lock, flags);
- ctx = list_first_entry_or_null(&f2fs_free_crypto_ctxs,
- struct f2fs_crypto_ctx, free_list);
- if (ctx)
- list_del(&ctx->free_list);
- spin_unlock_irqrestore(&f2fs_crypto_ctx_lock, flags);
- if (!ctx) {
- ctx = kmem_cache_zalloc(f2fs_crypto_ctx_cachep, GFP_NOFS);
- if (!ctx)
- return ERR_PTR(-ENOMEM);
- ctx->flags |= F2FS_CTX_REQUIRES_FREE_ENCRYPT_FL;
- } else {
- ctx->flags &= ~F2FS_CTX_REQUIRES_FREE_ENCRYPT_FL;
- }
- ctx->flags &= ~F2FS_WRITE_PATH_FL;
- return ctx;
-}
-
-/*
- * Call f2fs_decrypt on every single page, reusing the encryption
- * context.
- */
-static void completion_pages(struct work_struct *work)
-{
- struct f2fs_crypto_ctx *ctx =
- container_of(work, struct f2fs_crypto_ctx, r.work);
- struct bio *bio = ctx->r.bio;
- struct bio_vec *bv;
- int i;
-
- bio_for_each_segment_all(bv, bio, i) {
- struct page *page = bv->bv_page;
- int ret = f2fs_decrypt(ctx, page);
-
- if (ret) {
- WARN_ON_ONCE(1);
- SetPageError(page);
- } else
- SetPageUptodate(page);
- unlock_page(page);
- }
- f2fs_release_crypto_ctx(ctx);
- bio_put(bio);
-}
-
-void f2fs_end_io_crypto_work(struct f2fs_crypto_ctx *ctx, struct bio *bio)
-{
- INIT_WORK(&ctx->r.work, completion_pages);
- ctx->r.bio = bio;
- queue_work(f2fs_read_workqueue, &ctx->r.work);
-}
-
-static void f2fs_crypto_destroy(void)
-{
- struct f2fs_crypto_ctx *pos, *n;
-
- list_for_each_entry_safe(pos, n, &f2fs_free_crypto_ctxs, free_list)
- kmem_cache_free(f2fs_crypto_ctx_cachep, pos);
- INIT_LIST_HEAD(&f2fs_free_crypto_ctxs);
- if (f2fs_bounce_page_pool)
- mempool_destroy(f2fs_bounce_page_pool);
- f2fs_bounce_page_pool = NULL;
-}
-
-/**
- * f2fs_crypto_initialize() - Set up for f2fs encryption.
- *
- * We only call this when we start accessing encrypted files, since it
- * results in memory getting allocated that wouldn't otherwise be used.
- *
- * Return: Zero on success, non-zero otherwise.
- */
-int f2fs_crypto_initialize(void)
-{
- int i, res = -ENOMEM;
-
- if (f2fs_bounce_page_pool)
- return 0;
-
- mutex_lock(&crypto_init);
- if (f2fs_bounce_page_pool)
- goto already_initialized;
-
- for (i = 0; i < num_prealloc_crypto_ctxs; i++) {
- struct f2fs_crypto_ctx *ctx;
-
- ctx = kmem_cache_zalloc(f2fs_crypto_ctx_cachep, GFP_KERNEL);
- if (!ctx)
- goto fail;
- list_add(&ctx->free_list, &f2fs_free_crypto_ctxs);
- }
-
- /* must be allocated at the last step to avoid race condition above */
- f2fs_bounce_page_pool =
- mempool_create_page_pool(num_prealloc_crypto_pages, 0);
- if (!f2fs_bounce_page_pool)
- goto fail;
-
-already_initialized:
- mutex_unlock(&crypto_init);
- return 0;
-fail:
- f2fs_crypto_destroy();
- mutex_unlock(&crypto_init);
- return res;
-}
-
-/**
- * f2fs_exit_crypto() - Shutdown the f2fs encryption system
- */
-void f2fs_exit_crypto(void)
-{
- f2fs_crypto_destroy();
-
- if (f2fs_read_workqueue)
- destroy_workqueue(f2fs_read_workqueue);
- if (f2fs_crypto_ctx_cachep)
- kmem_cache_destroy(f2fs_crypto_ctx_cachep);
- if (f2fs_crypt_info_cachep)
- kmem_cache_destroy(f2fs_crypt_info_cachep);
-}
-
-int __init f2fs_init_crypto(void)
-{
- int res = -ENOMEM;
-
- f2fs_read_workqueue = alloc_workqueue("f2fs_crypto", WQ_HIGHPRI, 0);
- if (!f2fs_read_workqueue)
- goto fail;
-
- f2fs_crypto_ctx_cachep = KMEM_CACHE(f2fs_crypto_ctx,
- SLAB_RECLAIM_ACCOUNT);
- if (!f2fs_crypto_ctx_cachep)
- goto fail;
-
- f2fs_crypt_info_cachep = KMEM_CACHE(f2fs_crypt_info,
- SLAB_RECLAIM_ACCOUNT);
- if (!f2fs_crypt_info_cachep)
- goto fail;
-
- return 0;
-fail:
- f2fs_exit_crypto();
- return res;
-}
-
-void f2fs_restore_and_release_control_page(struct page **page)
-{
- struct f2fs_crypto_ctx *ctx;
- struct page *bounce_page;
-
- /* The bounce data pages are unmapped. */
- if ((*page)->mapping)
- return;
-
- /* The bounce data page is unmapped. */
- bounce_page = *page;
- ctx = (struct f2fs_crypto_ctx *)page_private(bounce_page);
-
- /* restore control page */
- *page = ctx->w.control_page;
-
- f2fs_restore_control_page(bounce_page);
-}
-
-void f2fs_restore_control_page(struct page *data_page)
-{
- struct f2fs_crypto_ctx *ctx =
- (struct f2fs_crypto_ctx *)page_private(data_page);
-
- set_page_private(data_page, (unsigned long)NULL);
- ClearPagePrivate(data_page);
- unlock_page(data_page);
- f2fs_release_crypto_ctx(ctx);
-}
-
-/**
- * f2fs_crypt_complete() - The completion callback for page encryption
- * @req: The asynchronous encryption request context
- * @res: The result of the encryption operation
- */
-static void f2fs_crypt_complete(struct crypto_async_request *req, int res)
-{
- struct f2fs_completion_result *ecr = req->data;
-
- if (res == -EINPROGRESS)
- return;
- ecr->res = res;
- complete(&ecr->completion);
-}
-
-typedef enum {
- F2FS_DECRYPT = 0,
- F2FS_ENCRYPT,
-} f2fs_direction_t;
-
-static int f2fs_page_crypto(struct f2fs_crypto_ctx *ctx,
- struct inode *inode,
- f2fs_direction_t rw,
- pgoff_t index,
- struct page *src_page,
- struct page *dest_page)
-{
- u8 xts_tweak[F2FS_XTS_TWEAK_SIZE];
- struct ablkcipher_request *req = NULL;
- DECLARE_F2FS_COMPLETION_RESULT(ecr);
- struct scatterlist dst, src;
- struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info;
- struct crypto_ablkcipher *tfm = ci->ci_ctfm;
- int res = 0;
-
- req = ablkcipher_request_alloc(tfm, GFP_NOFS);
- if (!req) {
- printk_ratelimited(KERN_ERR
- "%s: crypto_request_alloc() failed\n",
- __func__);
- return -ENOMEM;
- }
- ablkcipher_request_set_callback(
- req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
- f2fs_crypt_complete, &ecr);
-
- BUILD_BUG_ON(F2FS_XTS_TWEAK_SIZE < sizeof(index));
- memcpy(xts_tweak, &index, sizeof(index));
- memset(&xts_tweak[sizeof(index)], 0,
- F2FS_XTS_TWEAK_SIZE - sizeof(index));
-
- sg_init_table(&dst, 1);
- sg_set_page(&dst, dest_page, PAGE_CACHE_SIZE, 0);
- sg_init_table(&src, 1);
- sg_set_page(&src, src_page, PAGE_CACHE_SIZE, 0);
- ablkcipher_request_set_crypt(req, &src, &dst, PAGE_CACHE_SIZE,
- xts_tweak);
- if (rw == F2FS_DECRYPT)
- res = crypto_ablkcipher_decrypt(req);
- else
- res = crypto_ablkcipher_encrypt(req);
- if (res == -EINPROGRESS || res == -EBUSY) {
- BUG_ON(req->base.data != &ecr);
- wait_for_completion(&ecr.completion);
- res = ecr.res;
- }
- ablkcipher_request_free(req);
- if (res) {
- printk_ratelimited(KERN_ERR
- "%s: crypto_ablkcipher_encrypt() returned %d\n",
- __func__, res);
- return res;
- }
- return 0;
-}
-
-static struct page *alloc_bounce_page(struct f2fs_crypto_ctx *ctx)
-{
- ctx->w.bounce_page = mempool_alloc(f2fs_bounce_page_pool, GFP_NOWAIT);
- if (ctx->w.bounce_page == NULL)
- return ERR_PTR(-ENOMEM);
- ctx->flags |= F2FS_WRITE_PATH_FL;
- return ctx->w.bounce_page;
-}
-
-/**
- * f2fs_encrypt() - Encrypts a page
- * @inode: The inode for which the encryption should take place
- * @plaintext_page: The page to encrypt. Must be locked.
- *
- * Allocates a ciphertext page and encrypts plaintext_page into it using the ctx
- * encryption context.
- *
- * Called on the page write path. The caller must call
- * f2fs_restore_control_page() on the returned ciphertext page to
- * release the bounce buffer and the encryption context.
- *
- * Return: An allocated page with the encrypted content on success. Else, an
- * error value or NULL.
- */
-struct page *f2fs_encrypt(struct inode *inode,
- struct page *plaintext_page)
-{
- struct f2fs_crypto_ctx *ctx;
- struct page *ciphertext_page = NULL;
- int err;
-
- BUG_ON(!PageLocked(plaintext_page));
-
- ctx = f2fs_get_crypto_ctx(inode);
- if (IS_ERR(ctx))
- return (struct page *)ctx;
-
- /* The encryption operation will require a bounce page. */
- ciphertext_page = alloc_bounce_page(ctx);
- if (IS_ERR(ciphertext_page))
- goto err_out;
-
- ctx->w.control_page = plaintext_page;
- err = f2fs_page_crypto(ctx, inode, F2FS_ENCRYPT, plaintext_page->index,
- plaintext_page, ciphertext_page);
- if (err) {
- ciphertext_page = ERR_PTR(err);
- goto err_out;
- }
-
- SetPagePrivate(ciphertext_page);
- set_page_private(ciphertext_page, (unsigned long)ctx);
- lock_page(ciphertext_page);
- return ciphertext_page;
-
-err_out:
- f2fs_release_crypto_ctx(ctx);
- return ciphertext_page;
-}
-
-/**
- * f2fs_decrypt() - Decrypts a page in-place
- * @ctx: The encryption context.
- * @page: The page to decrypt. Must be locked.
- *
- * Decrypts page in-place using the ctx encryption context.
- *
- * Called from the read completion callback.
- *
- * Return: Zero on success, non-zero otherwise.
- */
-int f2fs_decrypt(struct f2fs_crypto_ctx *ctx, struct page *page)
-{
- BUG_ON(!PageLocked(page));
-
- return f2fs_page_crypto(ctx, page->mapping->host,
- F2FS_DECRYPT, page->index, page, page);
-}
-
-/*
- * Convenience function which takes care of allocating and
- * deallocating the encryption context
- */
-int f2fs_decrypt_one(struct inode *inode, struct page *page)
-{
- struct f2fs_crypto_ctx *ctx = f2fs_get_crypto_ctx(inode);
- int ret;
-
- if (IS_ERR(ctx))
- return PTR_ERR(ctx);
- ret = f2fs_decrypt(ctx, page);
- f2fs_release_crypto_ctx(ctx);
- return ret;
-}
-
-bool f2fs_valid_contents_enc_mode(uint32_t mode)
-{
- return (mode == F2FS_ENCRYPTION_MODE_AES_256_XTS);
-}
-
-/**
- * f2fs_validate_encryption_key_size() - Validate the encryption key size
- * @mode: The key mode.
- * @size: The key size to validate.
- *
- * Return: The validated key size for @mode. Zero if invalid.
- */
-uint32_t f2fs_validate_encryption_key_size(uint32_t mode, uint32_t size)
-{
- if (size == f2fs_encryption_key_size(mode))
- return size;
- return 0;
-}
diff --git a/fs/f2fs/crypto_key.c b/fs/f2fs/crypto_key.c
deleted file mode 100644
index 5de2d866a25c..000000000000
--- a/fs/f2fs/crypto_key.c
+++ /dev/null
@@ -1,254 +0,0 @@
-/*
- * linux/fs/f2fs/crypto_key.c
- *
- * Copied from linux/fs/f2fs/crypto_key.c
- *
- * Copyright (C) 2015, Google, Inc.
- *
- * This contains encryption key functions for f2fs
- *
- * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015.
- */
-#include <keys/encrypted-type.h>
-#include <keys/user-type.h>
-#include <linux/random.h>
-#include <linux/scatterlist.h>
-#include <uapi/linux/keyctl.h>
-#include <crypto/hash.h>
-#include <linux/f2fs_fs.h>
-
-#include "f2fs.h"
-#include "xattr.h"
-
-static void derive_crypt_complete(struct crypto_async_request *req, int rc)
-{
- struct f2fs_completion_result *ecr = req->data;
-
- if (rc == -EINPROGRESS)
- return;
-
- ecr->res = rc;
- complete(&ecr->completion);
-}
-
-/**
- * f2fs_derive_key_aes() - Derive a key using AES-128-ECB
- * @deriving_key: Encryption key used for derivatio.
- * @source_key: Source key to which to apply derivation.
- * @derived_key: Derived key.
- *
- * Return: Zero on success; non-zero otherwise.
- */
-static int f2fs_derive_key_aes(char deriving_key[F2FS_AES_128_ECB_KEY_SIZE],
- char source_key[F2FS_AES_256_XTS_KEY_SIZE],
- char derived_key[F2FS_AES_256_XTS_KEY_SIZE])
-{
- int res = 0;
- struct ablkcipher_request *req = NULL;
- DECLARE_F2FS_COMPLETION_RESULT(ecr);
- struct scatterlist src_sg, dst_sg;
- struct crypto_ablkcipher *tfm = crypto_alloc_ablkcipher("ecb(aes)", 0,
- 0);
-
- if (IS_ERR(tfm)) {
- res = PTR_ERR(tfm);
- tfm = NULL;
- goto out;
- }
- crypto_ablkcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY);
- req = ablkcipher_request_alloc(tfm, GFP_NOFS);
- if (!req) {
- res = -ENOMEM;
- goto out;
- }
- ablkcipher_request_set_callback(req,
- CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
- derive_crypt_complete, &ecr);
- res = crypto_ablkcipher_setkey(tfm, deriving_key,
- F2FS_AES_128_ECB_KEY_SIZE);
- if (res < 0)
- goto out;
-
- sg_init_one(&src_sg, source_key, F2FS_AES_256_XTS_KEY_SIZE);
- sg_init_one(&dst_sg, derived_key, F2FS_AES_256_XTS_KEY_SIZE);
- ablkcipher_request_set_crypt(req, &src_sg, &dst_sg,
- F2FS_AES_256_XTS_KEY_SIZE, NULL);
- res = crypto_ablkcipher_encrypt(req);
- if (res == -EINPROGRESS || res == -EBUSY) {
- BUG_ON(req->base.data != &ecr);
- wait_for_completion(&ecr.completion);
- res = ecr.res;
- }
-out:
- if (req)
- ablkcipher_request_free(req);
- if (tfm)
- crypto_free_ablkcipher(tfm);
- return res;
-}
-
-static void f2fs_free_crypt_info(struct f2fs_crypt_info *ci)
-{
- if (!ci)
- return;
-
- key_put(ci->ci_keyring_key);
- crypto_free_ablkcipher(ci->ci_ctfm);
- kmem_cache_free(f2fs_crypt_info_cachep, ci);
-}
-
-void f2fs_free_encryption_info(struct inode *inode, struct f2fs_crypt_info *ci)
-{
- struct f2fs_inode_info *fi = F2FS_I(inode);
- struct f2fs_crypt_info *prev;
-
- if (ci == NULL)
- ci = ACCESS_ONCE(fi->i_crypt_info);
- if (ci == NULL)
- return;
- prev = cmpxchg(&fi->i_crypt_info, ci, NULL);
- if (prev != ci)
- return;
-
- f2fs_free_crypt_info(ci);
-}
-
-int _f2fs_get_encryption_info(struct inode *inode)
-{
- struct f2fs_inode_info *fi = F2FS_I(inode);
- struct f2fs_crypt_info *crypt_info;
- char full_key_descriptor[F2FS_KEY_DESC_PREFIX_SIZE +
- (F2FS_KEY_DESCRIPTOR_SIZE * 2) + 1];
- struct key *keyring_key = NULL;
- struct f2fs_encryption_key *master_key;
- struct f2fs_encryption_context ctx;
- const struct user_key_payload *ukp;
- struct crypto_ablkcipher *ctfm;
- const char *cipher_str;
- char raw_key[F2FS_MAX_KEY_SIZE];
- char mode;
- int res;
-
- res = f2fs_crypto_initialize();
- if (res)
- return res;
-retry:
- crypt_info = ACCESS_ONCE(fi->i_crypt_info);
- if (crypt_info) {
- if (!crypt_info->ci_keyring_key ||
- key_validate(crypt_info->ci_keyring_key) == 0)
- return 0;
- f2fs_free_encryption_info(inode, crypt_info);
- goto retry;
- }
-
- res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION,
- F2FS_XATTR_NAME_ENCRYPTION_CONTEXT,
- &ctx, sizeof(ctx), NULL);
- if (res < 0)
- return res;
- else if (res != sizeof(ctx))
- return -EINVAL;
- res = 0;
-
- crypt_info = kmem_cache_alloc(f2fs_crypt_info_cachep, GFP_NOFS);
- if (!crypt_info)
- return -ENOMEM;
-
- crypt_info->ci_flags = ctx.flags;
- crypt_info->ci_data_mode = ctx.contents_encryption_mode;
- crypt_info->ci_filename_mode = ctx.filenames_encryption_mode;
- crypt_info->ci_ctfm = NULL;
- crypt_info->ci_keyring_key = NULL;
- memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor,
- sizeof(crypt_info->ci_master_key));
- if (S_ISREG(inode->i_mode))
- mode = crypt_info->ci_data_mode;
- else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
- mode = crypt_info->ci_filename_mode;
- else
- BUG();
-
- switch (mode) {
- case F2FS_ENCRYPTION_MODE_AES_256_XTS:
- cipher_str = "xts(aes)";
- break;
- case F2FS_ENCRYPTION_MODE_AES_256_CTS:
- cipher_str = "cts(cbc(aes))";
- break;
- default:
- printk_once(KERN_WARNING
- "f2fs: unsupported key mode %d (ino %u)\n",
- mode, (unsigned) inode->i_ino);
- res = -ENOKEY;
- goto out;
- }
-
- memcpy(full_key_descriptor, F2FS_KEY_DESC_PREFIX,
- F2FS_KEY_DESC_PREFIX_SIZE);
- sprintf(full_key_descriptor + F2FS_KEY_DESC_PREFIX_SIZE,
- "%*phN", F2FS_KEY_DESCRIPTOR_SIZE,
- ctx.master_key_descriptor);
- full_key_descriptor[F2FS_KEY_DESC_PREFIX_SIZE +
- (2 * F2FS_KEY_DESCRIPTOR_SIZE)] = '\0';
- keyring_key = request_key(&key_type_logon, full_key_descriptor, NULL);
- if (IS_ERR(keyring_key)) {
- res = PTR_ERR(keyring_key);
- keyring_key = NULL;
- goto out;
- }
- crypt_info->ci_keyring_key = keyring_key;
- BUG_ON(keyring_key->type != &key_type_logon);
- ukp = user_key_payload(keyring_key);
- if (ukp->datalen != sizeof(struct f2fs_encryption_key)) {
- res = -EINVAL;
- goto out;
- }
- master_key = (struct f2fs_encryption_key *)ukp->data;
- BUILD_BUG_ON(F2FS_AES_128_ECB_KEY_SIZE !=
- F2FS_KEY_DERIVATION_NONCE_SIZE);
- BUG_ON(master_key->size != F2FS_AES_256_XTS_KEY_SIZE);
- res = f2fs_derive_key_aes(ctx.nonce, master_key->raw,
- raw_key);
- if (res)
- goto out;
-
- ctfm = crypto_alloc_ablkcipher(cipher_str, 0, 0);
- if (!ctfm || IS_ERR(ctfm)) {
- res = ctfm ? PTR_ERR(ctfm) : -ENOMEM;
- printk(KERN_DEBUG
- "%s: error %d (inode %u) allocating crypto tfm\n",
- __func__, res, (unsigned) inode->i_ino);
- goto out;
- }
- crypt_info->ci_ctfm = ctfm;
- crypto_ablkcipher_clear_flags(ctfm, ~0);
- crypto_tfm_set_flags(crypto_ablkcipher_tfm(ctfm),
- CRYPTO_TFM_REQ_WEAK_KEY);
- res = crypto_ablkcipher_setkey(ctfm, raw_key,
- f2fs_encryption_key_size(mode));
- if (res)
- goto out;
-
- memzero_explicit(raw_key, sizeof(raw_key));
- if (cmpxchg(&fi->i_crypt_info, NULL, crypt_info) != NULL) {
- f2fs_free_crypt_info(crypt_info);
- goto retry;
- }
- return 0;
-
-out:
- if (res == -ENOKEY && !S_ISREG(inode->i_mode))
- res = 0;
-
- f2fs_free_crypt_info(crypt_info);
- memzero_explicit(raw_key, sizeof(raw_key));
- return res;
-}
-
-int f2fs_has_encryption_key(struct inode *inode)
-{
- struct f2fs_inode_info *fi = F2FS_I(inode);
-
- return (fi->i_crypt_info != NULL);
-}
diff --git a/fs/f2fs/crypto_policy.c b/fs/f2fs/crypto_policy.c
deleted file mode 100644
index d4a96af513c2..000000000000
--- a/fs/f2fs/crypto_policy.c
+++ /dev/null
@@ -1,209 +0,0 @@
-/*
- * copied from linux/fs/ext4/crypto_policy.c
- *
- * Copyright (C) 2015, Google, Inc.
- * Copyright (C) 2015, Motorola Mobility.
- *
- * This contains encryption policy functions for f2fs with some modifications
- * to support f2fs-specific xattr APIs.
- *
- * Written by Michael Halcrow, 2015.
- * Modified by Jaegeuk Kim, 2015.
- */
-#include <linux/random.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/f2fs_fs.h>
-
-#include "f2fs.h"
-#include "xattr.h"
-
-static int f2fs_inode_has_encryption_context(struct inode *inode)
-{
- int res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION,
- F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, NULL, 0, NULL);
- return (res > 0);
-}
-
-/*
- * check whether the policy is consistent with the encryption context
- * for the inode
- */
-static int f2fs_is_encryption_context_consistent_with_policy(
- struct inode *inode, const struct f2fs_encryption_policy *policy)
-{
- struct f2fs_encryption_context ctx;
- int res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION,
- F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx,
- sizeof(ctx), NULL);
-
- if (res != sizeof(ctx))
- return 0;
-
- return (memcmp(ctx.master_key_descriptor, policy->master_key_descriptor,
- F2FS_KEY_DESCRIPTOR_SIZE) == 0 &&
- (ctx.flags == policy->flags) &&
- (ctx.contents_encryption_mode ==
- policy->contents_encryption_mode) &&
- (ctx.filenames_encryption_mode ==
- policy->filenames_encryption_mode));
-}
-
-static int f2fs_create_encryption_context_from_policy(
- struct inode *inode, const struct f2fs_encryption_policy *policy)
-{
- struct f2fs_encryption_context ctx;
-
- ctx.format = F2FS_ENCRYPTION_CONTEXT_FORMAT_V1;
- memcpy(ctx.master_key_descriptor, policy->master_key_descriptor,
- F2FS_KEY_DESCRIPTOR_SIZE);
-
- if (!f2fs_valid_contents_enc_mode(policy->contents_encryption_mode)) {
- printk(KERN_WARNING
- "%s: Invalid contents encryption mode %d\n", __func__,
- policy->contents_encryption_mode);
- return -EINVAL;
- }
-
- if (!f2fs_valid_filenames_enc_mode(policy->filenames_encryption_mode)) {
- printk(KERN_WARNING
- "%s: Invalid filenames encryption mode %d\n", __func__,
- policy->filenames_encryption_mode);
- return -EINVAL;
- }
-
- if (policy->flags & ~F2FS_POLICY_FLAGS_VALID)
- return -EINVAL;
-
- ctx.contents_encryption_mode = policy->contents_encryption_mode;
- ctx.filenames_encryption_mode = policy->filenames_encryption_mode;
- ctx.flags = policy->flags;
- BUILD_BUG_ON(sizeof(ctx.nonce) != F2FS_KEY_DERIVATION_NONCE_SIZE);
- get_random_bytes(ctx.nonce, F2FS_KEY_DERIVATION_NONCE_SIZE);
-
- return f2fs_setxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION,
- F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx,
- sizeof(ctx), NULL, XATTR_CREATE);
-}
-
-int f2fs_process_policy(const struct f2fs_encryption_policy *policy,
- struct inode *inode)
-{
- if (policy->version != 0)
- return -EINVAL;
-
- if (!S_ISDIR(inode->i_mode))
- return -EINVAL;
-
- if (!f2fs_inode_has_encryption_context(inode)) {
- if (!f2fs_empty_dir(inode))
- return -ENOTEMPTY;
- return f2fs_create_encryption_context_from_policy(inode,
- policy);
- }
-
- if (f2fs_is_encryption_context_consistent_with_policy(inode, policy))
- return 0;
-
- printk(KERN_WARNING "%s: Policy inconsistent with encryption context\n",
- __func__);
- return -EINVAL;
-}
-
-int f2fs_get_policy(struct inode *inode, struct f2fs_encryption_policy *policy)
-{
- struct f2fs_encryption_context ctx;
- int res;
-
- if (!f2fs_encrypted_inode(inode))
- return -ENODATA;
-
- res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION,
- F2FS_XATTR_NAME_ENCRYPTION_CONTEXT,
- &ctx, sizeof(ctx), NULL);
- if (res != sizeof(ctx))
- return -ENODATA;
- if (ctx.format != F2FS_ENCRYPTION_CONTEXT_FORMAT_V1)
- return -EINVAL;
-
- policy->version = 0;
- policy->contents_encryption_mode = ctx.contents_encryption_mode;
- policy->filenames_encryption_mode = ctx.filenames_encryption_mode;
- policy->flags = ctx.flags;
- memcpy(&policy->master_key_descriptor, ctx.master_key_descriptor,
- F2FS_KEY_DESCRIPTOR_SIZE);
- return 0;
-}
-
-int f2fs_is_child_context_consistent_with_parent(struct inode *parent,
- struct inode *child)
-{
- struct f2fs_crypt_info *parent_ci, *child_ci;
- int res;
-
- if ((parent == NULL) || (child == NULL)) {
- pr_err("parent %p child %p\n", parent, child);
- BUG_ON(1);
- }
-
- /* no restrictions if the parent directory is not encrypted */
- if (!f2fs_encrypted_inode(parent))
- return 1;
- /* if the child directory is not encrypted, this is always a problem */
- if (!f2fs_encrypted_inode(child))
- return 0;
- res = f2fs_get_encryption_info(parent);
- if (res)
- return 0;
- res = f2fs_get_encryption_info(child);
- if (res)
- return 0;
- parent_ci = F2FS_I(parent)->i_crypt_info;
- child_ci = F2FS_I(child)->i_crypt_info;
- if (!parent_ci && !child_ci)
- return 1;
- if (!parent_ci || !child_ci)
- return 0;
-
- return (memcmp(parent_ci->ci_master_key,
- child_ci->ci_master_key,
- F2FS_KEY_DESCRIPTOR_SIZE) == 0 &&
- (parent_ci->ci_data_mode == child_ci->ci_data_mode) &&
- (parent_ci->ci_filename_mode == child_ci->ci_filename_mode) &&
- (parent_ci->ci_flags == child_ci->ci_flags));
-}
-
-/**
- * f2fs_inherit_context() - Sets a child context from its parent
- * @parent: Parent inode from which the context is inherited.
- * @child: Child inode that inherits the context from @parent.
- *
- * Return: Zero on success, non-zero otherwise
- */
-int f2fs_inherit_context(struct inode *parent, struct inode *child,
- struct page *ipage)
-{
- struct f2fs_encryption_context ctx;
- struct f2fs_crypt_info *ci;
- int res;
-
- res = f2fs_get_encryption_info(parent);
- if (res < 0)
- return res;
-
- ci = F2FS_I(parent)->i_crypt_info;
- BUG_ON(ci == NULL);
-
- ctx.format = F2FS_ENCRYPTION_CONTEXT_FORMAT_V1;
-
- ctx.contents_encryption_mode = ci->ci_data_mode;
- ctx.filenames_encryption_mode = ci->ci_filename_mode;
- ctx.flags = ci->ci_flags;
- memcpy(ctx.master_key_descriptor, ci->ci_master_key,
- F2FS_KEY_DESCRIPTOR_SIZE);
-
- get_random_bytes(ctx.nonce, F2FS_KEY_DERIVATION_NONCE_SIZE);
- return f2fs_setxattr(child, F2FS_XATTR_INDEX_ENCRYPTION,
- F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx,
- sizeof(ctx), ipage, XATTR_CREATE);
-}
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 5c06db17e41f..e5c762b37239 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -34,9 +34,9 @@ static void f2fs_read_end_io(struct bio *bio)
if (f2fs_bio_encrypted(bio)) {
if (bio->bi_error) {
- f2fs_release_crypto_ctx(bio->bi_private);
+ fscrypt_release_ctx(bio->bi_private);
} else {
- f2fs_end_io_crypto_work(bio->bi_private, bio);
+ fscrypt_decrypt_bio_pages(bio->bi_private, bio);
return;
}
}
@@ -64,10 +64,9 @@ static void f2fs_write_end_io(struct bio *bio)
bio_for_each_segment_all(bvec, bio, i) {
struct page *page = bvec->bv_page;
- f2fs_restore_and_release_control_page(&page);
+ fscrypt_pullback_bio_page(&page, true);
if (unlikely(bio->bi_error)) {
- set_page_dirty(page);
set_bit(AS_EIO, &page->mapping->flags);
f2fs_stop_checkpoint(sbi);
}
@@ -75,8 +74,7 @@ static void f2fs_write_end_io(struct bio *bio)
dec_page_count(sbi, F2FS_WRITEBACK);
}
- if (!get_pages(sbi, F2FS_WRITEBACK) &&
- !list_empty(&sbi->cp_wait.task_list))
+ if (!get_pages(sbi, F2FS_WRITEBACK) && wq_has_sleeper(&sbi->cp_wait))
wake_up(&sbi->cp_wait);
bio_put(bio);
@@ -116,8 +114,54 @@ static void __submit_merged_bio(struct f2fs_bio_info *io)
io->bio = NULL;
}
-void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi,
- enum page_type type, int rw)
+static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode,
+ struct page *page, nid_t ino)
+{
+ struct bio_vec *bvec;
+ struct page *target;
+ int i;
+
+ if (!io->bio)
+ return false;
+
+ if (!inode && !page && !ino)
+ return true;
+
+ bio_for_each_segment_all(bvec, io->bio, i) {
+
+ if (bvec->bv_page->mapping)
+ target = bvec->bv_page;
+ else
+ target = fscrypt_control_page(bvec->bv_page);
+
+ if (inode && inode == target->mapping->host)
+ return true;
+ if (page && page == target)
+ return true;
+ if (ino && ino == ino_of_node(target))
+ return true;
+ }
+
+ return false;
+}
+
+static bool has_merged_page(struct f2fs_sb_info *sbi, struct inode *inode,
+ struct page *page, nid_t ino,
+ enum page_type type)
+{
+ enum page_type btype = PAGE_TYPE_OF_BIO(type);
+ struct f2fs_bio_info *io = &sbi->write_io[btype];
+ bool ret;
+
+ down_read(&io->io_rwsem);
+ ret = __has_merged_page(io, inode, page, ino);
+ up_read(&io->io_rwsem);
+ return ret;
+}
+
+static void __f2fs_submit_merged_bio(struct f2fs_sb_info *sbi,
+ struct inode *inode, struct page *page,
+ nid_t ino, enum page_type type, int rw)
{
enum page_type btype = PAGE_TYPE_OF_BIO(type);
struct f2fs_bio_info *io;
@@ -126,6 +170,9 @@ void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi,
down_write(&io->io_rwsem);
+ if (!__has_merged_page(io, inode, page, ino))
+ goto out;
+
/* change META to META_FLUSH in the checkpoint procedure */
if (type >= META_FLUSH) {
io->fio.type = META_FLUSH;
@@ -135,9 +182,31 @@ void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi,
io->fio.rw = WRITE_FLUSH_FUA | REQ_META | REQ_PRIO;
}
__submit_merged_bio(io);
+out:
up_write(&io->io_rwsem);
}
+void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, enum page_type type,
+ int rw)
+{
+ __f2fs_submit_merged_bio(sbi, NULL, NULL, 0, type, rw);
+}
+
+void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *sbi,
+ struct inode *inode, struct page *page,
+ nid_t ino, enum page_type type, int rw)
+{
+ if (has_merged_page(sbi, inode, page, ino, type))
+ __f2fs_submit_merged_bio(sbi, inode, page, ino, type, rw);
+}
+
+void f2fs_flush_merged_bios(struct f2fs_sb_info *sbi)
+{
+ f2fs_submit_merged_bio(sbi, DATA, WRITE);
+ f2fs_submit_merged_bio(sbi, NODE, WRITE);
+ f2fs_submit_merged_bio(sbi, META, WRITE);
+}
+
/*
* Fill the locked page with data located in the block address.
* Return unlocked page.
@@ -145,13 +214,14 @@ void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi,
int f2fs_submit_page_bio(struct f2fs_io_info *fio)
{
struct bio *bio;
- struct page *page = fio->encrypted_page ? fio->encrypted_page : fio->page;
+ struct page *page = fio->encrypted_page ?
+ fio->encrypted_page : fio->page;
trace_f2fs_submit_page_bio(page, fio);
f2fs_trace_ios(fio, 0);
/* Allocate a new bio */
- bio = __bio_alloc(fio->sbi, fio->blk_addr, 1, is_read_io(fio->rw));
+ bio = __bio_alloc(fio->sbi, fio->new_blkaddr, 1, is_read_io(fio->rw));
if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) {
bio_put(bio);
@@ -172,21 +242,24 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio)
io = is_read ? &sbi->read_io : &sbi->write_io[btype];
- verify_block_addr(sbi, fio->blk_addr);
+ if (fio->old_blkaddr != NEW_ADDR)
+ verify_block_addr(sbi, fio->old_blkaddr);
+ verify_block_addr(sbi, fio->new_blkaddr);
down_write(&io->io_rwsem);
if (!is_read)
inc_page_count(sbi, F2FS_WRITEBACK);
- if (io->bio && (io->last_block_in_bio != fio->blk_addr - 1 ||
+ if (io->bio && (io->last_block_in_bio != fio->new_blkaddr - 1 ||
io->fio.rw != fio->rw))
__submit_merged_bio(io);
alloc_new:
if (io->bio == NULL) {
int bio_blocks = MAX_BIO_BLOCKS(sbi);
- io->bio = __bio_alloc(sbi, fio->blk_addr, bio_blocks, is_read);
+ io->bio = __bio_alloc(sbi, fio->new_blkaddr,
+ bio_blocks, is_read);
io->fio = *fio;
}
@@ -198,7 +271,7 @@ alloc_new:
goto alloc_new;
}
- io->last_block_in_bio = fio->blk_addr;
+ io->last_block_in_bio = fio->new_blkaddr;
f2fs_trace_ios(fio, 0);
up_write(&io->io_rwsem);
@@ -218,7 +291,7 @@ void set_data_blkaddr(struct dnode_of_data *dn)
struct page *node_page = dn->node_page;
unsigned int ofs_in_node = dn->ofs_in_node;
- f2fs_wait_on_page_writeback(node_page, NODE);
+ f2fs_wait_on_page_writeback(node_page, NODE, true);
rn = F2FS_NODE(node_page);
@@ -229,6 +302,13 @@ void set_data_blkaddr(struct dnode_of_data *dn)
dn->node_changed = true;
}
+void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr)
+{
+ dn->data_blkaddr = blkaddr;
+ set_data_blkaddr(dn);
+ f2fs_update_extent_cache(dn);
+}
+
int reserve_new_block(struct dnode_of_data *dn)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
@@ -332,7 +412,7 @@ got_it:
return page;
}
- fio.blk_addr = dn.data_blkaddr;
+ fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr;
fio.page = page;
err = f2fs_submit_page_bio(&fio);
if (err)
@@ -461,7 +541,6 @@ got_it:
static int __allocate_data_block(struct dnode_of_data *dn)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
- struct f2fs_inode_info *fi = F2FS_I(dn->inode);
struct f2fs_summary sum;
struct node_info ni;
int seg = CURSEG_WARM_DATA;
@@ -489,7 +568,7 @@ alloc:
set_data_blkaddr(dn);
/* update i_size */
- fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
+ fofs = start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) +
dn->ofs_in_node;
if (i_size_read(dn->inode) < ((loff_t)(fofs + 1) << PAGE_CACHE_SHIFT))
i_size_write(dn->inode,
@@ -497,67 +576,33 @@ alloc:
return 0;
}
-static int __allocate_data_blocks(struct inode *inode, loff_t offset,
- size_t count)
+ssize_t f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from)
{
- struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct dnode_of_data dn;
- u64 start = F2FS_BYTES_TO_BLK(offset);
- u64 len = F2FS_BYTES_TO_BLK(count);
- bool allocated;
- u64 end_offset;
- int err = 0;
-
- while (len) {
- f2fs_lock_op(sbi);
-
- /* When reading holes, we need its node page */
- set_new_dnode(&dn, inode, NULL, NULL, 0);
- err = get_dnode_of_data(&dn, start, ALLOC_NODE);
- if (err)
- goto out;
-
- allocated = false;
- end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
-
- while (dn.ofs_in_node < end_offset && len) {
- block_t blkaddr;
-
- if (unlikely(f2fs_cp_error(sbi))) {
- err = -EIO;
- goto sync_out;
- }
-
- blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
- if (blkaddr == NULL_ADDR || blkaddr == NEW_ADDR) {
- err = __allocate_data_block(&dn);
- if (err)
- goto sync_out;
- allocated = true;
- }
- len--;
- start++;
- dn.ofs_in_node++;
- }
+ struct inode *inode = file_inode(iocb->ki_filp);
+ struct f2fs_map_blocks map;
+ ssize_t ret = 0;
- if (allocated)
- sync_inode_page(&dn);
+ map.m_lblk = F2FS_BYTES_TO_BLK(iocb->ki_pos);
+ map.m_len = F2FS_BLK_ALIGN(iov_iter_count(from));
+ map.m_next_pgofs = NULL;
- f2fs_put_dnode(&dn);
- f2fs_unlock_op(sbi);
+ if (f2fs_encrypted_inode(inode))
+ return 0;
- f2fs_balance_fs(sbi, dn.node_changed);
+ if (iocb->ki_flags & IOCB_DIRECT) {
+ ret = f2fs_convert_inline_inode(inode);
+ if (ret)
+ return ret;
+ return f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO);
}
- return err;
-
-sync_out:
- if (allocated)
- sync_inode_page(&dn);
- f2fs_put_dnode(&dn);
-out:
- f2fs_unlock_op(sbi);
- f2fs_balance_fs(sbi, dn.node_changed);
- return err;
+ if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA) {
+ ret = f2fs_convert_inline_inode(inode);
+ if (ret)
+ return ret;
+ }
+ if (!f2fs_has_inline_data(inode))
+ return f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO);
+ return ret;
}
/*
@@ -588,13 +633,14 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
/* it only supports block size == page size */
pgofs = (pgoff_t)map->m_lblk;
- if (f2fs_lookup_extent_cache(inode, pgofs, &ei)) {
+ if (!create && f2fs_lookup_extent_cache(inode, pgofs, &ei)) {
map->m_pblk = ei.blk + pgofs - ei.fofs;
map->m_len = min((pgoff_t)maxblocks, ei.fofs + ei.len - pgofs);
map->m_flags = F2FS_MAP_MAPPED;
goto out;
}
+next_dnode:
if (create)
f2fs_lock_op(sbi);
@@ -602,120 +648,98 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
set_new_dnode(&dn, inode, NULL, NULL, 0);
err = get_dnode_of_data(&dn, pgofs, mode);
if (err) {
- if (err == -ENOENT)
+ if (err == -ENOENT) {
err = 0;
+ if (map->m_next_pgofs)
+ *map->m_next_pgofs =
+ get_next_page_offset(&dn, pgofs);
+ }
goto unlock_out;
}
- if (dn.data_blkaddr == NEW_ADDR || dn.data_blkaddr == NULL_ADDR) {
+ end_offset = ADDRS_PER_PAGE(dn.node_page, inode);
+
+next_block:
+ blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
+
+ if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) {
if (create) {
if (unlikely(f2fs_cp_error(sbi))) {
err = -EIO;
- goto put_out;
+ goto sync_out;
+ }
+ if (flag == F2FS_GET_BLOCK_PRE_AIO) {
+ if (blkaddr == NULL_ADDR)
+ err = reserve_new_block(&dn);
+ } else {
+ err = __allocate_data_block(&dn);
}
- err = __allocate_data_block(&dn);
if (err)
- goto put_out;
+ goto sync_out;
allocated = true;
map->m_flags = F2FS_MAP_NEW;
+ blkaddr = dn.data_blkaddr;
} else {
+ if (flag == F2FS_GET_BLOCK_FIEMAP &&
+ blkaddr == NULL_ADDR) {
+ if (map->m_next_pgofs)
+ *map->m_next_pgofs = pgofs + 1;
+ }
if (flag != F2FS_GET_BLOCK_FIEMAP ||
- dn.data_blkaddr != NEW_ADDR) {
+ blkaddr != NEW_ADDR) {
if (flag == F2FS_GET_BLOCK_BMAP)
err = -ENOENT;
- goto put_out;
+ goto sync_out;
}
-
- /*
- * preallocated unwritten block should be mapped
- * for fiemap.
- */
- if (dn.data_blkaddr == NEW_ADDR)
- map->m_flags = F2FS_MAP_UNWRITTEN;
}
}
- map->m_flags |= F2FS_MAP_MAPPED;
- map->m_pblk = dn.data_blkaddr;
- map->m_len = 1;
+ if (map->m_len == 0) {
+ /* preallocated unwritten block should be mapped for fiemap. */
+ if (blkaddr == NEW_ADDR)
+ map->m_flags |= F2FS_MAP_UNWRITTEN;
+ map->m_flags |= F2FS_MAP_MAPPED;
+
+ map->m_pblk = blkaddr;
+ map->m_len = 1;
+ } else if ((map->m_pblk != NEW_ADDR &&
+ blkaddr == (map->m_pblk + ofs)) ||
+ (map->m_pblk == NEW_ADDR && blkaddr == NEW_ADDR) ||
+ flag == F2FS_GET_BLOCK_PRE_DIO ||
+ flag == F2FS_GET_BLOCK_PRE_AIO) {
+ ofs++;
+ map->m_len++;
+ } else {
+ goto sync_out;
+ }
- end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
dn.ofs_in_node++;
pgofs++;
-get_next:
- if (map->m_len >= maxblocks)
- goto sync_out;
+ if (map->m_len < maxblocks) {
+ if (dn.ofs_in_node < end_offset)
+ goto next_block;
- if (dn.ofs_in_node >= end_offset) {
if (allocated)
sync_inode_page(&dn);
- allocated = false;
f2fs_put_dnode(&dn);
if (create) {
f2fs_unlock_op(sbi);
- f2fs_balance_fs(sbi, dn.node_changed);
- f2fs_lock_op(sbi);
- }
-
- set_new_dnode(&dn, inode, NULL, NULL, 0);
- err = get_dnode_of_data(&dn, pgofs, mode);
- if (err) {
- if (err == -ENOENT)
- err = 0;
- goto unlock_out;
- }
-
- end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
- }
-
- blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
-
- if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) {
- if (create) {
- if (unlikely(f2fs_cp_error(sbi))) {
- err = -EIO;
- goto sync_out;
- }
- err = __allocate_data_block(&dn);
- if (err)
- goto sync_out;
- allocated = true;
- map->m_flags |= F2FS_MAP_NEW;
- blkaddr = dn.data_blkaddr;
- } else {
- /*
- * we only merge preallocated unwritten blocks
- * for fiemap.
- */
- if (flag != F2FS_GET_BLOCK_FIEMAP ||
- blkaddr != NEW_ADDR)
- goto sync_out;
+ f2fs_balance_fs(sbi, allocated);
}
- }
-
- /* Give more consecutive addresses for the readahead */
- if ((map->m_pblk != NEW_ADDR &&
- blkaddr == (map->m_pblk + ofs)) ||
- (map->m_pblk == NEW_ADDR &&
- blkaddr == NEW_ADDR)) {
- ofs++;
- dn.ofs_in_node++;
- pgofs++;
- map->m_len++;
- goto get_next;
+ allocated = false;
+ goto next_dnode;
}
sync_out:
if (allocated)
sync_inode_page(&dn);
-put_out:
f2fs_put_dnode(&dn);
unlock_out:
if (create) {
f2fs_unlock_op(sbi);
- f2fs_balance_fs(sbi, dn.node_changed);
+ f2fs_balance_fs(sbi, allocated);
}
out:
trace_f2fs_map_blocks(inode, map, err);
@@ -723,13 +747,15 @@ out:
}
static int __get_data_block(struct inode *inode, sector_t iblock,
- struct buffer_head *bh, int create, int flag)
+ struct buffer_head *bh, int create, int flag,
+ pgoff_t *next_pgofs)
{
struct f2fs_map_blocks map;
int ret;
map.m_lblk = iblock;
map.m_len = bh->b_size >> inode->i_blkbits;
+ map.m_next_pgofs = next_pgofs;
ret = f2fs_map_blocks(inode, &map, create, flag);
if (!ret) {
@@ -741,16 +767,18 @@ static int __get_data_block(struct inode *inode, sector_t iblock,
}
static int get_data_block(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create, int flag)
+ struct buffer_head *bh_result, int create, int flag,
+ pgoff_t *next_pgofs)
{
- return __get_data_block(inode, iblock, bh_result, create, flag);
+ return __get_data_block(inode, iblock, bh_result, create,
+ flag, next_pgofs);
}
static int get_data_block_dio(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
return __get_data_block(inode, iblock, bh_result, create,
- F2FS_GET_BLOCK_DIO);
+ F2FS_GET_BLOCK_DIO, NULL);
}
static int get_data_block_bmap(struct inode *inode, sector_t iblock,
@@ -761,7 +789,7 @@ static int get_data_block_bmap(struct inode *inode, sector_t iblock,
return -EFBIG;
return __get_data_block(inode, iblock, bh_result, create,
- F2FS_GET_BLOCK_BMAP);
+ F2FS_GET_BLOCK_BMAP, NULL);
}
static inline sector_t logical_to_blk(struct inode *inode, loff_t offset)
@@ -779,6 +807,7 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
{
struct buffer_head map_bh;
sector_t start_blk, last_blk;
+ pgoff_t next_pgofs;
loff_t isize;
u64 logical = 0, phys = 0, size = 0;
u32 flags = 0;
@@ -814,14 +843,15 @@ next:
map_bh.b_size = len;
ret = get_data_block(inode, start_blk, &map_bh, 0,
- F2FS_GET_BLOCK_FIEMAP);
+ F2FS_GET_BLOCK_FIEMAP, &next_pgofs);
if (ret)
goto out;
/* HOLE */
if (!buffer_mapped(&map_bh)) {
+ start_blk = next_pgofs;
/* Go through holes util pass the EOF */
- if (blk_to_logical(inode, start_blk++) < isize)
+ if (blk_to_logical(inode, start_blk) < isize)
goto prep_next;
/* Found a hole beyond isize means no more extents.
* Note that the premise is that filesystems don't
@@ -889,6 +919,7 @@ static int f2fs_mpage_readpages(struct address_space *mapping,
map.m_lblk = 0;
map.m_len = 0;
map.m_flags = 0;
+ map.m_next_pgofs = NULL;
for (page_idx = 0; nr_pages; page_idx++, nr_pages--) {
@@ -927,7 +958,7 @@ static int f2fs_mpage_readpages(struct address_space *mapping,
map.m_len = last_block - block_in_file;
if (f2fs_map_blocks(inode, &map, 0,
- F2FS_GET_BLOCK_READ))
+ F2FS_GET_BLOCK_READ))
goto set_error_page;
}
got_it:
@@ -956,12 +987,12 @@ submit_and_realloc:
bio = NULL;
}
if (bio == NULL) {
- struct f2fs_crypto_ctx *ctx = NULL;
+ struct fscrypt_ctx *ctx = NULL;
if (f2fs_encrypted_inode(inode) &&
S_ISREG(inode->i_mode)) {
- ctx = f2fs_get_crypto_ctx(inode);
+ ctx = fscrypt_get_ctx(inode);
if (IS_ERR(ctx))
goto set_error_page;
@@ -974,7 +1005,7 @@ submit_and_realloc:
min_t(int, nr_pages, BIO_MAX_PAGES));
if (!bio) {
if (ctx)
- f2fs_release_crypto_ctx(ctx);
+ fscrypt_release_ctx(ctx);
goto set_error_page;
}
bio->bi_bdev = bdev;
@@ -1052,10 +1083,10 @@ int do_write_data_page(struct f2fs_io_info *fio)
if (err)
return err;
- fio->blk_addr = dn.data_blkaddr;
+ fio->old_blkaddr = dn.data_blkaddr;
/* This page is already truncated */
- if (fio->blk_addr == NULL_ADDR) {
+ if (fio->old_blkaddr == NULL_ADDR) {
ClearPageUptodate(page);
goto out_writepage;
}
@@ -1064,9 +1095,9 @@ int do_write_data_page(struct f2fs_io_info *fio)
/* wait for GCed encrypted page writeback */
f2fs_wait_on_encrypted_page_writeback(F2FS_I_SB(inode),
- fio->blk_addr);
+ fio->old_blkaddr);
- fio->encrypted_page = f2fs_encrypt(inode, fio->page);
+ fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page);
if (IS_ERR(fio->encrypted_page)) {
err = PTR_ERR(fio->encrypted_page);
goto out_writepage;
@@ -1079,7 +1110,7 @@ int do_write_data_page(struct f2fs_io_info *fio)
* If current allocation needs SSR,
* it had better in-place writes for updated data.
*/
- if (unlikely(fio->blk_addr != NEW_ADDR &&
+ if (unlikely(fio->old_blkaddr != NEW_ADDR &&
!is_cold_data(page) &&
!IS_ATOMIC_WRITTEN_PAGE(page) &&
need_inplace_update(inode))) {
@@ -1088,8 +1119,6 @@ int do_write_data_page(struct f2fs_io_info *fio)
trace_f2fs_do_write_data_page(page, IPU);
} else {
write_data_page(&dn, fio);
- set_data_blkaddr(&dn);
- f2fs_update_extent_cache(&dn);
trace_f2fs_do_write_data_page(page, OPU);
set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE);
if (page->index == 0)
@@ -1177,12 +1206,18 @@ out:
inode_dec_dirty_pages(inode);
if (err)
ClearPageUptodate(page);
+
+ if (wbc->for_reclaim) {
+ f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, DATA, WRITE);
+ remove_dirty_inode(inode);
+ }
+
unlock_page(page);
f2fs_balance_fs(sbi, need_balance_fs);
- if (wbc->for_reclaim || unlikely(f2fs_cp_error(sbi))) {
+
+ if (unlikely(f2fs_cp_error(sbi)))
f2fs_submit_merged_bio(sbi, DATA, WRITE);
- remove_dirty_inode(inode);
- }
+
return 0;
redirty_out:
@@ -1282,7 +1317,8 @@ continue_unlock:
if (PageWriteback(page)) {
if (wbc->sync_mode != WB_SYNC_NONE)
- f2fs_wait_on_page_writeback(page, DATA);
+ f2fs_wait_on_page_writeback(page,
+ DATA, true);
else
goto continue_unlock;
}
@@ -1339,8 +1375,6 @@ static int f2fs_write_data_pages(struct address_space *mapping,
int ret;
long diff;
- trace_f2fs_writepages(mapping->host, wbc, DATA);
-
/* deal with chardevs and other special file */
if (!mapping->a_ops->writepage)
return 0;
@@ -1362,14 +1396,16 @@ static int f2fs_write_data_pages(struct address_space *mapping,
if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
goto skip_write;
+ trace_f2fs_writepages(mapping->host, wbc, DATA);
+
diff = nr_pages_to_write(sbi, DATA, wbc);
- if (!S_ISDIR(inode->i_mode)) {
+ if (!S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_ALL) {
mutex_lock(&sbi->writepages);
locked = true;
}
ret = f2fs_write_cache_pages(mapping, wbc, __f2fs_writepage, mapping);
- f2fs_submit_merged_bio(sbi, DATA, WRITE);
+ f2fs_submit_merged_bio_cond(sbi, inode, NULL, 0, DATA, WRITE);
if (locked)
mutex_unlock(&sbi->writepages);
@@ -1380,6 +1416,7 @@ static int f2fs_write_data_pages(struct address_space *mapping,
skip_write:
wbc->pages_skipped += get_dirty_pages(inode);
+ trace_f2fs_writepages(mapping->host, wbc, DATA);
return 0;
}
@@ -1406,6 +1443,14 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi,
struct extent_info ei;
int err = 0;
+ /*
+ * we already allocated all the blocks, so we don't need to get
+ * the block addresses when there is no need to fill the page.
+ */
+ if (!f2fs_has_inline_data(inode) && !f2fs_encrypted_inode(inode) &&
+ len == PAGE_CACHE_SIZE)
+ return 0;
+
if (f2fs_has_inline_data(inode) ||
(pos & PAGE_CACHE_MASK) >= i_size_read(inode)) {
f2fs_lock_op(sbi);
@@ -1425,7 +1470,7 @@ restart:
if (pos + len <= MAX_INLINE_DATA) {
read_inline_data(page, ipage);
set_inode_flag(F2FS_I(inode), FI_DATA_EXIST);
- sync_inode_page(&dn);
+ set_inline_node(ipage);
} else {
err = f2fs_convert_inline_page(&dn, page);
if (err)
@@ -1439,13 +1484,9 @@ restart:
if (f2fs_lookup_extent_cache(inode, index, &ei)) {
dn.data_blkaddr = ei.blk + index - ei.fofs;
} else {
- bool restart = false;
-
/* hole case */
err = get_dnode_of_data(&dn, index, LOOKUP_NODE);
- if (err || (!err && dn.data_blkaddr == NULL_ADDR))
- restart = true;
- if (restart) {
+ if (err || (!err && dn.data_blkaddr == NULL_ADDR)) {
f2fs_put_dnode(&dn);
f2fs_lock_op(sbi);
locked = true;
@@ -1514,7 +1555,7 @@ repeat:
}
}
- f2fs_wait_on_page_writeback(page, DATA);
+ f2fs_wait_on_page_writeback(page, DATA, false);
/* wait for GCed encrypted page writeback */
if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
@@ -1541,7 +1582,8 @@ repeat:
.sbi = sbi,
.type = DATA,
.rw = READ_SYNC,
- .blk_addr = blkaddr,
+ .old_blkaddr = blkaddr,
+ .new_blkaddr = blkaddr,
.page = page,
.encrypted_page = NULL,
};
@@ -1561,7 +1603,7 @@ repeat:
/* avoid symlink page */
if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) {
- err = f2fs_decrypt_one(inode, page);
+ err = fscrypt_decrypt_page(page);
if (err)
goto fail;
}
@@ -1592,7 +1634,6 @@ static int f2fs_write_end(struct file *file,
if (pos + copied > i_size_read(inode)) {
i_size_write(inode, pos + copied);
mark_inode_dirty(inode);
- update_inode_page(inode);
}
f2fs_put_page(page, 1);
@@ -1617,34 +1658,21 @@ static int check_direct_IO(struct inode *inode, struct iov_iter *iter,
static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
loff_t offset)
{
- struct file *file = iocb->ki_filp;
- struct address_space *mapping = file->f_mapping;
+ struct address_space *mapping = iocb->ki_filp->f_mapping;
struct inode *inode = mapping->host;
size_t count = iov_iter_count(iter);
int err;
- /* we don't need to use inline_data strictly */
- err = f2fs_convert_inline_inode(inode);
+ err = check_direct_IO(inode, iter, offset);
if (err)
return err;
if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
return 0;
- err = check_direct_IO(inode, iter, offset);
- if (err)
- return err;
-
trace_f2fs_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
- if (iov_iter_rw(iter) == WRITE) {
- err = __allocate_data_blocks(inode, offset, count);
- if (err)
- goto out;
- }
-
err = blockdev_direct_IO(iocb, inode, iter, offset, get_data_block_dio);
-out:
if (err < 0 && iov_iter_rw(iter) == WRITE)
f2fs_write_failed(mapping, offset + count);
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index faa7495e2d7e..80641ad82745 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -77,7 +77,7 @@ static unsigned long dir_block_index(unsigned int level,
}
static struct f2fs_dir_entry *find_in_block(struct page *dentry_page,
- struct f2fs_filename *fname,
+ struct fscrypt_name *fname,
f2fs_hash_t namehash,
int *max_slots,
struct page **res_page)
@@ -103,15 +103,15 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page,
return de;
}
-struct f2fs_dir_entry *find_target_dentry(struct f2fs_filename *fname,
+struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname,
f2fs_hash_t namehash, int *max_slots,
struct f2fs_dentry_ptr *d)
{
struct f2fs_dir_entry *de;
unsigned long bit_pos = 0;
int max_len = 0;
- struct f2fs_str de_name = FSTR_INIT(NULL, 0);
- struct f2fs_str *name = &fname->disk_name;
+ struct fscrypt_str de_name = FSTR_INIT(NULL, 0);
+ struct fscrypt_str *name = &fname->disk_name;
if (max_slots)
*max_slots = 0;
@@ -157,7 +157,7 @@ found:
static struct f2fs_dir_entry *find_in_level(struct inode *dir,
unsigned int level,
- struct f2fs_filename *fname,
+ struct fscrypt_name *fname,
struct page **res_page)
{
struct qstr name = FSTR_TO_QSTR(&fname->disk_name);
@@ -218,12 +218,12 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
struct f2fs_dir_entry *de = NULL;
unsigned int max_depth;
unsigned int level;
- struct f2fs_filename fname;
+ struct fscrypt_name fname;
int err;
*res_page = NULL;
- err = f2fs_fname_setup_filename(dir, child, 1, &fname);
+ err = fscrypt_setup_filename(dir, child, 1, &fname);
if (err)
return NULL;
@@ -251,7 +251,7 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
break;
}
out:
- f2fs_fname_free_filename(&fname);
+ fscrypt_free_filename(&fname);
return de;
}
@@ -296,7 +296,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
{
enum page_type type = f2fs_has_inline_dentry(dir) ? NODE : DATA;
lock_page(page);
- f2fs_wait_on_page_writeback(page, type);
+ f2fs_wait_on_page_writeback(page, type, true);
de->ino = cpu_to_le32(inode->i_ino);
set_de_type(de, inode->i_mode);
f2fs_dentry_kunmap(dir, page);
@@ -311,7 +311,7 @@ static void init_dent_inode(const struct qstr *name, struct page *ipage)
{
struct f2fs_inode *ri;
- f2fs_wait_on_page_writeback(ipage, NODE);
+ f2fs_wait_on_page_writeback(ipage, NODE, true);
/* copy name info. to this inode page */
ri = F2FS_INODE(ipage);
@@ -341,24 +341,14 @@ int update_dent_inode(struct inode *inode, struct inode *to,
void do_make_empty_dir(struct inode *inode, struct inode *parent,
struct f2fs_dentry_ptr *d)
{
- struct f2fs_dir_entry *de;
-
- de = &d->dentry[0];
- de->name_len = cpu_to_le16(1);
- de->hash_code = 0;
- de->ino = cpu_to_le32(inode->i_ino);
- memcpy(d->filename[0], ".", 1);
- set_de_type(de, inode->i_mode);
+ struct qstr dot = QSTR_INIT(".", 1);
+ struct qstr dotdot = QSTR_INIT("..", 2);
- de = &d->dentry[1];
- de->hash_code = 0;
- de->name_len = cpu_to_le16(2);
- de->ino = cpu_to_le32(parent->i_ino);
- memcpy(d->filename[1], "..", 2);
- set_de_type(de, parent->i_mode);
+ /* update dirent of "." */
+ f2fs_update_dentry(inode->i_ino, inode->i_mode, d, &dot, 0, 0);
- test_and_set_bit_le(0, (void *)d->bitmap);
- test_and_set_bit_le(1, (void *)d->bitmap);
+ /* update dirent of ".." */
+ f2fs_update_dentry(parent->i_ino, parent->i_mode, d, &dotdot, 0, 1);
}
static int make_empty_dir(struct inode *inode,
@@ -413,7 +403,7 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir,
goto put_error;
if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode)) {
- err = f2fs_inherit_context(dir, inode, page);
+ err = fscrypt_inherit_context(dir, inode, page, false);
if (err)
goto put_error;
}
@@ -511,8 +501,12 @@ void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d,
memcpy(d->filename[bit_pos], name->name, name->len);
de->ino = cpu_to_le32(ino);
set_de_type(de, mode);
- for (i = 0; i < slots; i++)
+ for (i = 0; i < slots; i++) {
test_and_set_bit_le(bit_pos + i, (void *)d->bitmap);
+ /* avoid wrong garbage data for readdir */
+ if (i)
+ (de + i)->name_len = 0;
+ }
}
/*
@@ -532,11 +526,11 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name,
struct f2fs_dentry_block *dentry_blk = NULL;
struct f2fs_dentry_ptr d;
struct page *page = NULL;
- struct f2fs_filename fname;
+ struct fscrypt_name fname;
struct qstr new_name;
int slots, err;
- err = f2fs_fname_setup_filename(dir, name, 0, &fname);
+ err = fscrypt_setup_filename(dir, name, 0, &fname);
if (err)
return err;
@@ -598,7 +592,7 @@ start:
++level;
goto start;
add_dentry:
- f2fs_wait_on_page_writeback(dentry_page, DATA);
+ f2fs_wait_on_page_writeback(dentry_page, DATA, true);
if (inode) {
down_write(&F2FS_I(inode)->i_sem);
@@ -635,7 +629,7 @@ fail:
kunmap(dentry_page);
f2fs_put_page(dentry_page, 1);
out:
- f2fs_fname_free_filename(&fname);
+ fscrypt_free_filename(&fname);
f2fs_update_time(F2FS_I_SB(dir), REQ_TIME);
return err;
}
@@ -709,7 +703,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
return f2fs_delete_inline_entry(dentry, page, dir, inode);
lock_page(page);
- f2fs_wait_on_page_writeback(page, DATA);
+ f2fs_wait_on_page_writeback(page, DATA, true);
dentry_blk = page_address(page);
bit_pos = dentry - dentry_blk->dentry;
@@ -777,12 +771,12 @@ bool f2fs_empty_dir(struct inode *dir)
}
bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
- unsigned int start_pos, struct f2fs_str *fstr)
+ unsigned int start_pos, struct fscrypt_str *fstr)
{
unsigned char d_type = DT_UNKNOWN;
unsigned int bit_pos;
struct f2fs_dir_entry *de = NULL;
- struct f2fs_str de_name = FSTR_INIT(NULL, 0);
+ struct fscrypt_str de_name = FSTR_INIT(NULL, 0);
bit_pos = ((unsigned long)ctx->pos % d->max);
@@ -792,6 +786,12 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
break;
de = &d->dentry[bit_pos];
+ if (de->name_len == 0) {
+ bit_pos++;
+ ctx->pos = start_pos + bit_pos;
+ continue;
+ }
+
if (de->file_type < F2FS_FT_MAX)
d_type = f2fs_filetype_table[de->file_type];
else
@@ -810,8 +810,9 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
memcpy(de_name.name, d->filename[bit_pos], de_name.len);
- ret = f2fs_fname_disk_to_usr(d->inode, &de->hash_code,
- &de_name, fstr);
+ ret = fscrypt_fname_disk_to_usr(d->inode,
+ (u32)de->hash_code, 0,
+ &de_name, fstr);
kfree(de_name.name);
if (ret < 0)
return true;
@@ -839,16 +840,15 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
struct file_ra_state *ra = &file->f_ra;
unsigned int n = ((unsigned long)ctx->pos / NR_DENTRY_IN_BLOCK);
struct f2fs_dentry_ptr d;
- struct f2fs_str fstr = FSTR_INIT(NULL, 0);
+ struct fscrypt_str fstr = FSTR_INIT(NULL, 0);
int err = 0;
if (f2fs_encrypted_inode(inode)) {
- err = f2fs_get_encryption_info(inode);
- if (err)
+ err = fscrypt_get_encryption_info(inode);
+ if (err && err != -ENOKEY)
return err;
- err = f2fs_fname_crypto_alloc_buffer(inode, F2FS_NAME_LEN,
- &fstr);
+ err = fscrypt_fname_alloc_buffer(inode, F2FS_NAME_LEN, &fstr);
if (err < 0)
return err;
}
@@ -888,15 +888,23 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
f2fs_put_page(dentry_page, 1);
}
out:
- f2fs_fname_crypto_free_buffer(&fstr);
+ fscrypt_fname_free_buffer(&fstr);
return err;
}
+static int f2fs_dir_open(struct inode *inode, struct file *filp)
+{
+ if (f2fs_encrypted_inode(inode))
+ return fscrypt_get_encryption_info(inode) ? -EACCES : 0;
+ return 0;
+}
+
const struct file_operations f2fs_dir_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
.iterate = f2fs_readdir,
.fsync = f2fs_sync_file,
+ .open = f2fs_dir_open,
.unlocked_ioctl = f2fs_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = f2fs_compat_ioctl,
diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index ccd5c636d3fe..c859bb044728 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -33,6 +33,7 @@ static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi,
en->ei = *ei;
INIT_LIST_HEAD(&en->list);
+ en->et = et;
rb_link_node(&en->rb_node, parent, p);
rb_insert_color(&en->rb_node, &et->root);
@@ -50,6 +51,24 @@ static void __detach_extent_node(struct f2fs_sb_info *sbi,
if (et->cached_en == en)
et->cached_en = NULL;
+ kmem_cache_free(extent_node_slab, en);
+}
+
+/*
+ * Flow to release an extent_node:
+ * 1. list_del_init
+ * 2. __detach_extent_node
+ * 3. kmem_cache_free.
+ */
+static void __release_extent_node(struct f2fs_sb_info *sbi,
+ struct extent_tree *et, struct extent_node *en)
+{
+ spin_lock(&sbi->extent_lock);
+ f2fs_bug_on(sbi, list_empty(&en->list));
+ list_del_init(&en->list);
+ spin_unlock(&sbi->extent_lock);
+
+ __detach_extent_node(sbi, et, en);
}
static struct extent_tree *__grab_extent_tree(struct inode *inode)
@@ -129,7 +148,7 @@ static struct extent_node *__init_extent_tree(struct f2fs_sb_info *sbi,
}
static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi,
- struct extent_tree *et, bool free_all)
+ struct extent_tree *et)
{
struct rb_node *node, *next;
struct extent_node *en;
@@ -139,18 +158,7 @@ static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi,
while (node) {
next = rb_next(node);
en = rb_entry(node, struct extent_node, rb_node);
-
- if (free_all) {
- spin_lock(&sbi->extent_lock);
- if (!list_empty(&en->list))
- list_del_init(&en->list);
- spin_unlock(&sbi->extent_lock);
- }
-
- if (free_all || list_empty(&en->list)) {
- __detach_extent_node(sbi, et, en);
- kmem_cache_free(extent_node_slab, en);
- }
+ __release_extent_node(sbi, et, en);
node = next;
}
@@ -232,9 +240,10 @@ static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs,
if (en) {
*ei = en->ei;
spin_lock(&sbi->extent_lock);
- if (!list_empty(&en->list))
+ if (!list_empty(&en->list)) {
list_move_tail(&en->list, &sbi->extent_list);
- et->cached_en = en;
+ et->cached_en = en;
+ }
spin_unlock(&sbi->extent_lock);
ret = true;
}
@@ -329,7 +338,6 @@ lookup_neighbors:
static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi,
struct extent_tree *et, struct extent_info *ei,
- struct extent_node **den,
struct extent_node *prev_ex,
struct extent_node *next_ex)
{
@@ -342,20 +350,25 @@ static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi,
}
if (next_ex && __is_front_mergeable(ei, &next_ex->ei)) {
- if (en) {
- __detach_extent_node(sbi, et, prev_ex);
- *den = prev_ex;
- }
+ if (en)
+ __release_extent_node(sbi, et, prev_ex);
next_ex->ei.fofs = ei->fofs;
next_ex->ei.blk = ei->blk;
next_ex->ei.len += ei->len;
en = next_ex;
}
- if (en) {
- __try_update_largest_extent(et, en);
+ if (!en)
+ return NULL;
+
+ __try_update_largest_extent(et, en);
+
+ spin_lock(&sbi->extent_lock);
+ if (!list_empty(&en->list)) {
+ list_move_tail(&en->list, &sbi->extent_list);
et->cached_en = en;
}
+ spin_unlock(&sbi->extent_lock);
return en;
}
@@ -391,7 +404,12 @@ do_insert:
return NULL;
__try_update_largest_extent(et, en);
+
+ /* update in global extent list */
+ spin_lock(&sbi->extent_lock);
+ list_add_tail(&en->list, &sbi->extent_list);
et->cached_en = en;
+ spin_unlock(&sbi->extent_lock);
return en;
}
@@ -479,7 +497,7 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode,
if (parts)
__try_update_largest_extent(et, en);
else
- __detach_extent_node(sbi, et, en);
+ __release_extent_node(sbi, et, en);
/*
* if original extent is split into zero or two parts, extent
@@ -490,31 +508,15 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode,
insert_p = NULL;
insert_parent = NULL;
}
-
- /* update in global extent list */
- spin_lock(&sbi->extent_lock);
- if (!parts && !list_empty(&en->list))
- list_del(&en->list);
- if (en1)
- list_add_tail(&en1->list, &sbi->extent_list);
- spin_unlock(&sbi->extent_lock);
-
- /* release extent node */
- if (!parts)
- kmem_cache_free(extent_node_slab, en);
-
en = next_en;
}
/* 3. update extent in extent cache */
if (blkaddr) {
- struct extent_node *den = NULL;
set_extent_info(&ei, fofs, blkaddr, len);
- en1 = __try_merge_extent_node(sbi, et, &ei, &den,
- prev_en, next_en);
- if (!en1)
- en1 = __insert_extent_tree(sbi, et, &ei,
+ if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en))
+ __insert_extent_tree(sbi, et, &ei,
insert_p, insert_parent);
/* give up extent_cache, if split and small updates happen */
@@ -524,24 +526,10 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode,
et->largest.len = 0;
set_inode_flag(F2FS_I(inode), FI_NO_EXTENT);
}
-
- spin_lock(&sbi->extent_lock);
- if (en1) {
- if (list_empty(&en1->list))
- list_add_tail(&en1->list, &sbi->extent_list);
- else
- list_move_tail(&en1->list, &sbi->extent_list);
- }
- if (den && !list_empty(&den->list))
- list_del(&den->list);
- spin_unlock(&sbi->extent_lock);
-
- if (den)
- kmem_cache_free(extent_node_slab, den);
}
if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT))
- __free_extent_tree(sbi, et, true);
+ __free_extent_tree(sbi, et);
write_unlock(&et->lock);
@@ -550,14 +538,10 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode,
unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink)
{
- struct extent_tree *treevec[EXT_TREE_VEC_SIZE];
struct extent_tree *et, *next;
- struct extent_node *en, *tmp;
- unsigned long ino = F2FS_ROOT_INO(sbi);
- unsigned int found;
+ struct extent_node *en;
unsigned int node_cnt = 0, tree_cnt = 0;
int remained;
- bool do_free = false;
if (!test_opt(sbi, EXTENT_CACHE))
return 0;
@@ -572,10 +556,10 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink)
list_for_each_entry_safe(et, next, &sbi->zombie_list, list) {
if (atomic_read(&et->node_cnt)) {
write_lock(&et->lock);
- node_cnt += __free_extent_tree(sbi, et, true);
+ node_cnt += __free_extent_tree(sbi, et);
write_unlock(&et->lock);
}
-
+ f2fs_bug_on(sbi, atomic_read(&et->node_cnt));
list_del_init(&et->list);
radix_tree_delete(&sbi->extent_tree_root, et->ino);
kmem_cache_free(extent_tree_slab, et);
@@ -585,6 +569,7 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink)
if (node_cnt + tree_cnt >= nr_shrink)
goto unlock_out;
+ cond_resched();
}
up_write(&sbi->extent_tree_lock);
@@ -596,42 +581,29 @@ free_node:
remained = nr_shrink - (node_cnt + tree_cnt);
spin_lock(&sbi->extent_lock);
- list_for_each_entry_safe(en, tmp, &sbi->extent_list, list) {
- if (!remained--)
+ for (; remained > 0; remained--) {
+ if (list_empty(&sbi->extent_list))
break;
- list_del_init(&en->list);
- do_free = true;
- }
- spin_unlock(&sbi->extent_lock);
-
- if (do_free == false)
- goto unlock_out;
-
- /*
- * reset ino for searching victims from beginning of global extent tree.
- */
- ino = F2FS_ROOT_INO(sbi);
-
- while ((found = radix_tree_gang_lookup(&sbi->extent_tree_root,
- (void **)treevec, ino, EXT_TREE_VEC_SIZE))) {
- unsigned i;
-
- ino = treevec[found - 1]->ino + 1;
- for (i = 0; i < found; i++) {
- struct extent_tree *et = treevec[i];
+ en = list_first_entry(&sbi->extent_list,
+ struct extent_node, list);
+ et = en->et;
+ if (!write_trylock(&et->lock)) {
+ /* refresh this extent node's position in extent list */
+ list_move_tail(&en->list, &sbi->extent_list);
+ continue;
+ }
- if (!atomic_read(&et->node_cnt))
- continue;
+ list_del_init(&en->list);
+ spin_unlock(&sbi->extent_lock);
- if (write_trylock(&et->lock)) {
- node_cnt += __free_extent_tree(sbi, et, false);
- write_unlock(&et->lock);
- }
+ __detach_extent_node(sbi, et, en);
- if (node_cnt + tree_cnt >= nr_shrink)
- goto unlock_out;
- }
+ write_unlock(&et->lock);
+ node_cnt++;
+ spin_lock(&sbi->extent_lock);
}
+ spin_unlock(&sbi->extent_lock);
+
unlock_out:
up_write(&sbi->extent_tree_lock);
out:
@@ -650,7 +622,7 @@ unsigned int f2fs_destroy_extent_node(struct inode *inode)
return 0;
write_lock(&et->lock);
- node_cnt = __free_extent_tree(sbi, et, true);
+ node_cnt = __free_extent_tree(sbi, et);
write_unlock(&et->lock);
return node_cnt;
@@ -701,19 +673,21 @@ bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs,
void f2fs_update_extent_cache(struct dnode_of_data *dn)
{
- struct f2fs_inode_info *fi = F2FS_I(dn->inode);
pgoff_t fofs;
+ block_t blkaddr;
if (!f2fs_may_extent_tree(dn->inode))
return;
- f2fs_bug_on(F2FS_I_SB(dn->inode), dn->data_blkaddr == NEW_ADDR);
-
+ if (dn->data_blkaddr == NEW_ADDR)
+ blkaddr = NULL_ADDR;
+ else
+ blkaddr = dn->data_blkaddr;
- fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
- dn->ofs_in_node;
+ fofs = start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) +
+ dn->ofs_in_node;
- if (f2fs_update_extent_tree_range(dn->inode, fofs, dn->data_blkaddr, 1))
+ if (f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, 1))
sync_inode_page(dn);
}
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index ff79054c6cf6..bbe2cd1265d0 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -22,10 +22,11 @@
#include <linux/vmalloc.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
+#include <linux/fscrypto.h>
+#include <crypto/hash.h>
#ifdef CONFIG_F2FS_CHECK_FS
#define f2fs_bug_on(sbi, condition) BUG_ON(condition)
-#define f2fs_down_write(x, y) down_write_nest_lock(x, y)
#else
#define f2fs_bug_on(sbi, condition) \
do { \
@@ -34,7 +35,6 @@
set_sbi_flag(sbi, SBI_NEED_FSCK); \
} \
} while (0)
-#define f2fs_down_write(x, y) down_write(x)
#endif
/*
@@ -84,27 +84,6 @@ struct f2fs_mount_info {
#define F2FS_CLEAR_FEATURE(sb, mask) \
F2FS_SB(sb)->raw_super->feature &= ~cpu_to_le32(mask)
-#define CRCPOLY_LE 0xedb88320
-
-static inline __u32 f2fs_crc32(void *buf, size_t len)
-{
- unsigned char *p = (unsigned char *)buf;
- __u32 crc = F2FS_SUPER_MAGIC;
- int i;
-
- while (len--) {
- crc ^= *p++;
- for (i = 0; i < 8; i++)
- crc = (crc >> 1) ^ ((crc & 1) ? CRCPOLY_LE : 0);
- }
- return crc;
-}
-
-static inline bool f2fs_crc_valid(__u32 blk_crc, void *buf, size_t buf_size)
-{
- return f2fs_crc32(buf, buf_size) == blk_crc;
-}
-
/*
* For checkpoint manager
*/
@@ -183,37 +162,37 @@ struct fsync_inode_entry {
block_t last_inode; /* block address locating the last inode */
};
-#define nats_in_cursum(sum) (le16_to_cpu(sum->n_nats))
-#define sits_in_cursum(sum) (le16_to_cpu(sum->n_sits))
+#define nats_in_cursum(jnl) (le16_to_cpu(jnl->n_nats))
+#define sits_in_cursum(jnl) (le16_to_cpu(jnl->n_sits))
-#define nat_in_journal(sum, i) (sum->nat_j.entries[i].ne)
-#define nid_in_journal(sum, i) (sum->nat_j.entries[i].nid)
-#define sit_in_journal(sum, i) (sum->sit_j.entries[i].se)
-#define segno_in_journal(sum, i) (sum->sit_j.entries[i].segno)
+#define nat_in_journal(jnl, i) (jnl->nat_j.entries[i].ne)
+#define nid_in_journal(jnl, i) (jnl->nat_j.entries[i].nid)
+#define sit_in_journal(jnl, i) (jnl->sit_j.entries[i].se)
+#define segno_in_journal(jnl, i) (jnl->sit_j.entries[i].segno)
-#define MAX_NAT_JENTRIES(sum) (NAT_JOURNAL_ENTRIES - nats_in_cursum(sum))
-#define MAX_SIT_JENTRIES(sum) (SIT_JOURNAL_ENTRIES - sits_in_cursum(sum))
+#define MAX_NAT_JENTRIES(jnl) (NAT_JOURNAL_ENTRIES - nats_in_cursum(jnl))
+#define MAX_SIT_JENTRIES(jnl) (SIT_JOURNAL_ENTRIES - sits_in_cursum(jnl))
-static inline int update_nats_in_cursum(struct f2fs_summary_block *rs, int i)
+static inline int update_nats_in_cursum(struct f2fs_journal *journal, int i)
{
- int before = nats_in_cursum(rs);
- rs->n_nats = cpu_to_le16(before + i);
+ int before = nats_in_cursum(journal);
+ journal->n_nats = cpu_to_le16(before + i);
return before;
}
-static inline int update_sits_in_cursum(struct f2fs_summary_block *rs, int i)
+static inline int update_sits_in_cursum(struct f2fs_journal *journal, int i)
{
- int before = sits_in_cursum(rs);
- rs->n_sits = cpu_to_le16(before + i);
+ int before = sits_in_cursum(journal);
+ journal->n_sits = cpu_to_le16(before + i);
return before;
}
-static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size,
- int type)
+static inline bool __has_cursum_space(struct f2fs_journal *journal,
+ int size, int type)
{
if (type == NAT_JOURNAL)
- return size <= MAX_NAT_JENTRIES(sum);
- return size <= MAX_SIT_JENTRIES(sum);
+ return size <= MAX_NAT_JENTRIES(journal);
+ return size <= MAX_SIT_JENTRIES(journal);
}
/*
@@ -233,12 +212,9 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size,
#define F2FS_IOC_WRITE_CHECKPOINT _IO(F2FS_IOCTL_MAGIC, 7)
#define F2FS_IOC_DEFRAGMENT _IO(F2FS_IOCTL_MAGIC, 8)
-#define F2FS_IOC_SET_ENCRYPTION_POLICY \
- _IOR('f', 19, struct f2fs_encryption_policy)
-#define F2FS_IOC_GET_ENCRYPTION_PWSALT \
- _IOW('f', 20, __u8[16])
-#define F2FS_IOC_GET_ENCRYPTION_POLICY \
- _IOW('f', 21, struct f2fs_encryption_policy)
+#define F2FS_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY
+#define F2FS_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY
+#define F2FS_IOC_GET_ENCRYPTION_PWSALT FS_IOC_GET_ENCRYPTION_PWSALT
/*
* should be same as XFS_IOC_GOINGDOWN.
@@ -268,25 +244,6 @@ struct f2fs_defragment {
* For INODE and NODE manager
*/
/* for directory operations */
-struct f2fs_str {
- unsigned char *name;
- u32 len;
-};
-
-struct f2fs_filename {
- const struct qstr *usr_fname;
- struct f2fs_str disk_name;
- f2fs_hash_t hash;
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
- struct f2fs_str crypto_buf;
-#endif
-};
-
-#define FSTR_INIT(n, l) { .name = n, .len = l }
-#define FSTR_TO_QSTR(f) QSTR_INIT((f)->name, (f)->len)
-#define fname_name(p) ((p)->disk_name.name)
-#define fname_len(p) ((p)->disk_name.len)
-
struct f2fs_dentry_ptr {
struct inode *inode;
const void *bitmap;
@@ -354,6 +311,7 @@ struct extent_node {
struct rb_node rb_node; /* rb node located in rb-tree */
struct list_head list; /* node in global extent list of sbi */
struct extent_info ei; /* extent info */
+ struct extent_tree *et; /* extent tree pointer */
};
struct extent_tree {
@@ -382,6 +340,7 @@ struct f2fs_map_blocks {
block_t m_lblk;
unsigned int m_len;
unsigned int m_flags;
+ pgoff_t *m_next_pgofs; /* point next possible non-hole pgofs */
};
/* for flag in get_data_block */
@@ -389,6 +348,8 @@ struct f2fs_map_blocks {
#define F2FS_GET_BLOCK_DIO 1
#define F2FS_GET_BLOCK_FIEMAP 2
#define F2FS_GET_BLOCK_BMAP 3
+#define F2FS_GET_BLOCK_PRE_DIO 4
+#define F2FS_GET_BLOCK_PRE_AIO 5
/*
* i_advise uses FADVISE_XXX_BIT. We can add additional hints later.
@@ -410,15 +371,6 @@ struct f2fs_map_blocks {
#define file_enc_name(inode) is_file(inode, FADVISE_ENC_NAME_BIT)
#define file_set_enc_name(inode) set_file(inode, FADVISE_ENC_NAME_BIT)
-/* Encryption algorithms */
-#define F2FS_ENCRYPTION_MODE_INVALID 0
-#define F2FS_ENCRYPTION_MODE_AES_256_XTS 1
-#define F2FS_ENCRYPTION_MODE_AES_256_GCM 2
-#define F2FS_ENCRYPTION_MODE_AES_256_CBC 3
-#define F2FS_ENCRYPTION_MODE_AES_256_CTS 4
-
-#include "f2fs_crypto.h"
-
#define DEF_DIR_LEVEL 0
struct f2fs_inode_info {
@@ -442,13 +394,7 @@ struct f2fs_inode_info {
struct list_head dirty_list; /* linked in global dirty list */
struct list_head inmem_pages; /* inmemory pages managed by f2fs */
struct mutex inmem_lock; /* lock for inmemory pages */
-
struct extent_tree *extent_tree; /* cached extent_tree entry */
-
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
- /* Encryption params */
- struct f2fs_crypt_info *i_crypt_info;
-#endif
};
static inline void get_extent_info(struct extent_info *ext,
@@ -515,6 +461,7 @@ struct f2fs_nm_info {
nid_t next_scan_nid; /* the next nid to be scanned */
unsigned int ram_thresh; /* control the memory footprint */
unsigned int ra_nid_pages; /* # of nid pages to be readaheaded */
+ unsigned int dirty_nats_ratio; /* control dirty nats ratio threshold */
/* NAT cache management */
struct radix_tree_root nat_root;/* root of the nat entry cache */
@@ -549,6 +496,8 @@ struct dnode_of_data {
unsigned int ofs_in_node; /* data offset in the node page */
bool inode_page_locked; /* inode page is locked or not */
bool node_changed; /* is node block changed */
+ char cur_level; /* level of hole node page */
+ char max_level; /* level of current page located */
block_t data_blkaddr; /* block address of the node block */
};
@@ -679,6 +628,7 @@ enum page_type {
META_FLUSH,
INMEM, /* the below types are used by tracepoints only. */
INMEM_DROP,
+ INMEM_REVOKE,
IPU,
OPU,
};
@@ -687,7 +637,8 @@ struct f2fs_io_info {
struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */
enum page_type type; /* contains DATA/NODE/META/META_FLUSH */
int rw; /* contains R/RS/W/WS with REQ_META/REQ_PRIO */
- block_t blk_addr; /* block address to be written */
+ block_t new_blkaddr; /* new block address to be written */
+ block_t old_blkaddr; /* old block address before Cow */
struct page *page; /* page to be written */
struct page *encrypted_page; /* encrypted page */
};
@@ -844,8 +795,22 @@ struct f2fs_sb_info {
struct list_head s_list;
struct mutex umount_mutex;
unsigned int shrinker_run_no;
+
+ /* For write statistics */
+ u64 sectors_written_start;
+ u64 kbytes_written;
+
+ /* Reference to checksum algorithm driver via cryptoapi */
+ struct crypto_shash *s_chksum_driver;
};
+/* For write statistics. Suppose sector size is 512 bytes,
+ * and the return value is in kbytes. s is of struct f2fs_sb_info.
+ */
+#define BD_PART_WRITTEN(s) \
+(((u64)part_stat_read(s->sb->s_bdev->bd_part, sectors[1]) - \
+ s->sectors_written_start) >> 1)
+
static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type)
{
sbi->last_time[type] = jiffies;
@@ -874,6 +839,29 @@ static inline bool is_idle(struct f2fs_sb_info *sbi)
/*
* Inline functions
*/
+static inline u32 f2fs_crc32(struct f2fs_sb_info *sbi, const void *address,
+ unsigned int length)
+{
+ SHASH_DESC_ON_STACK(shash, sbi->s_chksum_driver);
+ u32 *ctx = (u32 *)shash_desc_ctx(shash);
+ int err;
+
+ shash->tfm = sbi->s_chksum_driver;
+ shash->flags = 0;
+ *ctx = F2FS_SUPER_MAGIC;
+
+ err = crypto_shash_update(shash, address, length);
+ BUG_ON(err);
+
+ return *ctx;
+}
+
+static inline bool f2fs_crc_valid(struct f2fs_sb_info *sbi, __u32 blk_crc,
+ void *buf, size_t buf_size)
+{
+ return f2fs_crc32(sbi, buf, buf_size) == blk_crc;
+}
+
static inline struct f2fs_inode_info *F2FS_I(struct inode *inode)
{
return container_of(inode, struct f2fs_inode_info, vfs_inode);
@@ -1006,7 +994,7 @@ static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi)
static inline void f2fs_lock_all(struct f2fs_sb_info *sbi)
{
- f2fs_down_write(&sbi->cp_rwsem, &sbi->cp_mutex);
+ down_write(&sbi->cp_rwsem);
}
static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi)
@@ -1525,9 +1513,9 @@ static inline int f2fs_has_inline_xattr(struct inode *inode)
return is_inode_flag_set(F2FS_I(inode), FI_INLINE_XATTR);
}
-static inline unsigned int addrs_per_inode(struct f2fs_inode_info *fi)
+static inline unsigned int addrs_per_inode(struct inode *inode)
{
- if (f2fs_has_inline_xattr(&fi->vfs_inode))
+ if (f2fs_has_inline_xattr(inode))
return DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS;
return DEF_ADDRS_PER_INODE;
}
@@ -1681,10 +1669,10 @@ static inline void *f2fs_kvzalloc(size_t size, gfp_t flags)
(F2FS_I(i)->i_acl_mode) : ((i)->i_mode))
/* get offset of first page in next direct node */
-#define PGOFS_OF_NEXT_DNODE(pgofs, fi) \
- ((pgofs < ADDRS_PER_INODE(fi)) ? ADDRS_PER_INODE(fi) : \
- (pgofs - ADDRS_PER_INODE(fi) + ADDRS_PER_BLOCK) / \
- ADDRS_PER_BLOCK * ADDRS_PER_BLOCK + ADDRS_PER_INODE(fi))
+#define PGOFS_OF_NEXT_DNODE(pgofs, inode) \
+ ((pgofs < ADDRS_PER_INODE(inode)) ? ADDRS_PER_INODE(inode) : \
+ (pgofs - ADDRS_PER_INODE(inode) + ADDRS_PER_BLOCK) / \
+ ADDRS_PER_BLOCK * ADDRS_PER_BLOCK + ADDRS_PER_INODE(inode))
/*
* file.c
@@ -1723,10 +1711,10 @@ struct dentry *f2fs_get_parent(struct dentry *child);
extern unsigned char f2fs_filetype_table[F2FS_FT_MAX];
void set_de_type(struct f2fs_dir_entry *, umode_t);
-struct f2fs_dir_entry *find_target_dentry(struct f2fs_filename *,
+struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *,
f2fs_hash_t, int *, struct f2fs_dentry_ptr *);
bool f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *,
- unsigned int, struct f2fs_str *);
+ unsigned int, struct fscrypt_str *);
void do_make_empty_dir(struct inode *, struct inode *,
struct f2fs_dentry_ptr *);
struct page *init_inode_metadata(struct inode *, struct inode *,
@@ -1763,6 +1751,7 @@ int f2fs_commit_super(struct f2fs_sb_info *, bool);
int f2fs_sync_fs(struct super_block *, int);
extern __printf(3, 4)
void f2fs_msg(struct super_block *, const char *, const char *, ...);
+int sanity_check_ckpt(struct f2fs_sb_info *sbi);
/*
* hash.c
@@ -1780,6 +1769,7 @@ int need_dentry_mark(struct f2fs_sb_info *, nid_t);
bool is_checkpointed_node(struct f2fs_sb_info *, nid_t);
bool need_inode_block_update(struct f2fs_sb_info *, nid_t);
void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *);
+pgoff_t get_next_page_offset(struct dnode_of_data *, pgoff_t);
int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int);
int truncate_inode_blocks(struct inode *, pgoff_t);
int truncate_xattr_node(struct inode *, struct page *);
@@ -1811,7 +1801,8 @@ void destroy_node_manager_caches(void);
* segment.c
*/
void register_inmem_page(struct inode *, struct page *);
-int commit_inmem_pages(struct inode *, bool);
+void drop_inmem_pages(struct inode *);
+int commit_inmem_pages(struct inode *);
void f2fs_balance_fs(struct f2fs_sb_info *, bool);
void f2fs_balance_fs_bg(struct f2fs_sb_info *);
int f2fs_issue_flush(struct f2fs_sb_info *);
@@ -1832,16 +1823,17 @@ void write_meta_page(struct f2fs_sb_info *, struct page *);
void write_node_page(unsigned int, struct f2fs_io_info *);
void write_data_page(struct dnode_of_data *, struct f2fs_io_info *);
void rewrite_data_page(struct f2fs_io_info *);
+void __f2fs_replace_block(struct f2fs_sb_info *, struct f2fs_summary *,
+ block_t, block_t, bool, bool);
void f2fs_replace_block(struct f2fs_sb_info *, struct dnode_of_data *,
- block_t, block_t, unsigned char, bool);
+ block_t, block_t, unsigned char, bool, bool);
void allocate_data_block(struct f2fs_sb_info *, struct page *,
block_t, block_t *, struct f2fs_summary *, int);
-void f2fs_wait_on_page_writeback(struct page *, enum page_type);
+void f2fs_wait_on_page_writeback(struct page *, enum page_type, bool);
void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *, block_t);
void write_data_summaries(struct f2fs_sb_info *, block_t);
void write_node_summaries(struct f2fs_sb_info *, block_t);
-int lookup_journal_in_cursum(struct f2fs_summary_block *,
- int, unsigned int, int);
+int lookup_journal_in_cursum(struct f2fs_journal *, int, unsigned int, int);
void flush_sit_entries(struct f2fs_sb_info *, struct cp_control *);
int build_segment_manager(struct f2fs_sb_info *);
void destroy_segment_manager(struct f2fs_sb_info *);
@@ -1881,11 +1873,16 @@ void destroy_checkpoint_caches(void);
* data.c
*/
void f2fs_submit_merged_bio(struct f2fs_sb_info *, enum page_type, int);
+void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *, struct inode *,
+ struct page *, nid_t, enum page_type, int);
+void f2fs_flush_merged_bios(struct f2fs_sb_info *);
int f2fs_submit_page_bio(struct f2fs_io_info *);
void f2fs_submit_page_mbio(struct f2fs_io_info *);
void set_data_blkaddr(struct dnode_of_data *);
+void f2fs_update_data_blkaddr(struct dnode_of_data *, block_t);
int reserve_new_block(struct dnode_of_data *);
int f2fs_get_block(struct dnode_of_data *, pgoff_t);
+ssize_t f2fs_preallocate_blocks(struct kiocb *, struct iov_iter *);
int f2fs_reserve_block(struct dnode_of_data *, pgoff_t);
struct page *get_read_data_page(struct inode *, pgoff_t, int, bool);
struct page *find_data_page(struct inode *, pgoff_t);
@@ -1902,7 +1899,7 @@ int f2fs_release_page(struct page *, gfp_t);
*/
int start_gc_thread(struct f2fs_sb_info *);
void stop_gc_thread(struct f2fs_sb_info *);
-block_t start_bidx_of_node(unsigned int, struct f2fs_inode_info *);
+block_t start_bidx_of_node(unsigned int, struct inode *);
int f2fs_gc(struct f2fs_sb_info *, bool);
void build_gc_manager(struct f2fs_sb_info *);
@@ -2093,7 +2090,7 @@ int f2fs_convert_inline_inode(struct inode *);
int f2fs_write_inline_data(struct inode *, struct page *);
bool recover_inline_data(struct inode *, struct page *);
struct f2fs_dir_entry *find_in_inline_dir(struct inode *,
- struct f2fs_filename *, struct page **);
+ struct fscrypt_name *, struct page **);
struct f2fs_dir_entry *f2fs_parent_inline_dir(struct inode *, struct page **);
int make_empty_inline_dir(struct inode *inode, struct inode *, struct page *);
int f2fs_add_inline_entry(struct inode *, const struct qstr *, struct inode *,
@@ -2102,7 +2099,7 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *, struct page *,
struct inode *, struct inode *);
bool f2fs_empty_inline_dir(struct inode *);
int f2fs_read_inline_dir(struct file *, struct dir_context *,
- struct f2fs_str *);
+ struct fscrypt_str *);
int f2fs_inline_data_fiemap(struct inode *,
struct fiemap_extent_info *, __u64, __u64);
@@ -2132,13 +2129,9 @@ void destroy_extent_cache(void);
/*
* crypto support
*/
-static inline int f2fs_encrypted_inode(struct inode *inode)
+static inline bool f2fs_encrypted_inode(struct inode *inode)
{
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
return file_is_encrypt(inode);
-#else
- return 0;
-#endif
}
static inline void f2fs_set_encrypted_inode(struct inode *inode)
@@ -2150,20 +2143,12 @@ static inline void f2fs_set_encrypted_inode(struct inode *inode)
static inline bool f2fs_bio_encrypted(struct bio *bio)
{
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
- return unlikely(bio->bi_private != NULL);
-#else
- return false;
-#endif
+ return bio->bi_private != NULL;
}
static inline int f2fs_sb_has_crypto(struct super_block *sb)
{
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_ENCRYPT);
-#else
- return 0;
-#endif
}
static inline bool f2fs_may_encrypt(struct inode *inode)
@@ -2177,86 +2162,28 @@ static inline bool f2fs_may_encrypt(struct inode *inode)
#endif
}
-/* crypto_policy.c */
-int f2fs_is_child_context_consistent_with_parent(struct inode *,
- struct inode *);
-int f2fs_inherit_context(struct inode *, struct inode *, struct page *);
-int f2fs_process_policy(const struct f2fs_encryption_policy *, struct inode *);
-int f2fs_get_policy(struct inode *, struct f2fs_encryption_policy *);
-
-/* crypt.c */
-extern struct kmem_cache *f2fs_crypt_info_cachep;
-bool f2fs_valid_contents_enc_mode(uint32_t);
-uint32_t f2fs_validate_encryption_key_size(uint32_t, uint32_t);
-struct f2fs_crypto_ctx *f2fs_get_crypto_ctx(struct inode *);
-void f2fs_release_crypto_ctx(struct f2fs_crypto_ctx *);
-struct page *f2fs_encrypt(struct inode *, struct page *);
-int f2fs_decrypt(struct f2fs_crypto_ctx *, struct page *);
-int f2fs_decrypt_one(struct inode *, struct page *);
-void f2fs_end_io_crypto_work(struct f2fs_crypto_ctx *, struct bio *);
-
-/* crypto_key.c */
-void f2fs_free_encryption_info(struct inode *, struct f2fs_crypt_info *);
-int _f2fs_get_encryption_info(struct inode *inode);
-
-/* crypto_fname.c */
-bool f2fs_valid_filenames_enc_mode(uint32_t);
-u32 f2fs_fname_crypto_round_up(u32, u32);
-int f2fs_fname_crypto_alloc_buffer(struct inode *, u32, struct f2fs_str *);
-int f2fs_fname_disk_to_usr(struct inode *, f2fs_hash_t *,
- const struct f2fs_str *, struct f2fs_str *);
-int f2fs_fname_usr_to_disk(struct inode *, const struct qstr *,
- struct f2fs_str *);
-
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
-void f2fs_restore_and_release_control_page(struct page **);
-void f2fs_restore_control_page(struct page *);
-
-int __init f2fs_init_crypto(void);
-int f2fs_crypto_initialize(void);
-void f2fs_exit_crypto(void);
-
-int f2fs_has_encryption_key(struct inode *);
-
-static inline int f2fs_get_encryption_info(struct inode *inode)
-{
- struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info;
-
- if (!ci ||
- (ci->ci_keyring_key &&
- (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) |
- (1 << KEY_FLAG_REVOKED) |
- (1 << KEY_FLAG_DEAD)))))
- return _f2fs_get_encryption_info(inode);
- return 0;
-}
-
-void f2fs_fname_crypto_free_buffer(struct f2fs_str *);
-int f2fs_fname_setup_filename(struct inode *, const struct qstr *,
- int lookup, struct f2fs_filename *);
-void f2fs_fname_free_filename(struct f2fs_filename *);
-#else
-static inline void f2fs_restore_and_release_control_page(struct page **p) { }
-static inline void f2fs_restore_control_page(struct page *p) { }
-
-static inline int __init f2fs_init_crypto(void) { return 0; }
-static inline void f2fs_exit_crypto(void) { }
-
-static inline int f2fs_has_encryption_key(struct inode *i) { return 0; }
-static inline int f2fs_get_encryption_info(struct inode *i) { return 0; }
-static inline void f2fs_fname_crypto_free_buffer(struct f2fs_str *p) { }
-
-static inline int f2fs_fname_setup_filename(struct inode *dir,
- const struct qstr *iname,
- int lookup, struct f2fs_filename *fname)
-{
- memset(fname, 0, sizeof(struct f2fs_filename));
- fname->usr_fname = iname;
- fname->disk_name.name = (unsigned char *)iname->name;
- fname->disk_name.len = iname->len;
- return 0;
-}
-
-static inline void f2fs_fname_free_filename(struct f2fs_filename *fname) { }
+#ifndef CONFIG_F2FS_FS_ENCRYPTION
+#define fscrypt_set_d_op(i)
+#define fscrypt_get_ctx fscrypt_notsupp_get_ctx
+#define fscrypt_release_ctx fscrypt_notsupp_release_ctx
+#define fscrypt_encrypt_page fscrypt_notsupp_encrypt_page
+#define fscrypt_decrypt_page fscrypt_notsupp_decrypt_page
+#define fscrypt_decrypt_bio_pages fscrypt_notsupp_decrypt_bio_pages
+#define fscrypt_pullback_bio_page fscrypt_notsupp_pullback_bio_page
+#define fscrypt_restore_control_page fscrypt_notsupp_restore_control_page
+#define fscrypt_zeroout_range fscrypt_notsupp_zeroout_range
+#define fscrypt_process_policy fscrypt_notsupp_process_policy
+#define fscrypt_get_policy fscrypt_notsupp_get_policy
+#define fscrypt_has_permitted_context fscrypt_notsupp_has_permitted_context
+#define fscrypt_inherit_context fscrypt_notsupp_inherit_context
+#define fscrypt_get_encryption_info fscrypt_notsupp_get_encryption_info
+#define fscrypt_put_encryption_info fscrypt_notsupp_put_encryption_info
+#define fscrypt_setup_filename fscrypt_notsupp_setup_filename
+#define fscrypt_free_filename fscrypt_notsupp_free_filename
+#define fscrypt_fname_encrypted_size fscrypt_notsupp_fname_encrypted_size
+#define fscrypt_fname_alloc_buffer fscrypt_notsupp_fname_alloc_buffer
+#define fscrypt_fname_free_buffer fscrypt_notsupp_fname_free_buffer
+#define fscrypt_fname_disk_to_usr fscrypt_notsupp_fname_disk_to_usr
+#define fscrypt_fname_usr_to_disk fscrypt_notsupp_fname_usr_to_disk
#endif
#endif
diff --git a/fs/f2fs/f2fs_crypto.h b/fs/f2fs/f2fs_crypto.h
deleted file mode 100644
index c2c1c2b63b25..000000000000
--- a/fs/f2fs/f2fs_crypto.h
+++ /dev/null
@@ -1,151 +0,0 @@
-/*
- * linux/fs/f2fs/f2fs_crypto.h
- *
- * Copied from linux/fs/ext4/ext4_crypto.h
- *
- * Copyright (C) 2015, Google, Inc.
- *
- * This contains encryption header content for f2fs
- *
- * Written by Michael Halcrow, 2015.
- * Modified by Jaegeuk Kim, 2015.
- */
-#ifndef _F2FS_CRYPTO_H
-#define _F2FS_CRYPTO_H
-
-#include <linux/fs.h>
-
-#define F2FS_KEY_DESCRIPTOR_SIZE 8
-
-/* Policy provided via an ioctl on the topmost directory */
-struct f2fs_encryption_policy {
- char version;
- char contents_encryption_mode;
- char filenames_encryption_mode;
- char flags;
- char master_key_descriptor[F2FS_KEY_DESCRIPTOR_SIZE];
-} __attribute__((__packed__));
-
-#define F2FS_ENCRYPTION_CONTEXT_FORMAT_V1 1
-#define F2FS_KEY_DERIVATION_NONCE_SIZE 16
-
-#define F2FS_POLICY_FLAGS_PAD_4 0x00
-#define F2FS_POLICY_FLAGS_PAD_8 0x01
-#define F2FS_POLICY_FLAGS_PAD_16 0x02
-#define F2FS_POLICY_FLAGS_PAD_32 0x03
-#define F2FS_POLICY_FLAGS_PAD_MASK 0x03
-#define F2FS_POLICY_FLAGS_VALID 0x03
-
-/**
- * Encryption context for inode
- *
- * Protector format:
- * 1 byte: Protector format (1 = this version)
- * 1 byte: File contents encryption mode
- * 1 byte: File names encryption mode
- * 1 byte: Flags
- * 8 bytes: Master Key descriptor
- * 16 bytes: Encryption Key derivation nonce
- */
-struct f2fs_encryption_context {
- char format;
- char contents_encryption_mode;
- char filenames_encryption_mode;
- char flags;
- char master_key_descriptor[F2FS_KEY_DESCRIPTOR_SIZE];
- char nonce[F2FS_KEY_DERIVATION_NONCE_SIZE];
-} __attribute__((__packed__));
-
-/* Encryption parameters */
-#define F2FS_XTS_TWEAK_SIZE 16
-#define F2FS_AES_128_ECB_KEY_SIZE 16
-#define F2FS_AES_256_GCM_KEY_SIZE 32
-#define F2FS_AES_256_CBC_KEY_SIZE 32
-#define F2FS_AES_256_CTS_KEY_SIZE 32
-#define F2FS_AES_256_XTS_KEY_SIZE 64
-#define F2FS_MAX_KEY_SIZE 64
-
-#define F2FS_KEY_DESC_PREFIX "f2fs:"
-#define F2FS_KEY_DESC_PREFIX_SIZE 5
-
-struct f2fs_encryption_key {
- __u32 mode;
- char raw[F2FS_MAX_KEY_SIZE];
- __u32 size;
-} __attribute__((__packed__));
-
-struct f2fs_crypt_info {
- char ci_data_mode;
- char ci_filename_mode;
- char ci_flags;
- struct crypto_ablkcipher *ci_ctfm;
- struct key *ci_keyring_key;
- char ci_master_key[F2FS_KEY_DESCRIPTOR_SIZE];
-};
-
-#define F2FS_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001
-#define F2FS_WRITE_PATH_FL 0x00000002
-
-struct f2fs_crypto_ctx {
- union {
- struct {
- struct page *bounce_page; /* Ciphertext page */
- struct page *control_page; /* Original page */
- } w;
- struct {
- struct bio *bio;
- struct work_struct work;
- } r;
- struct list_head free_list; /* Free list */
- };
- char flags; /* Flags */
-};
-
-struct f2fs_completion_result {
- struct completion completion;
- int res;
-};
-
-#define DECLARE_F2FS_COMPLETION_RESULT(ecr) \
- struct f2fs_completion_result ecr = { \
- COMPLETION_INITIALIZER((ecr).completion), 0 }
-
-static inline int f2fs_encryption_key_size(int mode)
-{
- switch (mode) {
- case F2FS_ENCRYPTION_MODE_AES_256_XTS:
- return F2FS_AES_256_XTS_KEY_SIZE;
- case F2FS_ENCRYPTION_MODE_AES_256_GCM:
- return F2FS_AES_256_GCM_KEY_SIZE;
- case F2FS_ENCRYPTION_MODE_AES_256_CBC:
- return F2FS_AES_256_CBC_KEY_SIZE;
- case F2FS_ENCRYPTION_MODE_AES_256_CTS:
- return F2FS_AES_256_CTS_KEY_SIZE;
- default:
- BUG();
- }
- return 0;
-}
-
-#define F2FS_FNAME_NUM_SCATTER_ENTRIES 4
-#define F2FS_CRYPTO_BLOCK_SIZE 16
-#define F2FS_FNAME_CRYPTO_DIGEST_SIZE 32
-
-/**
- * For encrypted symlinks, the ciphertext length is stored at the beginning
- * of the string in little-endian format.
- */
-struct f2fs_encrypted_symlink_data {
- __le16 len;
- char encrypted_path[1];
-} __attribute__((__packed__));
-
-/**
- * This function is used to calculate the disk space required to
- * store a filename of length l in encrypted symlink format.
- */
-static inline u32 encrypted_symlink_data_len(u32 l)
-{
- return (l + sizeof(struct f2fs_encrypted_symlink_data) - 1);
-}
-#endif /* _F2FS_CRYPTO_H */
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index ea272be62677..b41c3579ea9e 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -86,7 +86,7 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
trace_f2fs_vm_page_mkwrite(page, DATA);
mapped:
/* fill the page */
- f2fs_wait_on_page_writeback(page, DATA);
+ f2fs_wait_on_page_writeback(page, DATA, false);
/* wait for GCed encrypted page writeback */
if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
@@ -301,7 +301,7 @@ static pgoff_t __get_first_dirty_index(struct address_space *mapping,
pagevec_init(&pvec, 0);
nr_pages = pagevec_lookup_tag(&pvec, mapping, &pgofs,
PAGECACHE_TAG_DIRTY, 1);
- pgofs = nr_pages ? pvec.pages[0]->index : LONG_MAX;
+ pgofs = nr_pages ? pvec.pages[0]->index : ULONG_MAX;
pagevec_release(&pvec);
return pgofs;
}
@@ -358,15 +358,14 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
} else if (err == -ENOENT) {
/* direct node does not exists */
if (whence == SEEK_DATA) {
- pgofs = PGOFS_OF_NEXT_DNODE(pgofs,
- F2FS_I(inode));
+ pgofs = get_next_page_offset(&dn, pgofs);
continue;
} else {
goto found;
}
}
- end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
+ end_offset = ADDRS_PER_PAGE(dn.node_page, inode);
/* find data/hole in dnode block */
for (; dn.ofs_in_node < end_offset;
@@ -422,9 +421,11 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
int err;
if (f2fs_encrypted_inode(inode)) {
- err = f2fs_get_encryption_info(inode);
+ err = fscrypt_get_encryption_info(inode);
if (err)
return 0;
+ if (!f2fs_encrypted_inode(inode))
+ return -ENOKEY;
}
/* we don't need to use inline_data strictly */
@@ -440,12 +441,18 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
static int f2fs_file_open(struct inode *inode, struct file *filp)
{
int ret = generic_file_open(inode, filp);
+ struct inode *dir = filp->f_path.dentry->d_parent->d_inode;
if (!ret && f2fs_encrypted_inode(inode)) {
- ret = f2fs_get_encryption_info(inode);
+ ret = fscrypt_get_encryption_info(inode);
if (ret)
- ret = -EACCES;
+ return -EACCES;
+ if (!fscrypt_has_encryption_key(inode))
+ return -ENOKEY;
}
+ if (f2fs_encrypted_inode(dir) &&
+ !fscrypt_has_permitted_context(dir, inode))
+ return -EPERM;
return ret;
}
@@ -480,7 +487,7 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
* we will invalidate all blkaddr in the whole range.
*/
fofs = start_bidx_of_node(ofs_of_node(dn->node_page),
- F2FS_I(dn->inode)) + ofs;
+ dn->inode) + ofs;
f2fs_update_extent_cache_range(dn, fofs, 0, len);
dec_valid_block_count(sbi, dn->inode, nr_free);
sync_inode_page(dn);
@@ -521,9 +528,10 @@ static int truncate_partial_data_page(struct inode *inode, u64 from,
if (IS_ERR(page))
return 0;
truncate_out:
- f2fs_wait_on_page_writeback(page, DATA);
+ f2fs_wait_on_page_writeback(page, DATA, true);
zero_user(page, offset, PAGE_CACHE_SIZE - offset);
- if (!cache_only || !f2fs_encrypted_inode(inode) || !S_ISREG(inode->i_mode))
+ if (!cache_only || !f2fs_encrypted_inode(inode) ||
+ !S_ISREG(inode->i_mode))
set_page_dirty(page);
f2fs_put_page(page, 1);
return 0;
@@ -568,7 +576,7 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock)
goto out;
}
- count = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
+ count = ADDRS_PER_PAGE(dn.node_page, inode);
count -= dn.ofs_in_node;
f2fs_bug_on(sbi, count < 0);
@@ -671,7 +679,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
if (attr->ia_valid & ATTR_SIZE) {
if (f2fs_encrypted_inode(inode) &&
- f2fs_get_encryption_info(inode))
+ fscrypt_get_encryption_info(inode))
return -EACCES;
if (attr->ia_size <= i_size_read(inode)) {
@@ -743,7 +751,7 @@ static int fill_zero(struct inode *inode, pgoff_t index,
if (IS_ERR(page))
return PTR_ERR(page);
- f2fs_wait_on_page_writeback(page, DATA);
+ f2fs_wait_on_page_writeback(page, DATA, true);
zero_user(page, start, len);
set_page_dirty(page);
f2fs_put_page(page, 1);
@@ -768,7 +776,7 @@ int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end)
return err;
}
- end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
+ end_offset = ADDRS_PER_PAGE(dn.node_page, inode);
count = min(end_offset - dn.ofs_in_node, pg_end - pg_start);
f2fs_bug_on(F2FS_I_SB(inode), count == 0 || count > end_offset);
@@ -854,10 +862,8 @@ static int __exchange_data_block(struct inode *inode, pgoff_t src,
} else {
new_addr = dn.data_blkaddr;
if (!is_checkpointed_data(sbi, new_addr)) {
- dn.data_blkaddr = NULL_ADDR;
/* do not invalidate this block address */
- set_data_blkaddr(&dn);
- f2fs_update_extent_cache(&dn);
+ f2fs_update_data_blkaddr(&dn, NULL_ADDR);
do_replace = true;
}
f2fs_put_dnode(&dn);
@@ -884,7 +890,7 @@ static int __exchange_data_block(struct inode *inode, pgoff_t src,
get_node_info(sbi, dn.nid, &ni);
f2fs_replace_block(sbi, &dn, dn.data_blkaddr, new_addr,
- ni.version, true);
+ ni.version, true, false);
f2fs_put_dnode(&dn);
} else {
struct page *psrc, *pdst;
@@ -892,7 +898,7 @@ static int __exchange_data_block(struct inode *inode, pgoff_t src,
psrc = get_lock_data_page(inode, src, true);
if (IS_ERR(psrc))
return PTR_ERR(psrc);
- pdst = get_new_data_page(inode, NULL, dst, false);
+ pdst = get_new_data_page(inode, NULL, dst, true);
if (IS_ERR(pdst)) {
f2fs_put_page(psrc, 1);
return PTR_ERR(pdst);
@@ -908,9 +914,7 @@ static int __exchange_data_block(struct inode *inode, pgoff_t src,
err_out:
if (!get_dnode_of_data(&dn, src, LOOKUP_NODE)) {
- dn.data_blkaddr = new_addr;
- set_data_blkaddr(&dn);
- f2fs_update_extent_cache(&dn);
+ f2fs_update_data_blkaddr(&dn, new_addr);
f2fs_put_dnode(&dn);
}
return ret;
@@ -1050,12 +1054,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
if (dn.data_blkaddr != NEW_ADDR) {
invalidate_blocks(sbi, dn.data_blkaddr);
-
- dn.data_blkaddr = NEW_ADDR;
- set_data_blkaddr(&dn);
-
- dn.data_blkaddr = NULL_ADDR;
- f2fs_update_extent_cache(&dn);
+ f2fs_update_data_blkaddr(&dn, NEW_ADDR);
}
f2fs_put_dnode(&dn);
f2fs_unlock_op(sbi);
@@ -1253,7 +1252,7 @@ static int f2fs_release_file(struct inode *inode, struct file *filp)
{
/* some remained atomic pages should discarded */
if (f2fs_is_atomic_file(inode))
- commit_inmem_pages(inode, true);
+ drop_inmem_pages(inode);
if (f2fs_is_volatile_file(inode)) {
set_inode_flag(F2FS_I(inode), FI_DROP_CACHE);
filemap_fdatawrite(inode->i_mapping);
@@ -1377,7 +1376,7 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp)
if (f2fs_is_atomic_file(inode)) {
clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
- ret = commit_inmem_pages(inode, false);
+ ret = commit_inmem_pages(inode);
if (ret) {
set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
goto err_out;
@@ -1440,7 +1439,7 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp)
if (f2fs_is_atomic_file(inode)) {
clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
- commit_inmem_pages(inode, true);
+ drop_inmem_pages(inode);
}
if (f2fs_is_volatile_file(inode)) {
clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
@@ -1535,39 +1534,30 @@ static bool uuid_is_nonzero(__u8 u[16])
static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg)
{
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
- struct f2fs_encryption_policy policy;
+ struct fscrypt_policy policy;
struct inode *inode = file_inode(filp);
- if (copy_from_user(&policy, (struct f2fs_encryption_policy __user *)arg,
- sizeof(policy)))
+ if (copy_from_user(&policy, (struct fscrypt_policy __user *)arg,
+ sizeof(policy)))
return -EFAULT;
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
- return f2fs_process_policy(&policy, inode);
-#else
- return -EOPNOTSUPP;
-#endif
+ return fscrypt_process_policy(inode, &policy);
}
static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg)
{
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
- struct f2fs_encryption_policy policy;
+ struct fscrypt_policy policy;
struct inode *inode = file_inode(filp);
int err;
- err = f2fs_get_policy(inode, &policy);
+ err = fscrypt_get_policy(inode, &policy);
if (err)
return err;
- if (copy_to_user((struct f2fs_encryption_policy __user *)arg, &policy,
- sizeof(policy)))
+ if (copy_to_user((struct fscrypt_policy __user *)arg, &policy, sizeof(policy)))
return -EFAULT;
return 0;
-#else
- return -EOPNOTSUPP;
-#endif
}
static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg)
@@ -1648,7 +1638,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
struct f2fs_defragment *range)
{
struct inode *inode = file_inode(filp);
- struct f2fs_map_blocks map;
+ struct f2fs_map_blocks map = { .m_next_pgofs = NULL };
struct extent_info ei;
pgoff_t pg_start, pg_end;
unsigned int blk_per_seg = sbi->blocks_per_seg;
@@ -1874,14 +1864,32 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
- struct inode *inode = file_inode(iocb->ki_filp);
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
+ ssize_t ret;
if (f2fs_encrypted_inode(inode) &&
- !f2fs_has_encryption_key(inode) &&
- f2fs_get_encryption_info(inode))
+ !fscrypt_has_encryption_key(inode) &&
+ fscrypt_get_encryption_info(inode))
return -EACCES;
- return generic_file_write_iter(iocb, from);
+ inode_lock(inode);
+ ret = generic_write_checks(iocb, from);
+ if (ret > 0) {
+ ret = f2fs_preallocate_blocks(iocb, from);
+ if (!ret)
+ ret = __generic_file_write_iter(iocb, from);
+ }
+ inode_unlock(inode);
+
+ if (ret > 0) {
+ ssize_t err;
+
+ err = generic_write_sync(file, iocb->ki_pos - ret, ret);
+ if (err < 0)
+ ret = err;
+ }
+ return ret;
}
#ifdef CONFIG_COMPAT
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index f610c2a9bdde..b0051a97824c 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -245,6 +245,18 @@ static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi,
return get_cb_cost(sbi, segno);
}
+static unsigned int count_bits(const unsigned long *addr,
+ unsigned int offset, unsigned int len)
+{
+ unsigned int end = offset + len, sum = 0;
+
+ while (offset < end) {
+ if (test_bit(offset++, addr))
+ ++sum;
+ }
+ return sum;
+}
+
/*
* This function is called from two paths.
* One is garbage collection and the other is SSR segment selection.
@@ -258,9 +270,9 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
struct victim_sel_policy p;
- unsigned int secno, max_cost;
+ unsigned int secno, max_cost, last_victim;
unsigned int last_segment = MAIN_SEGS(sbi);
- int nsearched = 0;
+ unsigned int nsearched = 0;
mutex_lock(&dirty_i->seglist_lock);
@@ -273,6 +285,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
if (p.max_search == 0)
goto out;
+ last_victim = sbi->last_victim[p.gc_mode];
if (p.alloc_mode == LFS && gc_type == FG_GC) {
p.min_segno = check_bg_victims(sbi);
if (p.min_segno != NULL_SEGNO)
@@ -295,27 +308,35 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
}
p.offset = segno + p.ofs_unit;
- if (p.ofs_unit > 1)
+ if (p.ofs_unit > 1) {
p.offset -= segno % p.ofs_unit;
+ nsearched += count_bits(p.dirty_segmap,
+ p.offset - p.ofs_unit,
+ p.ofs_unit);
+ } else {
+ nsearched++;
+ }
+
secno = GET_SECNO(sbi, segno);
if (sec_usage_check(sbi, secno))
- continue;
+ goto next;
if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap))
- continue;
+ goto next;
cost = get_gc_cost(sbi, segno, &p);
if (p.min_cost > cost) {
p.min_segno = segno;
p.min_cost = cost;
- } else if (unlikely(cost == max_cost)) {
- continue;
}
-
- if (nsearched++ >= p.max_search) {
- sbi->last_victim[p.gc_mode] = segno;
+next:
+ if (nsearched >= p.max_search) {
+ if (!sbi->last_victim[p.gc_mode] && segno <= last_victim)
+ sbi->last_victim[p.gc_mode] = last_victim + 1;
+ else
+ sbi->last_victim[p.gc_mode] = segno + 1;
break;
}
}
@@ -399,7 +420,7 @@ static int check_valid_map(struct f2fs_sb_info *sbi,
* On validity, copy that node with cold status, otherwise (invalid node)
* ignore that.
*/
-static int gc_node_segment(struct f2fs_sb_info *sbi,
+static void gc_node_segment(struct f2fs_sb_info *sbi,
struct f2fs_summary *sum, unsigned int segno, int gc_type)
{
bool initial = true;
@@ -419,7 +440,7 @@ next_step:
/* stop BG_GC if there is not enough free sections. */
if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0))
- return 0;
+ return;
if (check_valid_map(sbi, segno, off) == 0)
continue;
@@ -446,7 +467,7 @@ next_step:
/* set page dirty and write it */
if (gc_type == FG_GC) {
- f2fs_wait_on_page_writeback(node_page, NODE);
+ f2fs_wait_on_page_writeback(node_page, NODE, true);
set_page_dirty(node_page);
} else {
if (!PageWriteback(node_page))
@@ -460,20 +481,6 @@ next_step:
initial = false;
goto next_step;
}
-
- if (gc_type == FG_GC) {
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_ALL,
- .nr_to_write = LONG_MAX,
- .for_reclaim = 0,
- };
- sync_node_pages(sbi, 0, &wbc);
-
- /* return 1 only if FG_GC succefully reclaimed one */
- if (get_valid_blocks(sbi, segno, 1) == 0)
- return 1;
- }
- return 0;
}
/*
@@ -483,7 +490,7 @@ next_step:
* as indirect or double indirect node blocks, are given, it must be a caller's
* bug.
*/
-block_t start_bidx_of_node(unsigned int node_ofs, struct f2fs_inode_info *fi)
+block_t start_bidx_of_node(unsigned int node_ofs, struct inode *inode)
{
unsigned int indirect_blks = 2 * NIDS_PER_BLOCK + 4;
unsigned int bidx;
@@ -500,7 +507,7 @@ block_t start_bidx_of_node(unsigned int node_ofs, struct f2fs_inode_info *fi)
int dec = (node_ofs - indirect_blks - 3) / (NIDS_PER_BLOCK + 1);
bidx = node_ofs - 5 - dec;
}
- return bidx * ADDRS_PER_BLOCK + ADDRS_PER_INODE(fi);
+ return bidx * ADDRS_PER_BLOCK + ADDRS_PER_INODE(inode);
}
static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
@@ -546,6 +553,7 @@ static void move_encrypted_block(struct inode *inode, block_t bidx)
struct f2fs_summary sum;
struct node_info ni;
struct page *page;
+ block_t newaddr;
int err;
/* do not read out */
@@ -567,21 +575,24 @@ static void move_encrypted_block(struct inode *inode, block_t bidx)
* don't cache encrypted data into meta inode until previous dirty
* data were writebacked to avoid racing between GC and flush.
*/
- f2fs_wait_on_page_writeback(page, DATA);
+ f2fs_wait_on_page_writeback(page, DATA, true);
get_node_info(fio.sbi, dn.nid, &ni);
set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version);
/* read page */
fio.page = page;
- fio.blk_addr = dn.data_blkaddr;
+ fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr;
- fio.encrypted_page = pagecache_get_page(META_MAPPING(fio.sbi),
- fio.blk_addr,
- FGP_LOCK|FGP_CREAT,
- GFP_NOFS);
- if (!fio.encrypted_page)
- goto put_out;
+ allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr,
+ &sum, CURSEG_COLD_DATA);
+
+ fio.encrypted_page = pagecache_get_page(META_MAPPING(fio.sbi), newaddr,
+ FGP_LOCK | FGP_CREAT, GFP_NOFS);
+ if (!fio.encrypted_page) {
+ err = -ENOMEM;
+ goto recover_block;
+ }
err = f2fs_submit_page_bio(&fio);
if (err)
@@ -590,33 +601,39 @@ static void move_encrypted_block(struct inode *inode, block_t bidx)
/* write page */
lock_page(fio.encrypted_page);
- if (unlikely(!PageUptodate(fio.encrypted_page)))
+ if (unlikely(!PageUptodate(fio.encrypted_page))) {
+ err = -EIO;
goto put_page_out;
- if (unlikely(fio.encrypted_page->mapping != META_MAPPING(fio.sbi)))
+ }
+ if (unlikely(fio.encrypted_page->mapping != META_MAPPING(fio.sbi))) {
+ err = -EIO;
goto put_page_out;
+ }
set_page_dirty(fio.encrypted_page);
- f2fs_wait_on_page_writeback(fio.encrypted_page, DATA);
+ f2fs_wait_on_page_writeback(fio.encrypted_page, DATA, true);
if (clear_page_dirty_for_io(fio.encrypted_page))
dec_page_count(fio.sbi, F2FS_DIRTY_META);
set_page_writeback(fio.encrypted_page);
/* allocate block address */
- f2fs_wait_on_page_writeback(dn.node_page, NODE);
- allocate_data_block(fio.sbi, NULL, fio.blk_addr,
- &fio.blk_addr, &sum, CURSEG_COLD_DATA);
+ f2fs_wait_on_page_writeback(dn.node_page, NODE, true);
+
fio.rw = WRITE_SYNC;
+ fio.new_blkaddr = newaddr;
f2fs_submit_page_mbio(&fio);
- dn.data_blkaddr = fio.blk_addr;
- set_data_blkaddr(&dn);
- f2fs_update_extent_cache(&dn);
+ f2fs_update_data_blkaddr(&dn, newaddr);
set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE);
if (page->index == 0)
set_inode_flag(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN);
put_page_out:
f2fs_put_page(fio.encrypted_page, 1);
+recover_block:
+ if (err)
+ __f2fs_replace_block(fio.sbi, &sum, newaddr, fio.old_blkaddr,
+ true, true);
put_out:
f2fs_put_dnode(&dn);
out:
@@ -645,7 +662,7 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type)
.encrypted_page = NULL,
};
set_page_dirty(page);
- f2fs_wait_on_page_writeback(page, DATA);
+ f2fs_wait_on_page_writeback(page, DATA, true);
if (clear_page_dirty_for_io(page))
inode_dec_dirty_pages(inode);
set_cold_data(page);
@@ -663,7 +680,7 @@ out:
* If the parent node is not valid or the data block address is different,
* the victim data block is ignored.
*/
-static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
+static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
struct gc_inode_list *gc_list, unsigned int segno, int gc_type)
{
struct super_block *sb = sbi->sb;
@@ -686,7 +703,7 @@ next_step:
/* stop BG_GC if there is not enough free sections. */
if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0))
- return 0;
+ return;
if (check_valid_map(sbi, segno, off) == 0)
continue;
@@ -719,7 +736,7 @@ next_step:
continue;
}
- start_bidx = start_bidx_of_node(nofs, F2FS_I(inode));
+ start_bidx = start_bidx_of_node(nofs, inode);
data_page = get_read_data_page(inode,
start_bidx + ofs_in_node, READA, true);
if (IS_ERR(data_page)) {
@@ -735,7 +752,7 @@ next_step:
/* phase 3 */
inode = find_gc_inode(gc_list, dni.ino);
if (inode) {
- start_bidx = start_bidx_of_node(nofs, F2FS_I(inode))
+ start_bidx = start_bidx_of_node(nofs, inode)
+ ofs_in_node;
if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
move_encrypted_block(inode, start_bidx);
@@ -747,15 +764,6 @@ next_step:
if (++phase < 4)
goto next_step;
-
- if (gc_type == FG_GC) {
- f2fs_submit_merged_bio(sbi, DATA, WRITE);
-
- /* return 1 only if FG_GC succefully reclaimed one */
- if (get_valid_blocks(sbi, segno, 1) == 0)
- return 1;
- }
- return 0;
}
static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim,
@@ -771,53 +779,92 @@ static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim,
return ret;
}
-static int do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno,
+static int do_garbage_collect(struct f2fs_sb_info *sbi,
+ unsigned int start_segno,
struct gc_inode_list *gc_list, int gc_type)
{
struct page *sum_page;
struct f2fs_summary_block *sum;
struct blk_plug plug;
- int nfree = 0;
+ unsigned int segno = start_segno;
+ unsigned int end_segno = start_segno + sbi->segs_per_sec;
+ int seg_freed = 0;
+ unsigned char type = IS_DATASEG(get_seg_entry(sbi, segno)->type) ?
+ SUM_TYPE_DATA : SUM_TYPE_NODE;
- /* read segment summary of victim */
- sum_page = get_sum_page(sbi, segno);
+ /* readahead multi ssa blocks those have contiguous address */
+ if (sbi->segs_per_sec > 1)
+ ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno),
+ sbi->segs_per_sec, META_SSA, true);
+
+ /* reference all summary page */
+ while (segno < end_segno) {
+ sum_page = get_sum_page(sbi, segno++);
+ unlock_page(sum_page);
+ }
blk_start_plug(&plug);
- sum = page_address(sum_page);
+ for (segno = start_segno; segno < end_segno; segno++) {
+ /* find segment summary of victim */
+ sum_page = find_get_page(META_MAPPING(sbi),
+ GET_SUM_BLOCK(sbi, segno));
+ f2fs_bug_on(sbi, !PageUptodate(sum_page));
+ f2fs_put_page(sum_page, 0);
- /*
- * this is to avoid deadlock:
- * - lock_page(sum_page) - f2fs_replace_block
- * - check_valid_map() - mutex_lock(sentry_lock)
- * - mutex_lock(sentry_lock) - change_curseg()
- * - lock_page(sum_page)
- */
- unlock_page(sum_page);
-
- switch (GET_SUM_TYPE((&sum->footer))) {
- case SUM_TYPE_NODE:
- nfree = gc_node_segment(sbi, sum->entries, segno, gc_type);
- break;
- case SUM_TYPE_DATA:
- nfree = gc_data_segment(sbi, sum->entries, gc_list,
- segno, gc_type);
- break;
+ sum = page_address(sum_page);
+ f2fs_bug_on(sbi, type != GET_SUM_TYPE((&sum->footer)));
+
+ /*
+ * this is to avoid deadlock:
+ * - lock_page(sum_page) - f2fs_replace_block
+ * - check_valid_map() - mutex_lock(sentry_lock)
+ * - mutex_lock(sentry_lock) - change_curseg()
+ * - lock_page(sum_page)
+ */
+
+ if (type == SUM_TYPE_NODE)
+ gc_node_segment(sbi, sum->entries, segno, gc_type);
+ else
+ gc_data_segment(sbi, sum->entries, gc_list, segno,
+ gc_type);
+
+ stat_inc_seg_count(sbi, type, gc_type);
+
+ f2fs_put_page(sum_page, 0);
+ }
+
+ if (gc_type == FG_GC) {
+ if (type == SUM_TYPE_NODE) {
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_ALL,
+ .nr_to_write = LONG_MAX,
+ .for_reclaim = 0,
+ };
+ sync_node_pages(sbi, 0, &wbc);
+ } else {
+ f2fs_submit_merged_bio(sbi, DATA, WRITE);
+ }
}
+
blk_finish_plug(&plug);
- stat_inc_seg_count(sbi, GET_SUM_TYPE((&sum->footer)), gc_type);
+ if (gc_type == FG_GC) {
+ while (start_segno < end_segno)
+ if (get_valid_blocks(sbi, start_segno++, 1) == 0)
+ seg_freed++;
+ }
+
stat_inc_call_count(sbi->stat_info);
- f2fs_put_page(sum_page, 0);
- return nfree;
+ return seg_freed;
}
int f2fs_gc(struct f2fs_sb_info *sbi, bool sync)
{
- unsigned int segno, i;
+ unsigned int segno;
int gc_type = sync ? FG_GC : BG_GC;
- int sec_freed = 0;
+ int sec_freed = 0, seg_freed;
int ret = -EINVAL;
struct cp_control cpc;
struct gc_inode_list gc_list = {
@@ -838,30 +885,24 @@ gc_more:
if (gc_type == BG_GC && has_not_enough_free_secs(sbi, sec_freed)) {
gc_type = FG_GC;
+ /*
+ * If there is no victim and no prefree segment but still not
+ * enough free sections, we should flush dent/node blocks and do
+ * garbage collections.
+ */
if (__get_victim(sbi, &segno, gc_type) || prefree_segments(sbi))
write_checkpoint(sbi, &cpc);
+ else if (has_not_enough_free_secs(sbi, 0))
+ write_checkpoint(sbi, &cpc);
}
if (segno == NULL_SEGNO && !__get_victim(sbi, &segno, gc_type))
goto stop;
ret = 0;
- /* readahead multi ssa blocks those have contiguous address */
- if (sbi->segs_per_sec > 1)
- ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno), sbi->segs_per_sec,
- META_SSA, true);
-
- for (i = 0; i < sbi->segs_per_sec; i++) {
- /*
- * for FG_GC case, halt gcing left segments once failed one
- * of segments in selected section to avoid long latency.
- */
- if (!do_garbage_collect(sbi, segno + i, &gc_list, gc_type) &&
- gc_type == FG_GC)
- break;
- }
+ seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type);
- if (i == sbi->segs_per_sec && gc_type == FG_GC)
+ if (gc_type == FG_GC && seg_freed == sbi->segs_per_sec)
sec_freed++;
if (gc_type == FG_GC)
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index c3f0b7d4cfca..358214e9f707 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -71,7 +71,7 @@ bool truncate_inline_inode(struct page *ipage, u64 from)
addr = inline_data_addr(ipage);
- f2fs_wait_on_page_writeback(ipage, NODE);
+ f2fs_wait_on_page_writeback(ipage, NODE, true);
memset(addr + from, 0, MAX_INLINE_DATA - from);
return true;
@@ -105,7 +105,6 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page)
int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page)
{
- void *src_addr, *dst_addr;
struct f2fs_io_info fio = {
.sbi = F2FS_I_SB(dn->inode),
.type = DATA,
@@ -115,8 +114,6 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page)
};
int dirty, err;
- f2fs_bug_on(F2FS_I_SB(dn->inode), page->index);
-
if (!f2fs_exist_data(dn->inode))
goto clear_out;
@@ -124,21 +121,9 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page)
if (err)
return err;
- f2fs_wait_on_page_writeback(page, DATA);
-
- if (PageUptodate(page))
- goto no_update;
-
- zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE);
+ f2fs_bug_on(F2FS_P_SB(page), PageWriteback(page));
- /* Copy the whole inline data block */
- src_addr = inline_data_addr(dn->inode_page);
- dst_addr = kmap_atomic(page);
- memcpy(dst_addr, src_addr, MAX_INLINE_DATA);
- flush_dcache_page(page);
- kunmap_atomic(dst_addr);
- SetPageUptodate(page);
-no_update:
+ read_inline_data(page, dn->inode_page);
set_page_dirty(page);
/* clear dirty state */
@@ -146,11 +131,9 @@ no_update:
/* write data page to try to make data consistent */
set_page_writeback(page);
- fio.blk_addr = dn->data_blkaddr;
+ fio.old_blkaddr = dn->data_blkaddr;
write_data_page(dn, &fio);
- set_data_blkaddr(dn);
- f2fs_update_extent_cache(dn);
- f2fs_wait_on_page_writeback(page, DATA);
+ f2fs_wait_on_page_writeback(page, DATA, true);
if (dirty)
inode_dec_dirty_pages(dn->inode);
@@ -159,6 +142,7 @@ no_update:
/* clear inline data and flag after data writeback */
truncate_inline_inode(dn->inode_page, 0);
+ clear_inline_node(dn->inode_page);
clear_out:
stat_dec_inline_inode(dn->inode);
f2fs_clear_inline_inode(dn->inode);
@@ -223,7 +207,7 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page)
f2fs_bug_on(F2FS_I_SB(inode), page->index);
- f2fs_wait_on_page_writeback(dn.inode_page, NODE);
+ f2fs_wait_on_page_writeback(dn.inode_page, NODE, true);
src_addr = kmap_atomic(page);
dst_addr = inline_data_addr(dn.inode_page);
memcpy(dst_addr, src_addr, MAX_INLINE_DATA);
@@ -233,6 +217,7 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page)
set_inode_flag(F2FS_I(inode), FI_DATA_EXIST);
sync_inode_page(&dn);
+ clear_inline_node(dn.inode_page);
f2fs_put_dnode(&dn);
return 0;
}
@@ -261,7 +246,7 @@ process_inline:
ipage = get_node_page(sbi, inode->i_ino);
f2fs_bug_on(sbi, IS_ERR(ipage));
- f2fs_wait_on_page_writeback(ipage, NODE);
+ f2fs_wait_on_page_writeback(ipage, NODE, true);
src_addr = inline_data_addr(npage);
dst_addr = inline_data_addr(ipage);
@@ -292,7 +277,7 @@ process_inline:
}
struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir,
- struct f2fs_filename *fname, struct page **res_page)
+ struct fscrypt_name *fname, struct page **res_page)
{
struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
struct f2fs_inline_dentry *inline_dentry;
@@ -389,7 +374,7 @@ static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage,
if (err)
goto out;
- f2fs_wait_on_page_writeback(page, DATA);
+ f2fs_wait_on_page_writeback(page, DATA, true);
zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE);
dentry_blk = kmap_atomic(page);
@@ -469,7 +454,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *name,
}
}
- f2fs_wait_on_page_writeback(ipage, NODE);
+ f2fs_wait_on_page_writeback(ipage, NODE, true);
name_hash = f2fs_dentry_hash(name);
make_dentry_ptr(NULL, &d, (void *)dentry_blk, 2);
@@ -507,7 +492,7 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page,
int i;
lock_page(page);
- f2fs_wait_on_page_writeback(page, NODE);
+ f2fs_wait_on_page_writeback(page, NODE, true);
inline_dentry = inline_data_addr(page);
bit_pos = dentry - inline_dentry->dentry;
@@ -550,7 +535,7 @@ bool f2fs_empty_inline_dir(struct inode *dir)
}
int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx,
- struct f2fs_str *fstr)
+ struct fscrypt_str *fstr)
{
struct inode *inode = file_inode(file);
struct f2fs_inline_dentry *inline_dentry = NULL;
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 2adeff26be11..cb269c46ac25 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -83,7 +83,7 @@ static void __recover_inline_status(struct inode *inode, struct page *ipage)
while (start < end) {
if (*start++) {
- f2fs_wait_on_page_writeback(ipage, NODE);
+ f2fs_wait_on_page_writeback(ipage, NODE, true);
set_inode_flag(F2FS_I(inode), FI_DATA_EXIST);
set_raw_inline(F2FS_I(inode), F2FS_INODE(ipage));
@@ -227,7 +227,7 @@ int update_inode(struct inode *inode, struct page *node_page)
{
struct f2fs_inode *ri;
- f2fs_wait_on_page_writeback(node_page, NODE);
+ f2fs_wait_on_page_writeback(node_page, NODE, true);
ri = F2FS_INODE(node_page);
@@ -263,6 +263,10 @@ int update_inode(struct inode *inode, struct page *node_page)
set_cold_node(inode, node_page);
clear_inode_flag(F2FS_I(inode), FI_DIRTY_INODE);
+ /* deleted inode */
+ if (inode->i_nlink == 0)
+ clear_inline_node(node_page);
+
return set_page_dirty(node_page);
}
@@ -320,7 +324,7 @@ void f2fs_evict_inode(struct inode *inode)
/* some remained atomic pages should discarded */
if (f2fs_is_atomic_file(inode))
- commit_inmem_pages(inode, true);
+ drop_inmem_pages(inode);
trace_f2fs_evict_inode(inode);
truncate_inode_pages_final(&inode->i_data);
@@ -385,10 +389,7 @@ no_delete:
}
}
out_clear:
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
- if (fi->i_crypt_info)
- f2fs_free_encryption_info(inode, fi->i_crypt_info);
-#endif
+ fscrypt_put_encryption_info(inode, NULL);
clear_inode(inode);
}
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 6f944e5eb76e..7876f1052101 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -169,7 +169,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
int err;
if (f2fs_encrypted_inode(dir) &&
- !f2fs_is_child_context_consistent_with_parent(dir, inode))
+ !fscrypt_has_permitted_context(dir, inode))
return -EPERM;
f2fs_balance_fs(sbi, true);
@@ -260,6 +260,22 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
struct page *page;
nid_t ino;
int err = 0;
+ unsigned int root_ino = F2FS_ROOT_INO(F2FS_I_SB(dir));
+
+ if (f2fs_encrypted_inode(dir)) {
+ int res = fscrypt_get_encryption_info(dir);
+
+ /*
+ * DCACHE_ENCRYPTED_WITH_KEY is set if the dentry is
+ * created while the directory was encrypted and we
+ * don't have access to the key.
+ */
+ if (fscrypt_has_encryption_key(dir))
+ fscrypt_set_encrypted_dentry(dentry);
+ fscrypt_set_d_op(dentry);
+ if (res && res != -ENOKEY)
+ return ERR_PTR(res);
+ }
if (dentry->d_name.len > F2FS_NAME_LEN)
return ERR_PTR(-ENAMETOOLONG);
@@ -276,15 +292,29 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
if (IS_ERR(inode))
return ERR_CAST(inode);
+ if ((dir->i_ino == root_ino) && f2fs_has_inline_dots(dir)) {
+ err = __recover_dot_dentries(dir, root_ino);
+ if (err)
+ goto err_out;
+ }
+
if (f2fs_has_inline_dots(inode)) {
err = __recover_dot_dentries(inode, dir->i_ino);
if (err)
goto err_out;
}
+ if (!IS_ERR(inode) && f2fs_encrypted_inode(dir) &&
+ (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) &&
+ !fscrypt_has_permitted_context(dir, inode)) {
+ bool nokey = f2fs_encrypted_inode(inode) &&
+ !fscrypt_has_encryption_key(inode);
+ err = nokey ? -ENOKEY : -EPERM;
+ goto err_out;
+ }
return d_splice_alias(inode, dentry);
err_out:
- iget_failed(inode);
+ iput(inode);
return ERR_PTR(err);
}
@@ -345,13 +375,23 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
struct inode *inode;
size_t len = strlen(symname);
- size_t p_len;
- char *p_str;
- struct f2fs_str disk_link = FSTR_INIT(NULL, 0);
- struct f2fs_encrypted_symlink_data *sd = NULL;
+ struct fscrypt_str disk_link = FSTR_INIT((char *)symname, len + 1);
+ struct fscrypt_symlink_data *sd = NULL;
int err;
- if (len > dir->i_sb->s_blocksize)
+ if (f2fs_encrypted_inode(dir)) {
+ err = fscrypt_get_encryption_info(dir);
+ if (err)
+ return err;
+
+ if (!fscrypt_has_encryption_key(dir))
+ return -EPERM;
+
+ disk_link.len = (fscrypt_fname_encrypted_size(dir, len) +
+ sizeof(struct fscrypt_symlink_data));
+ }
+
+ if (disk_link.len > dir->i_sb->s_blocksize)
return -ENAMETOOLONG;
inode = f2fs_new_inode(dir, S_IFLNK | S_IRWXUGO);
@@ -374,42 +414,36 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
f2fs_unlock_op(sbi);
alloc_nid_done(sbi, inode->i_ino);
- if (f2fs_encrypted_inode(dir)) {
+ if (f2fs_encrypted_inode(inode)) {
struct qstr istr = QSTR_INIT(symname, len);
+ struct fscrypt_str ostr;
- err = f2fs_get_encryption_info(inode);
- if (err)
+ sd = kzalloc(disk_link.len, GFP_NOFS);
+ if (!sd) {
+ err = -ENOMEM;
goto err_out;
+ }
- err = f2fs_fname_crypto_alloc_buffer(inode, len, &disk_link);
+ err = fscrypt_get_encryption_info(inode);
if (err)
goto err_out;
- err = f2fs_fname_usr_to_disk(inode, &istr, &disk_link);
- if (err < 0)
- goto err_out;
-
- p_len = encrypted_symlink_data_len(disk_link.len) + 1;
-
- if (p_len > dir->i_sb->s_blocksize) {
- err = -ENAMETOOLONG;
+ if (!fscrypt_has_encryption_key(inode)) {
+ err = -EPERM;
goto err_out;
}
- sd = kzalloc(p_len, GFP_NOFS);
- if (!sd) {
- err = -ENOMEM;
+ ostr.name = sd->encrypted_path;
+ ostr.len = disk_link.len;
+ err = fscrypt_fname_usr_to_disk(inode, &istr, &ostr);
+ if (err < 0)
goto err_out;
- }
- memcpy(sd->encrypted_path, disk_link.name, disk_link.len);
- sd->len = cpu_to_le16(disk_link.len);
- p_str = (char *)sd;
- } else {
- p_len = len + 1;
- p_str = (char *)symname;
+
+ sd->len = cpu_to_le16(ostr.len);
+ disk_link.name = (char *)sd;
}
- err = page_symlink(inode, p_str, p_len);
+ err = page_symlink(inode, disk_link.name, disk_link.len);
err_out:
d_instantiate(dentry, inode);
@@ -425,7 +459,8 @@ err_out:
* performance regression.
*/
if (!err) {
- filemap_write_and_wait_range(inode->i_mapping, 0, p_len - 1);
+ filemap_write_and_wait_range(inode->i_mapping, 0,
+ disk_link.len - 1);
if (IS_DIRSYNC(dir))
f2fs_sync_fs(sbi->sb, 1);
@@ -434,7 +469,6 @@ err_out:
}
kfree(sd);
- f2fs_fname_crypto_free_buffer(&disk_link);
return err;
out:
handle_failed_inode(inode);
@@ -582,7 +616,7 @@ out:
static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
{
if (f2fs_encrypted_inode(dir)) {
- int err = f2fs_get_encryption_info(dir);
+ int err = fscrypt_get_encryption_info(dir);
if (err)
return err;
}
@@ -608,11 +642,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct f2fs_dir_entry *old_dir_entry = NULL;
struct f2fs_dir_entry *old_entry;
struct f2fs_dir_entry *new_entry;
+ bool is_old_inline = f2fs_has_inline_dentry(old_dir);
int err = -ENOENT;
if ((old_dir != new_dir) && f2fs_encrypted_inode(new_dir) &&
- !f2fs_is_child_context_consistent_with_parent(new_dir,
- old_inode)) {
+ !fscrypt_has_permitted_context(new_dir, old_inode)) {
err = -EPERM;
goto out;
}
@@ -654,8 +688,9 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (err)
goto put_out_dir;
- if (update_dent_inode(old_inode, new_inode,
- &new_dentry->d_name)) {
+ err = update_dent_inode(old_inode, new_inode,
+ &new_dentry->d_name);
+ if (err) {
release_orphan_inode(sbi);
goto put_out_dir;
}
@@ -693,6 +728,26 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
inc_nlink(new_dir);
update_inode_page(new_dir);
}
+
+ /*
+ * old entry and new entry can locate in the same inline
+ * dentry in inode, when attaching new entry in inline dentry,
+ * it could force inline dentry conversion, after that,
+ * old_entry and old_page will point to wrong address, in
+ * order to avoid this, let's do the check and update here.
+ */
+ if (is_old_inline && !f2fs_has_inline_dentry(old_dir)) {
+ f2fs_put_page(old_page, 0);
+ old_page = NULL;
+
+ old_entry = f2fs_find_entry(old_dir,
+ &old_dentry->d_name, &old_page);
+ if (!old_entry) {
+ err = -EIO;
+ f2fs_unlock_op(sbi);
+ goto out_whiteout;
+ }
+ }
}
down_write(&F2FS_I(old_inode)->i_sem);
@@ -771,11 +826,9 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
int err = -ENOENT;
if ((f2fs_encrypted_inode(old_dir) || f2fs_encrypted_inode(new_dir)) &&
- (old_dir != new_dir) &&
- (!f2fs_is_child_context_consistent_with_parent(new_dir,
- old_inode) ||
- !f2fs_is_child_context_consistent_with_parent(old_dir,
- new_inode)))
+ (old_dir != new_dir) &&
+ (!fscrypt_has_permitted_context(new_dir, old_inode) ||
+ !fscrypt_has_permitted_context(old_dir, new_inode)))
return -EPERM;
old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page);
@@ -937,16 +990,15 @@ static int f2fs_rename2(struct inode *old_dir, struct dentry *old_dentry,
return f2fs_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
}
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
static const char *f2fs_encrypted_get_link(struct dentry *dentry,
struct inode *inode,
struct delayed_call *done)
{
struct page *cpage = NULL;
char *caddr, *paddr = NULL;
- struct f2fs_str cstr = FSTR_INIT(NULL, 0);
- struct f2fs_str pstr = FSTR_INIT(NULL, 0);
- struct f2fs_encrypted_symlink_data *sd;
+ struct fscrypt_str cstr = FSTR_INIT(NULL, 0);
+ struct fscrypt_str pstr = FSTR_INIT(NULL, 0);
+ struct fscrypt_symlink_data *sd;
loff_t size = min_t(loff_t, i_size_read(inode), PAGE_SIZE - 1);
u32 max_size = inode->i_sb->s_blocksize;
int res;
@@ -954,7 +1006,7 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry,
if (!dentry)
return ERR_PTR(-ECHILD);
- res = f2fs_get_encryption_info(inode);
+ res = fscrypt_get_encryption_info(inode);
if (res)
return ERR_PTR(res);
@@ -965,7 +1017,8 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry,
caddr[size] = 0;
/* Symlink is encrypted */
- sd = (struct f2fs_encrypted_symlink_data *)caddr;
+ sd = (struct fscrypt_symlink_data *)caddr;
+ cstr.name = sd->encrypted_path;
cstr.len = le16_to_cpu(sd->len);
/* this is broken symlink case */
@@ -973,12 +1026,6 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry,
res = -ENOENT;
goto errout;
}
- cstr.name = kmalloc(cstr.len, GFP_NOFS);
- if (!cstr.name) {
- res = -ENOMEM;
- goto errout;
- }
- memcpy(cstr.name, sd->encrypted_path, cstr.len);
/* this is broken symlink case */
if (unlikely(cstr.name[0] == 0)) {
@@ -986,22 +1033,19 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry,
goto errout;
}
- if ((cstr.len + sizeof(struct f2fs_encrypted_symlink_data) - 1) >
- max_size) {
+ if ((cstr.len + sizeof(struct fscrypt_symlink_data) - 1) > max_size) {
/* Symlink data on the disk is corrupted */
res = -EIO;
goto errout;
}
- res = f2fs_fname_crypto_alloc_buffer(inode, cstr.len, &pstr);
+ res = fscrypt_fname_alloc_buffer(inode, cstr.len, &pstr);
if (res)
goto errout;
- res = f2fs_fname_disk_to_usr(inode, NULL, &cstr, &pstr);
+ res = fscrypt_fname_disk_to_usr(inode, 0, 0, &cstr, &pstr);
if (res < 0)
goto errout;
- kfree(cstr.name);
-
paddr = pstr.name;
/* Null-terminate the name */
@@ -1011,8 +1055,7 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry,
set_delayed_call(done, kfree_link, paddr);
return paddr;
errout:
- kfree(cstr.name);
- f2fs_fname_crypto_free_buffer(&pstr);
+ fscrypt_fname_free_buffer(&pstr);
page_cache_release(cpage);
return ERR_PTR(res);
}
@@ -1029,7 +1072,6 @@ const struct inode_operations f2fs_encrypted_symlink_inode_operations = {
.removexattr = generic_removexattr,
#endif
};
-#endif
const struct inode_operations f2fs_dir_inode_operations = {
.create = f2fs_create,
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 342597a5897f..118321bd1a7f 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -257,15 +257,20 @@ static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid)
return new;
}
-static void cache_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid,
+static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid,
struct f2fs_nat_entry *ne)
{
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
struct nat_entry *e;
e = __lookup_nat_cache(nm_i, nid);
if (!e) {
e = grab_nat_entry(nm_i, nid);
node_info_from_raw_nat(&e->ni, ne);
+ } else {
+ f2fs_bug_on(sbi, nat_get_ino(e) != ne->ino ||
+ nat_get_blkaddr(e) != ne->block_addr ||
+ nat_get_version(e) != ne->version);
}
}
@@ -354,7 +359,7 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
- struct f2fs_summary_block *sum = curseg->sum_blk;
+ struct f2fs_journal *journal = curseg->journal;
nid_t start_nid = START_NID(nid);
struct f2fs_nat_block *nat_blk;
struct page *page = NULL;
@@ -371,23 +376,20 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
ni->ino = nat_get_ino(e);
ni->blk_addr = nat_get_blkaddr(e);
ni->version = nat_get_version(e);
- }
- up_read(&nm_i->nat_tree_lock);
- if (e)
+ up_read(&nm_i->nat_tree_lock);
return;
+ }
memset(&ne, 0, sizeof(struct f2fs_nat_entry));
- down_write(&nm_i->nat_tree_lock);
-
/* Check current segment summary */
- mutex_lock(&curseg->curseg_mutex);
- i = lookup_journal_in_cursum(sum, NAT_JOURNAL, nid, 0);
+ down_read(&curseg->journal_rwsem);
+ i = lookup_journal_in_cursum(journal, NAT_JOURNAL, nid, 0);
if (i >= 0) {
- ne = nat_in_journal(sum, i);
+ ne = nat_in_journal(journal, i);
node_info_from_raw_nat(ni, &ne);
}
- mutex_unlock(&curseg->curseg_mutex);
+ up_read(&curseg->journal_rwsem);
if (i >= 0)
goto cache;
@@ -398,19 +400,52 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
node_info_from_raw_nat(ni, &ne);
f2fs_put_page(page, 1);
cache:
+ up_read(&nm_i->nat_tree_lock);
/* cache nat entry */
- cache_nat_entry(NM_I(sbi), nid, &ne);
+ down_write(&nm_i->nat_tree_lock);
+ cache_nat_entry(sbi, nid, &ne);
up_write(&nm_i->nat_tree_lock);
}
+pgoff_t get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs)
+{
+ const long direct_index = ADDRS_PER_INODE(dn->inode);
+ const long direct_blks = ADDRS_PER_BLOCK;
+ const long indirect_blks = ADDRS_PER_BLOCK * NIDS_PER_BLOCK;
+ unsigned int skipped_unit = ADDRS_PER_BLOCK;
+ int cur_level = dn->cur_level;
+ int max_level = dn->max_level;
+ pgoff_t base = 0;
+
+ if (!dn->max_level)
+ return pgofs + 1;
+
+ while (max_level-- > cur_level)
+ skipped_unit *= NIDS_PER_BLOCK;
+
+ switch (dn->max_level) {
+ case 3:
+ base += 2 * indirect_blks;
+ case 2:
+ base += 2 * direct_blks;
+ case 1:
+ base += direct_index;
+ break;
+ default:
+ f2fs_bug_on(F2FS_I_SB(dn->inode), 1);
+ }
+
+ return ((pgofs - base) / skipped_unit + 1) * skipped_unit + base;
+}
+
/*
* The maximum depth is four.
* Offset[0] will have raw inode offset.
*/
-static int get_node_path(struct f2fs_inode_info *fi, long block,
+static int get_node_path(struct inode *inode, long block,
int offset[4], unsigned int noffset[4])
{
- const long direct_index = ADDRS_PER_INODE(fi);
+ const long direct_index = ADDRS_PER_INODE(inode);
const long direct_blks = ADDRS_PER_BLOCK;
const long dptrs_per_blk = NIDS_PER_BLOCK;
const long indirect_blks = ADDRS_PER_BLOCK * NIDS_PER_BLOCK;
@@ -495,10 +530,10 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
int offset[4];
unsigned int noffset[4];
nid_t nids[4];
- int level, i;
+ int level, i = 0;
int err = 0;
- level = get_node_path(F2FS_I(dn->inode), index, offset, noffset);
+ level = get_node_path(dn->inode, index, offset, noffset);
nids[0] = dn->inode->i_ino;
npage[0] = dn->inode_page;
@@ -585,6 +620,10 @@ release_pages:
release_out:
dn->inode_page = NULL;
dn->node_page = NULL;
+ if (err == -ENOENT) {
+ dn->cur_level = i;
+ dn->max_level = level;
+ }
return err;
}
@@ -792,7 +831,7 @@ int truncate_inode_blocks(struct inode *inode, pgoff_t from)
trace_f2fs_truncate_inode_blocks_enter(inode, from);
- level = get_node_path(F2FS_I(inode), from, offset, noffset);
+ level = get_node_path(inode, from, offset, noffset);
restart:
page = get_node_page(sbi, inode->i_ino);
if (IS_ERR(page)) {
@@ -861,7 +900,7 @@ skip_partial:
f2fs_put_page(page, 1);
goto restart;
}
- f2fs_wait_on_page_writeback(page, NODE);
+ f2fs_wait_on_page_writeback(page, NODE, true);
ri->i_nid[offset[0] - NODE_DIR1_BLOCK] = 0;
set_page_dirty(page);
unlock_page(page);
@@ -976,7 +1015,7 @@ struct page *new_node_page(struct dnode_of_data *dn,
new_ni.ino = dn->inode->i_ino;
set_node_addr(sbi, &new_ni, NEW_ADDR, false);
- f2fs_wait_on_page_writeback(page, NODE);
+ f2fs_wait_on_page_writeback(page, NODE, true);
fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true);
set_cold_node(dn->inode, page);
SetPageUptodate(page);
@@ -1029,7 +1068,7 @@ static int read_node_page(struct page *page, int rw)
if (PageUptodate(page))
return LOCKED_PAGE;
- fio.blk_addr = ni.blk_addr;
+ fio.new_blkaddr = fio.old_blkaddr = ni.blk_addr;
return f2fs_submit_page_bio(&fio);
}
@@ -1045,12 +1084,11 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid)
return;
f2fs_bug_on(sbi, check_nid_range(sbi, nid));
- apage = find_get_page(NODE_MAPPING(sbi), nid);
- if (apage && PageUptodate(apage)) {
- f2fs_put_page(apage, 0);
+ rcu_read_lock();
+ apage = radix_tree_lookup(&NODE_MAPPING(sbi)->page_tree, nid);
+ rcu_read_unlock();
+ if (apage)
return;
- }
- f2fs_put_page(apage, 0);
apage = grab_cache_page(NODE_MAPPING(sbi), nid);
if (!apage)
@@ -1063,7 +1101,7 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid)
/*
* readahead MAX_RA_NODE number of node pages.
*/
-void ra_node_pages(struct page *parent, int start)
+static void ra_node_pages(struct page *parent, int start)
{
struct f2fs_sb_info *sbi = F2FS_P_SB(parent);
struct blk_plug plug;
@@ -1083,7 +1121,7 @@ void ra_node_pages(struct page *parent, int start)
blk_finish_plug(&plug);
}
-struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid,
+static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid,
struct page *parent, int start)
{
struct page *page;
@@ -1154,19 +1192,57 @@ void sync_inode_page(struct dnode_of_data *dn)
dn->node_changed = ret ? true: false;
}
+static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino)
+{
+ struct inode *inode;
+ struct page *page;
+
+ /* should flush inline_data before evict_inode */
+ inode = ilookup(sbi->sb, ino);
+ if (!inode)
+ return;
+
+ page = pagecache_get_page(inode->i_mapping, 0, FGP_NOWAIT, 0);
+ if (!page)
+ goto iput_out;
+
+ if (!trylock_page(page))
+ goto release_out;
+
+ if (!PageUptodate(page))
+ goto page_out;
+
+ if (!PageDirty(page))
+ goto page_out;
+
+ if (!clear_page_dirty_for_io(page))
+ goto page_out;
+
+ if (!f2fs_write_inline_data(inode, page))
+ inode_dec_dirty_pages(inode);
+ else
+ set_page_dirty(page);
+page_out:
+ unlock_page(page);
+release_out:
+ f2fs_put_page(page, 0);
+iput_out:
+ iput(inode);
+}
+
int sync_node_pages(struct f2fs_sb_info *sbi, nid_t ino,
struct writeback_control *wbc)
{
pgoff_t index, end;
struct pagevec pvec;
int step = ino ? 2 : 0;
- int nwritten = 0, wrote = 0;
+ int nwritten = 0;
pagevec_init(&pvec, 0);
next_step:
index = 0;
- end = LONG_MAX;
+ end = ULONG_MAX;
while (index <= end) {
int i, nr_pages;
@@ -1203,6 +1279,7 @@ next_step:
* If an fsync mode,
* we should not skip writing node pages.
*/
+lock_node:
if (ino && ino_of_node(page) == ino)
lock_page(page);
else if (!trylock_page(page))
@@ -1221,6 +1298,17 @@ continue_unlock:
goto continue_unlock;
}
+ /* flush inline_data */
+ if (!ino && is_inline_node(page)) {
+ clear_inline_node(page);
+ unlock_page(page);
+ flush_inline_data(sbi, ino_of_node(page));
+ goto lock_node;
+ }
+
+ f2fs_wait_on_page_writeback(page, NODE, true);
+
+ BUG_ON(PageWriteback(page));
if (!clear_page_dirty_for_io(page))
goto continue_unlock;
@@ -1238,8 +1326,6 @@ continue_unlock:
if (NODE_MAPPING(sbi)->a_ops->writepage(page, wbc))
unlock_page(page);
- else
- wrote++;
if (--wbc->nr_to_write == 0)
break;
@@ -1257,15 +1343,12 @@ continue_unlock:
step++;
goto next_step;
}
-
- if (wrote)
- f2fs_submit_merged_bio(sbi, NODE, WRITE);
return nwritten;
}
int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino)
{
- pgoff_t index = 0, end = LONG_MAX;
+ pgoff_t index = 0, end = ULONG_MAX;
struct pagevec pvec;
int ret2 = 0, ret = 0;
@@ -1287,7 +1370,7 @@ int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino)
continue;
if (ino && ino_of_node(page) == ino) {
- f2fs_wait_on_page_writeback(page, NODE);
+ f2fs_wait_on_page_writeback(page, NODE, true);
if (TestClearPageError(page))
ret = -EIO;
}
@@ -1326,8 +1409,6 @@ static int f2fs_write_node_page(struct page *page,
if (unlikely(f2fs_cp_error(sbi)))
goto redirty_out;
- f2fs_wait_on_page_writeback(page, NODE);
-
/* get old block addr of this node page */
nid = nid_of_node(page);
f2fs_bug_on(sbi, page->index != nid);
@@ -1351,14 +1432,18 @@ static int f2fs_write_node_page(struct page *page,
}
set_page_writeback(page);
- fio.blk_addr = ni.blk_addr;
+ fio.old_blkaddr = ni.blk_addr;
write_node_page(nid, &fio);
- set_node_addr(sbi, &ni, fio.blk_addr, is_fsync_dnode(page));
+ set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page));
dec_page_count(sbi, F2FS_DIRTY_NODES);
up_read(&sbi->node_write);
+
+ if (wbc->for_reclaim)
+ f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, NODE, WRITE);
+
unlock_page(page);
- if (wbc->for_reclaim || unlikely(f2fs_cp_error(sbi)))
+ if (unlikely(f2fs_cp_error(sbi)))
f2fs_submit_merged_bio(sbi, NODE, WRITE);
return 0;
@@ -1374,8 +1459,6 @@ static int f2fs_write_node_pages(struct address_space *mapping,
struct f2fs_sb_info *sbi = F2FS_M_SB(mapping);
long diff;
- trace_f2fs_writepages(mapping->host, wbc, NODE);
-
/* balancing f2fs's metadata in background */
f2fs_balance_fs_bg(sbi);
@@ -1383,6 +1466,8 @@ static int f2fs_write_node_pages(struct address_space *mapping,
if (get_pages(sbi, F2FS_DIRTY_NODES) < nr_pages_to_skip(sbi, NODE))
goto skip_write;
+ trace_f2fs_writepages(mapping->host, wbc, NODE);
+
diff = nr_pages_to_write(sbi, NODE, wbc);
wbc->sync_mode = WB_SYNC_NONE;
sync_node_pages(sbi, 0, wbc);
@@ -1391,6 +1476,7 @@ static int f2fs_write_node_pages(struct address_space *mapping,
skip_write:
wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_NODES);
+ trace_f2fs_writepages(mapping->host, wbc, NODE);
return 0;
}
@@ -1526,7 +1612,7 @@ static void build_free_nids(struct f2fs_sb_info *sbi)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
- struct f2fs_summary_block *sum = curseg->sum_blk;
+ struct f2fs_journal *journal = curseg->journal;
int i = 0;
nid_t nid = nm_i->next_scan_nid;
@@ -1558,16 +1644,18 @@ static void build_free_nids(struct f2fs_sb_info *sbi)
nm_i->next_scan_nid = nid;
/* find free nids from current sum_pages */
- mutex_lock(&curseg->curseg_mutex);
- for (i = 0; i < nats_in_cursum(sum); i++) {
- block_t addr = le32_to_cpu(nat_in_journal(sum, i).block_addr);
- nid = le32_to_cpu(nid_in_journal(sum, i));
+ down_read(&curseg->journal_rwsem);
+ for (i = 0; i < nats_in_cursum(journal); i++) {
+ block_t addr;
+
+ addr = le32_to_cpu(nat_in_journal(journal, i).block_addr);
+ nid = le32_to_cpu(nid_in_journal(journal, i));
if (addr == NULL_ADDR)
add_free_nid(sbi, nid, true);
else
remove_free_nid(nm_i, nid);
}
- mutex_unlock(&curseg->curseg_mutex);
+ up_read(&curseg->journal_rwsem);
up_read(&nm_i->nat_tree_lock);
ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid),
@@ -1703,7 +1791,7 @@ void recover_inline_xattr(struct inode *inode, struct page *page)
src_addr = inline_xattr_addr(page);
inline_size = inline_xattr_size(inode);
- f2fs_wait_on_page_writeback(ipage, NODE);
+ f2fs_wait_on_page_writeback(ipage, NODE, true);
memcpy(dst_addr, src_addr, inline_size);
update_inode:
update_inode(inode, ipage);
@@ -1831,16 +1919,16 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
- struct f2fs_summary_block *sum = curseg->sum_blk;
+ struct f2fs_journal *journal = curseg->journal;
int i;
- mutex_lock(&curseg->curseg_mutex);
- for (i = 0; i < nats_in_cursum(sum); i++) {
+ down_write(&curseg->journal_rwsem);
+ for (i = 0; i < nats_in_cursum(journal); i++) {
struct nat_entry *ne;
struct f2fs_nat_entry raw_ne;
- nid_t nid = le32_to_cpu(nid_in_journal(sum, i));
+ nid_t nid = le32_to_cpu(nid_in_journal(journal, i));
- raw_ne = nat_in_journal(sum, i);
+ raw_ne = nat_in_journal(journal, i);
ne = __lookup_nat_cache(nm_i, nid);
if (!ne) {
@@ -1849,8 +1937,8 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
}
__set_nat_cache_dirty(nm_i, ne);
}
- update_nats_in_cursum(sum, -i);
- mutex_unlock(&curseg->curseg_mutex);
+ update_nats_in_cursum(journal, -i);
+ up_write(&curseg->journal_rwsem);
}
static void __adjust_nat_entry_set(struct nat_entry_set *nes,
@@ -1875,7 +1963,7 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
struct nat_entry_set *set)
{
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
- struct f2fs_summary_block *sum = curseg->sum_blk;
+ struct f2fs_journal *journal = curseg->journal;
nid_t start_nid = set->set * NAT_ENTRY_PER_BLOCK;
bool to_journal = true;
struct f2fs_nat_block *nat_blk;
@@ -1887,11 +1975,11 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
* #1, flush nat entries to journal in current hot data summary block.
* #2, flush nat entries to nat page.
*/
- if (!__has_cursum_space(sum, set->entry_cnt, NAT_JOURNAL))
+ if (!__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL))
to_journal = false;
if (to_journal) {
- mutex_lock(&curseg->curseg_mutex);
+ down_write(&curseg->journal_rwsem);
} else {
page = get_next_nat_page(sbi, start_nid);
nat_blk = page_address(page);
@@ -1908,11 +1996,11 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
continue;
if (to_journal) {
- offset = lookup_journal_in_cursum(sum,
+ offset = lookup_journal_in_cursum(journal,
NAT_JOURNAL, nid, 1);
f2fs_bug_on(sbi, offset < 0);
- raw_ne = &nat_in_journal(sum, offset);
- nid_in_journal(sum, offset) = cpu_to_le32(nid);
+ raw_ne = &nat_in_journal(journal, offset);
+ nid_in_journal(journal, offset) = cpu_to_le32(nid);
} else {
raw_ne = &nat_blk->entries[nid - start_nid];
}
@@ -1924,7 +2012,7 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
}
if (to_journal)
- mutex_unlock(&curseg->curseg_mutex);
+ up_write(&curseg->journal_rwsem);
else
f2fs_put_page(page, 1);
@@ -1941,7 +2029,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
- struct f2fs_summary_block *sum = curseg->sum_blk;
+ struct f2fs_journal *journal = curseg->journal;
struct nat_entry_set *setvec[SETVEC_SIZE];
struct nat_entry_set *set, *tmp;
unsigned int found;
@@ -1958,7 +2046,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
* entries, remove all entries from journal and merge them
* into nat entry set.
*/
- if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt, NAT_JOURNAL))
+ if (!__has_cursum_space(journal, nm_i->dirty_nat_cnt, NAT_JOURNAL))
remove_nats_in_journal(sbi);
while ((found = __gang_lookup_nat_set(nm_i,
@@ -1967,7 +2055,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
set_idx = setvec[found - 1]->set + 1;
for (idx = 0; idx < found; idx++)
__adjust_nat_entry_set(setvec[idx], &sets,
- MAX_NAT_JENTRIES(sum));
+ MAX_NAT_JENTRIES(journal));
}
/* flush dirty nats in nat entry set */
@@ -2000,6 +2088,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
nm_i->nat_cnt = 0;
nm_i->ram_thresh = DEF_RAM_THRESHOLD;
nm_i->ra_nid_pages = DEF_RA_NID_PAGES;
+ nm_i->dirty_nats_ratio = DEF_DIRTY_NAT_RATIO_THRESHOLD;
INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC);
INIT_LIST_HEAD(&nm_i->free_nid_list);
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index d4d1f636fe1c..1f4f9d4569d9 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -25,6 +25,9 @@
/* control the memory footprint threshold (10MB per 1GB ram) */
#define DEF_RAM_THRESHOLD 10
+/* control dirty nats ratio threshold (default: 10% over max nid count) */
+#define DEF_DIRTY_NAT_RATIO_THRESHOLD 10
+
/* vector size for gang look-up from nat cache that consists of radix tree */
#define NATVEC_SIZE 64
#define SETVEC_SIZE 32
@@ -117,6 +120,12 @@ static inline void raw_nat_from_node_info(struct f2fs_nat_entry *raw_ne,
raw_ne->version = ni->version;
}
+static inline bool excess_dirty_nats(struct f2fs_sb_info *sbi)
+{
+ return NM_I(sbi)->dirty_nat_cnt >= NM_I(sbi)->max_nid *
+ NM_I(sbi)->dirty_nats_ratio / 100;
+}
+
enum mem_type {
FREE_NIDS, /* indicates the free nid list */
NAT_ENTRIES, /* indicates the cached nat entry */
@@ -321,7 +330,7 @@ static inline int set_nid(struct page *p, int off, nid_t nid, bool i)
{
struct f2fs_node *rn = F2FS_NODE(p);
- f2fs_wait_on_page_writeback(p, NODE);
+ f2fs_wait_on_page_writeback(p, NODE, true);
if (i)
rn->i.i_nid[off - NODE_DIR1_BLOCK] = cpu_to_le32(nid);
@@ -370,6 +379,21 @@ static inline int is_node(struct page *page, int type)
#define is_fsync_dnode(page) is_node(page, FSYNC_BIT_SHIFT)
#define is_dent_dnode(page) is_node(page, DENT_BIT_SHIFT)
+static inline int is_inline_node(struct page *page)
+{
+ return PageChecked(page);
+}
+
+static inline void set_inline_node(struct page *page)
+{
+ SetPageChecked(page);
+}
+
+static inline void clear_inline_node(struct page *page)
+{
+ ClearPageChecked(page);
+}
+
static inline void set_cold_node(struct inode *inode, struct page *page)
{
struct f2fs_node *rn = F2FS_NODE(page);
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 589b20b8677b..0b30cd2aeebd 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -350,8 +350,7 @@ got_it:
inode = dn->inode;
}
- bidx = start_bidx_of_node(offset, F2FS_I(inode)) +
- le16_to_cpu(sum.ofs_in_node);
+ bidx = start_bidx_of_node(offset, inode) + le16_to_cpu(sum.ofs_in_node);
/*
* if inode page is locked, unlock temporarily, but its reference
@@ -386,10 +385,9 @@ truncate_out:
static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
struct page *page, block_t blkaddr)
{
- struct f2fs_inode_info *fi = F2FS_I(inode);
- unsigned int start, end;
struct dnode_of_data dn;
struct node_info ni;
+ unsigned int start, end;
int err = 0, recovered = 0;
/* step 1: recover xattr */
@@ -409,8 +407,8 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
goto out;
/* step 3: recover data indices */
- start = start_bidx_of_node(ofs_of_node(page), fi);
- end = start + ADDRS_PER_PAGE(page, fi);
+ start = start_bidx_of_node(ofs_of_node(page), inode);
+ end = start + ADDRS_PER_PAGE(page, inode);
set_new_dnode(&dn, inode, NULL, NULL, 0);
@@ -418,7 +416,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
if (err)
goto out;
- f2fs_wait_on_page_writeback(dn.node_page, NODE);
+ f2fs_wait_on_page_writeback(dn.node_page, NODE, true);
get_node_info(sbi, dn.nid, &ni);
f2fs_bug_on(sbi, ni.ino != ino_of_node(page));
@@ -467,7 +465,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
/* write dummy data page */
f2fs_replace_block(sbi, &dn, src, dest,
- ni.version, false);
+ ni.version, false, false);
recovered++;
}
}
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 5904a411c86f..6f16b39f0b52 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -191,70 +191,145 @@ void register_inmem_page(struct inode *inode, struct page *page)
trace_f2fs_register_inmem_page(page, INMEM);
}
-int commit_inmem_pages(struct inode *inode, bool abort)
+static int __revoke_inmem_pages(struct inode *inode,
+ struct list_head *head, bool drop, bool recover)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct inmem_pages *cur, *tmp;
+ int err = 0;
+
+ list_for_each_entry_safe(cur, tmp, head, list) {
+ struct page *page = cur->page;
+
+ if (drop)
+ trace_f2fs_commit_inmem_page(page, INMEM_DROP);
+
+ lock_page(page);
+
+ if (recover) {
+ struct dnode_of_data dn;
+ struct node_info ni;
+
+ trace_f2fs_commit_inmem_page(page, INMEM_REVOKE);
+
+ set_new_dnode(&dn, inode, NULL, NULL, 0);
+ if (get_dnode_of_data(&dn, page->index, LOOKUP_NODE)) {
+ err = -EAGAIN;
+ goto next;
+ }
+ get_node_info(sbi, dn.nid, &ni);
+ f2fs_replace_block(sbi, &dn, dn.data_blkaddr,
+ cur->old_addr, ni.version, true, true);
+ f2fs_put_dnode(&dn);
+ }
+next:
+ ClearPageUptodate(page);
+ set_page_private(page, 0);
+ ClearPageUptodate(page);
+ f2fs_put_page(page, 1);
+
+ list_del(&cur->list);
+ kmem_cache_free(inmem_entry_slab, cur);
+ dec_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES);
+ }
+ return err;
+}
+
+void drop_inmem_pages(struct inode *inode)
+{
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+
+ mutex_lock(&fi->inmem_lock);
+ __revoke_inmem_pages(inode, &fi->inmem_pages, true, false);
+ mutex_unlock(&fi->inmem_lock);
+}
+
+static int __commit_inmem_pages(struct inode *inode,
+ struct list_head *revoke_list)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct f2fs_inode_info *fi = F2FS_I(inode);
struct inmem_pages *cur, *tmp;
- bool submit_bio = false;
struct f2fs_io_info fio = {
.sbi = sbi,
.type = DATA,
.rw = WRITE_SYNC | REQ_PRIO,
.encrypted_page = NULL,
};
+ bool submit_bio = false;
int err = 0;
- /*
- * The abort is true only when f2fs_evict_inode is called.
- * Basically, the f2fs_evict_inode doesn't produce any data writes, so
- * that we don't need to call f2fs_balance_fs.
- * Otherwise, f2fs_gc in f2fs_balance_fs can wait forever until this
- * inode becomes free by iget_locked in f2fs_iget.
- */
- if (!abort) {
- f2fs_balance_fs(sbi, true);
- f2fs_lock_op(sbi);
- }
-
- mutex_lock(&fi->inmem_lock);
list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) {
- lock_page(cur->page);
- if (!abort) {
- if (cur->page->mapping == inode->i_mapping) {
- set_page_dirty(cur->page);
- f2fs_wait_on_page_writeback(cur->page, DATA);
- if (clear_page_dirty_for_io(cur->page))
- inode_dec_dirty_pages(inode);
- trace_f2fs_commit_inmem_page(cur->page, INMEM);
- fio.page = cur->page;
- err = do_write_data_page(&fio);
- if (err) {
- unlock_page(cur->page);
- break;
- }
- clear_cold_data(cur->page);
- submit_bio = true;
+ struct page *page = cur->page;
+
+ lock_page(page);
+ if (page->mapping == inode->i_mapping) {
+ trace_f2fs_commit_inmem_page(page, INMEM);
+
+ set_page_dirty(page);
+ f2fs_wait_on_page_writeback(page, DATA, true);
+ if (clear_page_dirty_for_io(page))
+ inode_dec_dirty_pages(inode);
+
+ fio.page = page;
+ err = do_write_data_page(&fio);
+ if (err) {
+ unlock_page(page);
+ break;
}
- } else {
- ClearPageUptodate(cur->page);
- trace_f2fs_commit_inmem_page(cur->page, INMEM_DROP);
+
+ /* record old blkaddr for revoking */
+ cur->old_addr = fio.old_blkaddr;
+
+ clear_cold_data(page);
+ submit_bio = true;
}
- set_page_private(cur->page, 0);
- ClearPagePrivate(cur->page);
- f2fs_put_page(cur->page, 1);
+ unlock_page(page);
+ list_move_tail(&cur->list, revoke_list);
+ }
- list_del(&cur->list);
- kmem_cache_free(inmem_entry_slab, cur);
- dec_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES);
+ if (submit_bio)
+ f2fs_submit_merged_bio_cond(sbi, inode, NULL, 0, DATA, WRITE);
+
+ if (!err)
+ __revoke_inmem_pages(inode, revoke_list, false, false);
+
+ return err;
+}
+
+int commit_inmem_pages(struct inode *inode)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+ struct list_head revoke_list;
+ int err;
+
+ INIT_LIST_HEAD(&revoke_list);
+ f2fs_balance_fs(sbi, true);
+ f2fs_lock_op(sbi);
+
+ mutex_lock(&fi->inmem_lock);
+ err = __commit_inmem_pages(inode, &revoke_list);
+ if (err) {
+ int ret;
+ /*
+ * try to revoke all committed pages, but still we could fail
+ * due to no memory or other reason, if that happened, EAGAIN
+ * will be returned, which means in such case, transaction is
+ * already not integrity, caller should use journal to do the
+ * recovery or rewrite & commit last transaction. For other
+ * error number, revoking was done by filesystem itself.
+ */
+ ret = __revoke_inmem_pages(inode, &revoke_list, false, true);
+ if (ret)
+ err = ret;
+
+ /* drop all uncommitted pages */
+ __revoke_inmem_pages(inode, &fi->inmem_pages, true, false);
}
mutex_unlock(&fi->inmem_lock);
- if (!abort) {
- f2fs_unlock_op(sbi);
- if (submit_bio)
- f2fs_submit_merged_bio(sbi, DATA, WRITE);
- }
+ f2fs_unlock_op(sbi);
return err;
}
@@ -291,11 +366,17 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
/* checkpoint is the only way to shrink partial cached entries */
if (!available_free_memory(sbi, NAT_ENTRIES) ||
- excess_prefree_segs(sbi) ||
!available_free_memory(sbi, INO_ENTRIES) ||
+ excess_prefree_segs(sbi) ||
+ excess_dirty_nats(sbi) ||
(is_idle(sbi) && f2fs_time_over(sbi, CP_TIME))) {
- if (test_opt(sbi, DATA_FLUSH))
+ if (test_opt(sbi, DATA_FLUSH)) {
+ struct blk_plug plug;
+
+ blk_start_plug(&plug);
sync_dirty_inodes(sbi, FILE_INODE);
+ blk_finish_plug(&plug);
+ }
f2fs_sync_fs(sbi->sb, true);
stat_inc_bg_cp_count(sbi->stat_info);
}
@@ -502,7 +583,7 @@ static int f2fs_issue_discard(struct f2fs_sb_info *sbi,
bool discard_next_dnode(struct f2fs_sb_info *sbi, block_t blkaddr)
{
- int err = -ENOTSUPP;
+ int err = -EOPNOTSUPP;
if (test_opt(sbi, DISCARD)) {
struct seg_entry *se = get_seg_entry(sbi,
@@ -841,6 +922,31 @@ static void write_sum_page(struct f2fs_sb_info *sbi,
update_meta_page(sbi, (void *)sum_blk, blk_addr);
}
+static void write_current_sum_page(struct f2fs_sb_info *sbi,
+ int type, block_t blk_addr)
+{
+ struct curseg_info *curseg = CURSEG_I(sbi, type);
+ struct page *page = grab_meta_page(sbi, blk_addr);
+ struct f2fs_summary_block *src = curseg->sum_blk;
+ struct f2fs_summary_block *dst;
+
+ dst = (struct f2fs_summary_block *)page_address(page);
+
+ mutex_lock(&curseg->curseg_mutex);
+
+ down_read(&curseg->journal_rwsem);
+ memcpy(&dst->journal, curseg->journal, SUM_JOURNAL_SIZE);
+ up_read(&curseg->journal_rwsem);
+
+ memcpy(dst->entries, src->entries, SUM_ENTRY_SIZE);
+ memcpy(&dst->footer, &src->footer, SUM_FOOTER_SIZE);
+
+ mutex_unlock(&curseg->curseg_mutex);
+
+ set_page_dirty(page);
+ f2fs_put_page(page, 1);
+}
+
static int is_next_segment_free(struct f2fs_sb_info *sbi, int type)
{
struct curseg_info *curseg = CURSEG_I(sbi, type);
@@ -873,9 +979,8 @@ static void get_new_segment(struct f2fs_sb_info *sbi,
if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) {
segno = find_next_zero_bit(free_i->free_segmap,
- MAIN_SEGS(sbi), *newseg + 1);
- if (segno - *newseg < sbi->segs_per_sec -
- (*newseg % sbi->segs_per_sec))
+ (hint + 1) * sbi->segs_per_sec, *newseg + 1);
+ if (segno < (hint + 1) * sbi->segs_per_sec)
goto got_it;
}
find_other_zone:
@@ -1280,8 +1385,8 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio)
{
int type = __get_segment_type(fio->page, fio->type);
- allocate_data_block(fio->sbi, fio->page, fio->blk_addr,
- &fio->blk_addr, sum, type);
+ allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr,
+ &fio->new_blkaddr, sum, type);
/* writeout dirty page into bdev */
f2fs_submit_page_mbio(fio);
@@ -1293,7 +1398,8 @@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page)
.sbi = sbi,
.type = META,
.rw = WRITE_SYNC | REQ_META | REQ_PRIO,
- .blk_addr = page->index,
+ .old_blkaddr = page->index,
+ .new_blkaddr = page->index,
.page = page,
.encrypted_page = NULL,
};
@@ -1323,19 +1429,19 @@ void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio)
get_node_info(sbi, dn->nid, &ni);
set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version);
do_write_page(&sum, fio);
- dn->data_blkaddr = fio->blk_addr;
+ f2fs_update_data_blkaddr(dn, fio->new_blkaddr);
}
void rewrite_data_page(struct f2fs_io_info *fio)
{
+ fio->new_blkaddr = fio->old_blkaddr;
stat_inc_inplace_blocks(fio->sbi);
f2fs_submit_page_mbio(fio);
}
-static void __f2fs_replace_block(struct f2fs_sb_info *sbi,
- struct f2fs_summary *sum,
+void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
block_t old_blkaddr, block_t new_blkaddr,
- bool recover_curseg)
+ bool recover_curseg, bool recover_newaddr)
{
struct sit_info *sit_i = SIT_I(sbi);
struct curseg_info *curseg;
@@ -1378,7 +1484,7 @@ static void __f2fs_replace_block(struct f2fs_sb_info *sbi,
curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr);
__add_sum_entry(sbi, type, sum);
- if (!recover_curseg)
+ if (!recover_curseg || recover_newaddr)
update_sit_entry(sbi, new_blkaddr, 1);
if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO)
update_sit_entry(sbi, old_blkaddr, -1);
@@ -1402,66 +1508,30 @@ static void __f2fs_replace_block(struct f2fs_sb_info *sbi,
void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn,
block_t old_addr, block_t new_addr,
- unsigned char version, bool recover_curseg)
+ unsigned char version, bool recover_curseg,
+ bool recover_newaddr)
{
struct f2fs_summary sum;
set_summary(&sum, dn->nid, dn->ofs_in_node, version);
- __f2fs_replace_block(sbi, &sum, old_addr, new_addr, recover_curseg);
+ __f2fs_replace_block(sbi, &sum, old_addr, new_addr,
+ recover_curseg, recover_newaddr);
- dn->data_blkaddr = new_addr;
- set_data_blkaddr(dn);
- f2fs_update_extent_cache(dn);
-}
-
-static inline bool is_merged_page(struct f2fs_sb_info *sbi,
- struct page *page, enum page_type type)
-{
- enum page_type btype = PAGE_TYPE_OF_BIO(type);
- struct f2fs_bio_info *io = &sbi->write_io[btype];
- struct bio_vec *bvec;
- struct page *target;
- int i;
-
- down_read(&io->io_rwsem);
- if (!io->bio) {
- up_read(&io->io_rwsem);
- return false;
- }
-
- bio_for_each_segment_all(bvec, io->bio, i) {
-
- if (bvec->bv_page->mapping) {
- target = bvec->bv_page;
- } else {
- struct f2fs_crypto_ctx *ctx;
-
- /* encrypted page */
- ctx = (struct f2fs_crypto_ctx *)page_private(
- bvec->bv_page);
- target = ctx->w.control_page;
- }
-
- if (page == target) {
- up_read(&io->io_rwsem);
- return true;
- }
- }
-
- up_read(&io->io_rwsem);
- return false;
+ f2fs_update_data_blkaddr(dn, new_addr);
}
void f2fs_wait_on_page_writeback(struct page *page,
- enum page_type type)
+ enum page_type type, bool ordered)
{
if (PageWriteback(page)) {
struct f2fs_sb_info *sbi = F2FS_P_SB(page);
- if (is_merged_page(sbi, page, type))
- f2fs_submit_merged_bio(sbi, type, WRITE);
- wait_on_page_writeback(page);
+ f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, type, WRITE);
+ if (ordered)
+ wait_on_page_writeback(page);
+ else
+ wait_for_stable_page(page);
}
}
@@ -1477,7 +1547,7 @@ void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *sbi,
cpage = find_lock_page(META_MAPPING(sbi), blkaddr);
if (cpage) {
- f2fs_wait_on_page_writeback(cpage, DATA);
+ f2fs_wait_on_page_writeback(cpage, DATA, true);
f2fs_put_page(cpage, 1);
}
}
@@ -1498,12 +1568,11 @@ static int read_compacted_summaries(struct f2fs_sb_info *sbi)
/* Step 1: restore nat cache */
seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA);
- memcpy(&seg_i->sum_blk->n_nats, kaddr, SUM_JOURNAL_SIZE);
+ memcpy(seg_i->journal, kaddr, SUM_JOURNAL_SIZE);
/* Step 2: restore sit cache */
seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA);
- memcpy(&seg_i->sum_blk->n_sits, kaddr + SUM_JOURNAL_SIZE,
- SUM_JOURNAL_SIZE);
+ memcpy(seg_i->journal, kaddr + SUM_JOURNAL_SIZE, SUM_JOURNAL_SIZE);
offset = 2 * SUM_JOURNAL_SIZE;
/* Step 3: restore summary entries */
@@ -1599,7 +1668,14 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type)
/* set uncompleted segment to curseg */
curseg = CURSEG_I(sbi, type);
mutex_lock(&curseg->curseg_mutex);
- memcpy(curseg->sum_blk, sum, PAGE_CACHE_SIZE);
+
+ /* update journal info */
+ down_write(&curseg->journal_rwsem);
+ memcpy(curseg->journal, &sum->journal, SUM_JOURNAL_SIZE);
+ up_write(&curseg->journal_rwsem);
+
+ memcpy(curseg->sum_blk->entries, sum->entries, SUM_ENTRY_SIZE);
+ memcpy(&curseg->sum_blk->footer, &sum->footer, SUM_FOOTER_SIZE);
curseg->next_segno = segno;
reset_curseg(sbi, type, 0);
curseg->alloc_type = ckpt->alloc_type[type];
@@ -1654,13 +1730,12 @@ static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr)
/* Step 1: write nat cache */
seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA);
- memcpy(kaddr, &seg_i->sum_blk->n_nats, SUM_JOURNAL_SIZE);
+ memcpy(kaddr, seg_i->journal, SUM_JOURNAL_SIZE);
written_size += SUM_JOURNAL_SIZE;
/* Step 2: write sit cache */
seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA);
- memcpy(kaddr + written_size, &seg_i->sum_blk->n_sits,
- SUM_JOURNAL_SIZE);
+ memcpy(kaddr + written_size, seg_i->journal, SUM_JOURNAL_SIZE);
written_size += SUM_JOURNAL_SIZE;
/* Step 3: write summary entries */
@@ -1706,12 +1781,8 @@ static void write_normal_summaries(struct f2fs_sb_info *sbi,
else
end = type + NR_CURSEG_NODE_TYPE;
- for (i = type; i < end; i++) {
- struct curseg_info *sum = CURSEG_I(sbi, i);
- mutex_lock(&sum->curseg_mutex);
- write_sum_page(sbi, sum->sum_blk, blkaddr + (i - type));
- mutex_unlock(&sum->curseg_mutex);
- }
+ for (i = type; i < end; i++)
+ write_current_sum_page(sbi, i, blkaddr + (i - type));
}
void write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk)
@@ -1727,24 +1798,24 @@ void write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk)
write_normal_summaries(sbi, start_blk, CURSEG_HOT_NODE);
}
-int lookup_journal_in_cursum(struct f2fs_summary_block *sum, int type,
+int lookup_journal_in_cursum(struct f2fs_journal *journal, int type,
unsigned int val, int alloc)
{
int i;
if (type == NAT_JOURNAL) {
- for (i = 0; i < nats_in_cursum(sum); i++) {
- if (le32_to_cpu(nid_in_journal(sum, i)) == val)
+ for (i = 0; i < nats_in_cursum(journal); i++) {
+ if (le32_to_cpu(nid_in_journal(journal, i)) == val)
return i;
}
- if (alloc && __has_cursum_space(sum, 1, NAT_JOURNAL))
- return update_nats_in_cursum(sum, 1);
+ if (alloc && __has_cursum_space(journal, 1, NAT_JOURNAL))
+ return update_nats_in_cursum(journal, 1);
} else if (type == SIT_JOURNAL) {
- for (i = 0; i < sits_in_cursum(sum); i++)
- if (le32_to_cpu(segno_in_journal(sum, i)) == val)
+ for (i = 0; i < sits_in_cursum(journal); i++)
+ if (le32_to_cpu(segno_in_journal(journal, i)) == val)
return i;
- if (alloc && __has_cursum_space(sum, 1, SIT_JOURNAL))
- return update_sits_in_cursum(sum, 1);
+ if (alloc && __has_cursum_space(journal, 1, SIT_JOURNAL))
+ return update_sits_in_cursum(journal, 1);
}
return -1;
}
@@ -1848,20 +1919,22 @@ static void add_sits_in_set(struct f2fs_sb_info *sbi)
static void remove_sits_in_journal(struct f2fs_sb_info *sbi)
{
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
- struct f2fs_summary_block *sum = curseg->sum_blk;
+ struct f2fs_journal *journal = curseg->journal;
int i;
- for (i = sits_in_cursum(sum) - 1; i >= 0; i--) {
+ down_write(&curseg->journal_rwsem);
+ for (i = 0; i < sits_in_cursum(journal); i++) {
unsigned int segno;
bool dirtied;
- segno = le32_to_cpu(segno_in_journal(sum, i));
+ segno = le32_to_cpu(segno_in_journal(journal, i));
dirtied = __mark_sit_entry_dirty(sbi, segno);
if (!dirtied)
add_sit_entry(segno, &SM_I(sbi)->sit_entry_set);
}
- update_sits_in_cursum(sum, -sits_in_cursum(sum));
+ update_sits_in_cursum(journal, -i);
+ up_write(&curseg->journal_rwsem);
}
/*
@@ -1873,13 +1946,12 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
struct sit_info *sit_i = SIT_I(sbi);
unsigned long *bitmap = sit_i->dirty_sentries_bitmap;
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
- struct f2fs_summary_block *sum = curseg->sum_blk;
+ struct f2fs_journal *journal = curseg->journal;
struct sit_entry_set *ses, *tmp;
struct list_head *head = &SM_I(sbi)->sit_entry_set;
bool to_journal = true;
struct seg_entry *se;
- mutex_lock(&curseg->curseg_mutex);
mutex_lock(&sit_i->sentry_lock);
if (!sit_i->dirty_sentries)
@@ -1896,7 +1968,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
* entries, remove all entries from journal and add and account
* them in sit entry set.
*/
- if (!__has_cursum_space(sum, sit_i->dirty_sentries, SIT_JOURNAL))
+ if (!__has_cursum_space(journal, sit_i->dirty_sentries, SIT_JOURNAL))
remove_sits_in_journal(sbi);
/*
@@ -1913,10 +1985,12 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
unsigned int segno = start_segno;
if (to_journal &&
- !__has_cursum_space(sum, ses->entry_cnt, SIT_JOURNAL))
+ !__has_cursum_space(journal, ses->entry_cnt, SIT_JOURNAL))
to_journal = false;
- if (!to_journal) {
+ if (to_journal) {
+ down_write(&curseg->journal_rwsem);
+ } else {
page = get_next_sit_page(sbi, start_segno);
raw_sit = page_address(page);
}
@@ -1934,13 +2008,13 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
}
if (to_journal) {
- offset = lookup_journal_in_cursum(sum,
+ offset = lookup_journal_in_cursum(journal,
SIT_JOURNAL, segno, 1);
f2fs_bug_on(sbi, offset < 0);
- segno_in_journal(sum, offset) =
+ segno_in_journal(journal, offset) =
cpu_to_le32(segno);
seg_info_to_raw_sit(se,
- &sit_in_journal(sum, offset));
+ &sit_in_journal(journal, offset));
} else {
sit_offset = SIT_ENTRY_OFFSET(sit_i, segno);
seg_info_to_raw_sit(se,
@@ -1952,7 +2026,9 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
ses->entry_cnt--;
}
- if (!to_journal)
+ if (to_journal)
+ up_write(&curseg->journal_rwsem);
+ else
f2fs_put_page(page, 1);
f2fs_bug_on(sbi, ses->entry_cnt);
@@ -1967,7 +2043,6 @@ out:
add_discard_addrs(sbi, cpc);
}
mutex_unlock(&sit_i->sentry_lock);
- mutex_unlock(&curseg->curseg_mutex);
set_prefree_as_free_segments(sbi);
}
@@ -2099,6 +2174,11 @@ static int build_curseg(struct f2fs_sb_info *sbi)
array[i].sum_blk = kzalloc(PAGE_CACHE_SIZE, GFP_KERNEL);
if (!array[i].sum_blk)
return -ENOMEM;
+ init_rwsem(&array[i].journal_rwsem);
+ array[i].journal = kzalloc(sizeof(struct f2fs_journal),
+ GFP_KERNEL);
+ if (!array[i].journal)
+ return -ENOMEM;
array[i].segno = NULL_SEGNO;
array[i].next_blkoff = 0;
}
@@ -2109,11 +2189,11 @@ static void build_sit_entries(struct f2fs_sb_info *sbi)
{
struct sit_info *sit_i = SIT_I(sbi);
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
- struct f2fs_summary_block *sum = curseg->sum_blk;
+ struct f2fs_journal *journal = curseg->journal;
int sit_blk_cnt = SIT_BLK_CNT(sbi);
unsigned int i, start, end;
unsigned int readed, start_blk = 0;
- int nrpages = MAX_BIO_BLOCKS(sbi);
+ int nrpages = MAX_BIO_BLOCKS(sbi) * 8;
do {
readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT, true);
@@ -2127,16 +2207,16 @@ static void build_sit_entries(struct f2fs_sb_info *sbi)
struct f2fs_sit_entry sit;
struct page *page;
- mutex_lock(&curseg->curseg_mutex);
- for (i = 0; i < sits_in_cursum(sum); i++) {
- if (le32_to_cpu(segno_in_journal(sum, i))
+ down_read(&curseg->journal_rwsem);
+ for (i = 0; i < sits_in_cursum(journal); i++) {
+ if (le32_to_cpu(segno_in_journal(journal, i))
== start) {
- sit = sit_in_journal(sum, i);
- mutex_unlock(&curseg->curseg_mutex);
+ sit = sit_in_journal(journal, i);
+ up_read(&curseg->journal_rwsem);
goto got_it;
}
}
- mutex_unlock(&curseg->curseg_mutex);
+ up_read(&curseg->journal_rwsem);
page = get_current_sit_page(sbi, start);
sit_blk = (struct f2fs_sit_block *)page_address(page);
@@ -2371,8 +2451,10 @@ static void destroy_curseg(struct f2fs_sb_info *sbi)
if (!array)
return;
SM_I(sbi)->curseg_array = NULL;
- for (i = 0; i < NR_CURSEG_TYPE; i++)
+ for (i = 0; i < NR_CURSEG_TYPE; i++) {
kfree(array[i].sum_blk);
+ kfree(array[i].journal);
+ }
kfree(array);
}
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index ee44d346ea44..975c33df65c7 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -183,7 +183,7 @@ struct segment_allocation {
* this value is set in page as a private data which indicate that
* the page is atomically written, and it is in inmem_pages list.
*/
-#define ATOMIC_WRITTEN_PAGE 0x0000ffff
+#define ATOMIC_WRITTEN_PAGE ((unsigned long)-1)
#define IS_ATOMIC_WRITTEN_PAGE(page) \
(page_private(page) == (unsigned long)ATOMIC_WRITTEN_PAGE)
@@ -191,6 +191,7 @@ struct segment_allocation {
struct inmem_pages {
struct list_head list;
struct page *page;
+ block_t old_addr; /* for revoking when fail to commit */
};
struct sit_info {
@@ -257,6 +258,8 @@ struct victim_selection {
struct curseg_info {
struct mutex curseg_mutex; /* lock for consistency */
struct f2fs_summary_block *sum_blk; /* cached summary block */
+ struct rw_semaphore journal_rwsem; /* protect journal area */
+ struct f2fs_journal *journal; /* cached journal info */
unsigned char alloc_type; /* current allocation type */
unsigned int segno; /* current segment number */
unsigned short next_blkoff; /* next block offset to write */
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 6134832baaaf..15bb81f8dac2 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -126,6 +126,19 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type)
return NULL;
}
+static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a,
+ struct f2fs_sb_info *sbi, char *buf)
+{
+ struct super_block *sb = sbi->sb;
+
+ if (!sb->s_bdev->bd_part)
+ return snprintf(buf, PAGE_SIZE, "0\n");
+
+ return snprintf(buf, PAGE_SIZE, "%llu\n",
+ (unsigned long long)(sbi->kbytes_written +
+ BD_PART_WRITTEN(sbi)));
+}
+
static ssize_t f2fs_sbi_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
@@ -204,6 +217,9 @@ static struct f2fs_attr f2fs_attr_##_name = { \
f2fs_sbi_show, f2fs_sbi_store, \
offsetof(struct struct_name, elname))
+#define F2FS_GENERAL_RO_ATTR(name) \
+static struct f2fs_attr f2fs_attr_##name = __ATTR(name, 0444, name##_show, NULL)
+
F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_min_sleep_time, min_sleep_time);
F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_max_sleep_time, max_sleep_time);
F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time);
@@ -216,10 +232,12 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util);
F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks);
F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh);
F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages);
+F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, dirty_nats_ratio, dirty_nats_ratio);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, interval_time[CP_TIME]);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]);
+F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes);
#define ATTR_LIST(name) (&f2fs_attr_##name.attr)
static struct attribute *f2fs_attrs[] = {
@@ -237,8 +255,10 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(dir_level),
ATTR_LIST(ram_thresh),
ATTR_LIST(ra_nid_pages),
+ ATTR_LIST(dirty_nats_ratio),
ATTR_LIST(cp_interval),
ATTR_LIST(idle_interval),
+ ATTR_LIST(lifetime_write_kbytes),
NULL,
};
@@ -450,10 +470,6 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
/* Will be used by directory only */
fi->i_dir_level = F2FS_SB(sb)->dir_level;
-
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
- fi->i_crypt_info = NULL;
-#endif
return &fi->vfs_inode;
}
@@ -474,7 +490,7 @@ static int f2fs_drop_inode(struct inode *inode)
/* some remained atomic pages should discarded */
if (f2fs_is_atomic_file(inode))
- commit_inmem_pages(inode, true);
+ drop_inmem_pages(inode);
/* should remain fi->extent_tree for writepage */
f2fs_destroy_extent_node(inode);
@@ -487,11 +503,7 @@ static int f2fs_drop_inode(struct inode *inode)
sb_end_intwrite(inode->i_sb);
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
- if (F2FS_I(inode)->i_crypt_info)
- f2fs_free_encryption_info(inode,
- F2FS_I(inode)->i_crypt_info);
-#endif
+ fscrypt_put_encryption_info(inode, NULL);
spin_lock(&inode->i_lock);
atomic_dec(&inode->i_count);
}
@@ -562,6 +574,10 @@ static void f2fs_put_super(struct super_block *sb)
f2fs_leave_shrinker(sbi);
mutex_unlock(&sbi->umount_mutex);
+ /* our cp_error case, we can wait for any writeback page */
+ if (get_pages(sbi, F2FS_WRITEBACK))
+ f2fs_flush_merged_bios(sbi);
+
iput(sbi->node_inode);
iput(sbi->meta_inode);
@@ -574,6 +590,8 @@ static void f2fs_put_super(struct super_block *sb)
wait_for_completion(&sbi->s_kobj_unregister);
sb->s_fs_info = NULL;
+ if (sbi->s_chksum_driver)
+ crypto_free_shash(sbi->s_chksum_driver);
kfree(sbi->raw_super);
kfree(sbi);
}
@@ -766,8 +784,6 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
bool need_stop_gc = false;
bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE);
- sync_filesystem(sb);
-
/*
* Save the old mount options in case we
* need to restore them.
@@ -775,6 +791,13 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
org_mount_opt = sbi->mount_opt;
active_logs = sbi->active_logs;
+ if (*flags & MS_RDONLY) {
+ set_opt(sbi, FASTBOOT);
+ set_sbi_flag(sbi, SBI_IS_DIRTY);
+ }
+
+ sync_filesystem(sb);
+
sbi->mount_opt.opt = 0;
default_options(sbi);
@@ -862,6 +885,41 @@ static struct super_operations f2fs_sops = {
.remount_fs = f2fs_remount,
};
+#ifdef CONFIG_F2FS_FS_ENCRYPTION
+static int f2fs_get_context(struct inode *inode, void *ctx, size_t len)
+{
+ return f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION,
+ F2FS_XATTR_NAME_ENCRYPTION_CONTEXT,
+ ctx, len, NULL);
+}
+
+static int f2fs_set_context(struct inode *inode, const void *ctx, size_t len,
+ void *fs_data)
+{
+ return f2fs_setxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION,
+ F2FS_XATTR_NAME_ENCRYPTION_CONTEXT,
+ ctx, len, fs_data, XATTR_CREATE);
+}
+
+static unsigned f2fs_max_namelen(struct inode *inode)
+{
+ return S_ISLNK(inode->i_mode) ?
+ inode->i_sb->s_blocksize : F2FS_NAME_LEN;
+}
+
+static struct fscrypt_operations f2fs_cryptops = {
+ .get_context = f2fs_get_context,
+ .set_context = f2fs_set_context,
+ .is_encrypted = f2fs_encrypted_inode,
+ .empty_dir = f2fs_empty_dir,
+ .max_namelen = f2fs_max_namelen,
+};
+#else
+static struct fscrypt_operations f2fs_cryptops = {
+ .is_encrypted = f2fs_encrypted_inode,
+};
+#endif
+
static struct inode *f2fs_nfs_get_inode(struct super_block *sb,
u64 ino, u32 generation)
{
@@ -1074,7 +1132,7 @@ static int sanity_check_raw_super(struct super_block *sb,
return 0;
}
-static int sanity_check_ckpt(struct f2fs_sb_info *sbi)
+int sanity_check_ckpt(struct f2fs_sb_info *sbi)
{
unsigned int total, fsmeta;
struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
@@ -1134,14 +1192,15 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
/*
* Read f2fs raw super block.
- * Because we have two copies of super block, so read the first one at first,
- * if the first one is invalid, move to read the second one.
+ * Because we have two copies of super block, so read both of them
+ * to get the first valid one. If any one of them is broken, we pass
+ * them recovery flag back to the caller.
*/
static int read_raw_super_block(struct super_block *sb,
struct f2fs_super_block **raw_super,
int *valid_super_block, int *recovery)
{
- int block = 0;
+ int block;
struct buffer_head *bh;
struct f2fs_super_block *super, *buf;
int err = 0;
@@ -1149,50 +1208,48 @@ static int read_raw_super_block(struct super_block *sb,
super = kzalloc(sizeof(struct f2fs_super_block), GFP_KERNEL);
if (!super)
return -ENOMEM;
-retry:
- bh = sb_bread(sb, block);
- if (!bh) {
- *recovery = 1;
- f2fs_msg(sb, KERN_ERR, "Unable to read %dth superblock",
+
+ for (block = 0; block < 2; block++) {
+ bh = sb_bread(sb, block);
+ if (!bh) {
+ f2fs_msg(sb, KERN_ERR, "Unable to read %dth superblock",
block + 1);
- err = -EIO;
- goto next;
- }
+ err = -EIO;
+ continue;
+ }
- buf = (struct f2fs_super_block *)(bh->b_data + F2FS_SUPER_OFFSET);
+ buf = (struct f2fs_super_block *)
+ (bh->b_data + F2FS_SUPER_OFFSET);
- /* sanity checking of raw super */
- if (sanity_check_raw_super(sb, buf)) {
- brelse(bh);
- *recovery = 1;
- f2fs_msg(sb, KERN_ERR,
- "Can't find valid F2FS filesystem in %dth superblock",
- block + 1);
- err = -EINVAL;
- goto next;
- }
+ /* sanity checking of raw super */
+ if (sanity_check_raw_super(sb, buf)) {
+ f2fs_msg(sb, KERN_ERR,
+ "Can't find valid F2FS filesystem in %dth superblock",
+ block + 1);
+ err = -EINVAL;
+ brelse(bh);
+ continue;
+ }
- if (!*raw_super) {
- memcpy(super, buf, sizeof(*super));
- *valid_super_block = block;
- *raw_super = super;
+ if (!*raw_super) {
+ memcpy(super, buf, sizeof(*super));
+ *valid_super_block = block;
+ *raw_super = super;
+ }
+ brelse(bh);
}
- brelse(bh);
-next:
- /* check the validity of the second superblock */
- if (block == 0) {
- block++;
- goto retry;
- }
+ /* Fail to read any one of the superblocks*/
+ if (err < 0)
+ *recovery = 1;
/* No valid superblock */
- if (!*raw_super) {
+ if (!*raw_super)
kfree(super);
- return err;
- }
+ else
+ err = 0;
- return 0;
+ return err;
}
static int __f2fs_commit_super(struct f2fs_sb_info *sbi, int block)
@@ -1242,6 +1299,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
bool retry = true, need_fsck = false;
char *options = NULL;
int recovery, i, valid_super_block;
+ struct curseg_info *seg_i;
try_onemore:
err = -EINVAL;
@@ -1254,6 +1312,15 @@ try_onemore:
if (!sbi)
return -ENOMEM;
+ /* Load the checksum driver */
+ sbi->s_chksum_driver = crypto_alloc_shash("crc32", 0, 0);
+ if (IS_ERR(sbi->s_chksum_driver)) {
+ f2fs_msg(sb, KERN_ERR, "Cannot load crc32 driver.");
+ err = PTR_ERR(sbi->s_chksum_driver);
+ sbi->s_chksum_driver = NULL;
+ goto free_sbi;
+ }
+
/* set a block size */
if (unlikely(!sb_set_blocksize(sb, F2FS_BLKSIZE))) {
f2fs_msg(sb, KERN_ERR, "unable to set blocksize");
@@ -1285,6 +1352,7 @@ try_onemore:
get_random_bytes(&sbi->s_next_generation, sizeof(u32));
sb->s_op = &f2fs_sops;
+ sb->s_cop = &f2fs_cryptops;
sb->s_xattr = f2fs_xattr_handlers;
sb->s_export_op = &f2fs_export_ops;
sb->s_magic = F2FS_SUPER_MAGIC;
@@ -1333,13 +1401,6 @@ try_onemore:
goto free_meta_inode;
}
- /* sanity checking of checkpoint */
- err = -EINVAL;
- if (sanity_check_ckpt(sbi)) {
- f2fs_msg(sb, KERN_ERR, "Invalid F2FS checkpoint");
- goto free_cp;
- }
-
sbi->total_valid_node_count =
le32_to_cpu(sbi->ckpt->valid_node_count);
sbi->total_valid_inode_count =
@@ -1372,6 +1433,17 @@ try_onemore:
goto free_nm;
}
+ /* For write statistics */
+ if (sb->s_bdev->bd_part)
+ sbi->sectors_written_start =
+ (u64)part_stat_read(sb->s_bdev->bd_part, sectors[1]);
+
+ /* Read accumulated write IO statistics if exists */
+ seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE);
+ if (__exist_node_summaries(sbi))
+ sbi->kbytes_written =
+ le64_to_cpu(seg_i->sum_blk->journal.info.kbytes_written);
+
build_gc_manager(sbi);
/* get an inode for node space */
@@ -1466,8 +1538,10 @@ try_onemore:
/* recover broken superblock */
if (recovery && !f2fs_readonly(sb) && !bdev_read_only(sb->s_bdev)) {
- f2fs_msg(sb, KERN_INFO, "Recover invalid superblock");
- f2fs_commit_super(sbi, true);
+ err = f2fs_commit_super(sbi, true);
+ f2fs_msg(sb, KERN_INFO,
+ "Try to recover %dth superblock, ret: %ld",
+ sbi->valid_super_block ? 1 : 2, err);
}
f2fs_update_time(sbi, CP_TIME);
@@ -1496,7 +1570,6 @@ free_nm:
destroy_node_manager(sbi);
free_sm:
destroy_segment_manager(sbi);
-free_cp:
kfree(sbi->ckpt);
free_meta_inode:
make_bad_inode(sbi->meta_inode);
@@ -1506,6 +1579,8 @@ free_options:
free_sb_buf:
kfree(raw_super);
free_sbi:
+ if (sbi->s_chksum_driver)
+ crypto_free_shash(sbi->s_chksum_driver);
kfree(sbi);
/* give only one another chance */
@@ -1585,13 +1660,9 @@ static int __init init_f2fs_fs(void)
err = -ENOMEM;
goto free_extent_cache;
}
- err = f2fs_init_crypto();
- if (err)
- goto free_kset;
-
err = register_shrinker(&f2fs_shrinker_info);
if (err)
- goto free_crypto;
+ goto free_kset;
err = register_filesystem(&f2fs_fs_type);
if (err)
@@ -1606,8 +1677,6 @@ free_filesystem:
unregister_filesystem(&f2fs_fs_type);
free_shrinker:
unregister_shrinker(&f2fs_shrinker_info);
-free_crypto:
- f2fs_exit_crypto();
free_kset:
kset_unregister(f2fs_kset);
free_extent_cache:
@@ -1630,7 +1699,6 @@ static void __exit exit_f2fs_fs(void)
f2fs_destroy_root_stats();
unregister_shrinker(&f2fs_shrinker_info);
unregister_filesystem(&f2fs_fs_type);
- f2fs_exit_crypto();
destroy_extent_cache();
destroy_checkpoint_caches();
destroy_segment_manager_caches();
diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c
index 145fb659ad44..562ce0821559 100644
--- a/fs/f2fs/trace.c
+++ b/fs/f2fs/trace.c
@@ -29,7 +29,8 @@ static inline void __print_last_io(void)
last_io.major, last_io.minor,
last_io.pid, "----------------",
last_io.type,
- last_io.fio.rw, last_io.fio.blk_addr,
+ last_io.fio.rw,
+ last_io.fio.new_blkaddr,
last_io.len);
memset(&last_io, 0, sizeof(last_io));
}
@@ -101,7 +102,8 @@ void f2fs_trace_ios(struct f2fs_io_info *fio, int flush)
last_io.pid == pid &&
last_io.type == __file_type(inode, pid) &&
last_io.fio.rw == fio->rw &&
- last_io.fio.blk_addr + last_io.len == fio->blk_addr) {
+ last_io.fio.new_blkaddr + last_io.len ==
+ fio->new_blkaddr) {
last_io.len++;
return;
}
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 10f1e784fa23..06a72dc0191a 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -300,7 +300,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
if (ipage) {
inline_addr = inline_xattr_addr(ipage);
- f2fs_wait_on_page_writeback(ipage, NODE);
+ f2fs_wait_on_page_writeback(ipage, NODE, true);
} else {
page = get_node_page(sbi, inode->i_ino);
if (IS_ERR(page)) {
@@ -308,7 +308,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
return PTR_ERR(page);
}
inline_addr = inline_xattr_addr(page);
- f2fs_wait_on_page_writeback(page, NODE);
+ f2fs_wait_on_page_writeback(page, NODE, true);
}
memcpy(inline_addr, txattr_addr, inline_size);
f2fs_put_page(page, 1);
@@ -329,7 +329,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
return PTR_ERR(xpage);
}
f2fs_bug_on(sbi, new_nid);
- f2fs_wait_on_page_writeback(xpage, NODE);
+ f2fs_wait_on_page_writeback(xpage, NODE, true);
} else {
struct dnode_of_data dn;
set_new_dnode(&dn, inode, NULL, NULL, new_nid);
diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h
index 79dccc8252dd..f990de20cdcd 100644
--- a/fs/f2fs/xattr.h
+++ b/fs/f2fs/xattr.h
@@ -126,7 +126,8 @@ extern ssize_t f2fs_listxattr(struct dentry *, char *, size_t);
#define f2fs_xattr_handlers NULL
static inline int f2fs_setxattr(struct inode *inode, int index,
- const char *name, const void *value, size_t size, int flags)
+ const char *name, const void *value, size_t size,
+ struct page *page, int flags)
{
return -EOPNOTSUPP;
}
diff --git a/fs/fat/Kconfig b/fs/fat/Kconfig
index 182f9ffe2b51..3ff1772f612e 100644
--- a/fs/fat/Kconfig
+++ b/fs/fat/Kconfig
@@ -93,8 +93,24 @@ config FAT_DEFAULT_IOCHARSET
that most of your FAT filesystems use, and can be overridden
with the "iocharset" mount option for FAT filesystems.
Note that "utf8" is not recommended for FAT filesystems.
- If unsure, you shouldn't set "utf8" here.
+ If unsure, you shouldn't set "utf8" here - select the next option
+ instead if you would like to use UTF-8 encoded file names by default.
See <file:Documentation/filesystems/vfat.txt> for more information.
Enable any character sets you need in File Systems/Native Language
Support.
+
+config FAT_DEFAULT_UTF8
+ bool "Enable FAT UTF-8 option by default"
+ depends on VFAT_FS
+ default n
+ help
+ Set this if you would like to have "utf8" mount option set
+ by default when mounting FAT filesystems.
+
+ Even if you say Y here can always disable UTF-8 for
+ particular mount by adding "utf8=0" to mount options.
+
+ Say Y if you use UTF-8 encoding for file names, N otherwise.
+
+ See <file:Documentation/filesystems/vfat.txt> for more information.
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index a5599052116c..226281068a46 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -1127,7 +1127,7 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat,
}
opts->name_check = 'n';
opts->quiet = opts->showexec = opts->sys_immutable = opts->dotsOK = 0;
- opts->utf8 = opts->unicode_xlate = 0;
+ opts->unicode_xlate = 0;
opts->numtail = 1;
opts->usefree = opts->nocase = 0;
opts->tz_set = 0;
@@ -1135,6 +1135,8 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat,
opts->errors = FAT_ERRORS_RO;
*debug = 0;
+ opts->utf8 = IS_ENABLED(CONFIG_FAT_DEFAULT_UTF8) && is_vfat;
+
if (!options)
goto out;
diff --git a/fs/fhandle.c b/fs/fhandle.c
index d59712dfa3e7..ca3c3dd01789 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -228,7 +228,7 @@ long do_handle_open(int mountdirfd,
path_put(&path);
return fd;
}
- file = file_open_root(path.dentry, path.mnt, "", open_flag);
+ file = file_open_root(path.dentry, path.mnt, "", open_flag, 0);
if (IS_ERR(file)) {
put_unused_fd(fd);
retval = PTR_ERR(file);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 1f76d8950a57..fee81e8768c9 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -223,6 +223,9 @@ static void wb_wait_for_completion(struct backing_dev_info *bdi,
#define WB_FRN_HIST_MAX_SLOTS (WB_FRN_HIST_THR_SLOTS / 2 + 1)
/* one round can affect upto 5 slots */
+static atomic_t isw_nr_in_flight = ATOMIC_INIT(0);
+static struct workqueue_struct *isw_wq;
+
void __inode_attach_wb(struct inode *inode, struct page *page)
{
struct backing_dev_info *bdi = inode_to_bdi(inode);
@@ -278,13 +281,15 @@ locked_inode_to_wb_and_lock_list(struct inode *inode)
wb_get(wb);
spin_unlock(&inode->i_lock);
spin_lock(&wb->list_lock);
- wb_put(wb); /* not gonna deref it anymore */
/* i_wb may have changed inbetween, can't use inode_to_wb() */
- if (likely(wb == inode->i_wb))
- return wb; /* @inode already has ref */
+ if (likely(wb == inode->i_wb)) {
+ wb_put(wb); /* @inode already has ref */
+ return wb;
+ }
spin_unlock(&wb->list_lock);
+ wb_put(wb);
cpu_relax();
spin_lock(&inode->i_lock);
}
@@ -317,7 +322,6 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
struct inode_switch_wbs_context *isw =
container_of(work, struct inode_switch_wbs_context, work);
struct inode *inode = isw->inode;
- struct super_block *sb = inode->i_sb;
struct address_space *mapping = inode->i_mapping;
struct bdi_writeback *old_wb = inode->i_wb;
struct bdi_writeback *new_wb = isw->new_wb;
@@ -424,8 +428,9 @@ skip_switch:
wb_put(new_wb);
iput(inode);
- deactivate_super(sb);
kfree(isw);
+
+ atomic_dec(&isw_nr_in_flight);
}
static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head)
@@ -435,7 +440,7 @@ static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head)
/* needs to grab bh-unsafe locks, bounce to work item */
INIT_WORK(&isw->work, inode_switch_wbs_work_fn);
- schedule_work(&isw->work);
+ queue_work(isw_wq, &isw->work);
}
/**
@@ -471,20 +476,20 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
/* while holding I_WB_SWITCH, no one else can update the association */
spin_lock(&inode->i_lock);
-
- if (inode->i_state & (I_WB_SWITCH | I_FREEING) ||
- inode_to_wb(inode) == isw->new_wb)
- goto out_unlock;
-
- if (!atomic_inc_not_zero(&inode->i_sb->s_active))
- goto out_unlock;
-
+ if (!(inode->i_sb->s_flags & MS_ACTIVE) ||
+ inode->i_state & (I_WB_SWITCH | I_FREEING) ||
+ inode_to_wb(inode) == isw->new_wb) {
+ spin_unlock(&inode->i_lock);
+ goto out_free;
+ }
inode->i_state |= I_WB_SWITCH;
spin_unlock(&inode->i_lock);
ihold(inode);
isw->inode = inode;
+ atomic_inc(&isw_nr_in_flight);
+
/*
* In addition to synchronizing among switchers, I_WB_SWITCH tells
* the RCU protected stat update paths to grab the mapping's
@@ -494,8 +499,6 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn);
return;
-out_unlock:
- spin_unlock(&inode->i_lock);
out_free:
if (isw->new_wb)
wb_put(isw->new_wb);
@@ -847,6 +850,33 @@ restart:
wb_put(last_wb);
}
+/**
+ * cgroup_writeback_umount - flush inode wb switches for umount
+ *
+ * This function is called when a super_block is about to be destroyed and
+ * flushes in-flight inode wb switches. An inode wb switch goes through
+ * RCU and then workqueue, so the two need to be flushed in order to ensure
+ * that all previously scheduled switches are finished. As wb switches are
+ * rare occurrences and synchronize_rcu() can take a while, perform
+ * flushing iff wb switches are in flight.
+ */
+void cgroup_writeback_umount(void)
+{
+ if (atomic_read(&isw_nr_in_flight)) {
+ synchronize_rcu();
+ flush_workqueue(isw_wq);
+ }
+}
+
+static int __init cgroup_writeback_init(void)
+{
+ isw_wq = alloc_workqueue("inode_switch_wbs", 0, 0);
+ if (!isw_wq)
+ return -ENOMEM;
+ return 0;
+}
+fs_initcall(cgroup_writeback_init);
+
#else /* CONFIG_CGROUP_WRITEBACK */
static struct bdi_writeback *
@@ -1309,10 +1339,10 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
* we go e.g. from filesystem. Flusher thread uses __writeback_single_inode()
* and does more profound writeback list handling in writeback_sb_inodes().
*/
-static int
-writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
- struct writeback_control *wbc)
+static int writeback_single_inode(struct inode *inode,
+ struct writeback_control *wbc)
{
+ struct bdi_writeback *wb;
int ret = 0;
spin_lock(&inode->i_lock);
@@ -1350,7 +1380,8 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
ret = __writeback_single_inode(inode, wbc);
wbc_detach_inode(wbc);
- spin_lock(&wb->list_lock);
+
+ wb = inode_to_wb_and_lock_list(inode);
spin_lock(&inode->i_lock);
/*
* If inode is clean, remove it from writeback lists. Otherwise don't
@@ -1425,6 +1456,7 @@ static long writeback_sb_inodes(struct super_block *sb,
while (!list_empty(&wb->b_io)) {
struct inode *inode = wb_inode(wb->b_io.prev);
+ struct bdi_writeback *tmp_wb;
if (inode->i_sb != sb) {
if (work->sb) {
@@ -1515,15 +1547,23 @@ static long writeback_sb_inodes(struct super_block *sb,
cond_resched();
}
-
- spin_lock(&wb->list_lock);
+ /*
+ * Requeue @inode if still dirty. Be careful as @inode may
+ * have been switched to another wb in the meantime.
+ */
+ tmp_wb = inode_to_wb_and_lock_list(inode);
spin_lock(&inode->i_lock);
if (!(inode->i_state & I_DIRTY_ALL))
wrote++;
- requeue_inode(inode, wb, &wbc);
+ requeue_inode(inode, tmp_wb, &wbc);
inode_sync_complete(inode);
spin_unlock(&inode->i_lock);
+ if (unlikely(tmp_wb != wb)) {
+ spin_unlock(&tmp_wb->list_lock);
+ spin_lock(&wb->list_lock);
+ }
+
/*
* bail out to wb_writeback() often enough to check
* background threshold and other termination conditions.
@@ -2310,7 +2350,6 @@ EXPORT_SYMBOL(sync_inodes_sb);
*/
int write_inode_now(struct inode *inode, int sync)
{
- struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
struct writeback_control wbc = {
.nr_to_write = LONG_MAX,
.sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE,
@@ -2322,7 +2361,7 @@ int write_inode_now(struct inode *inode, int sync)
wbc.nr_to_write = 0;
might_sleep();
- return writeback_single_inode(inode, wb, &wbc);
+ return writeback_single_inode(inode, &wbc);
}
EXPORT_SYMBOL(write_inode_now);
@@ -2339,7 +2378,7 @@ EXPORT_SYMBOL(write_inode_now);
*/
int sync_inode(struct inode *inode, struct writeback_control *wbc)
{
- return writeback_single_inode(inode, &inode_to_bdi(inode)->wb, wbc);
+ return writeback_single_inode(inode, wbc);
}
EXPORT_SYMBOL(sync_inode);
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index 8e3ee1936c7e..c5b6b7165489 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -90,7 +90,7 @@ static struct list_head *cuse_conntbl_head(dev_t devt)
static ssize_t cuse_read_iter(struct kiocb *kiocb, struct iov_iter *to)
{
- struct fuse_io_priv io = { .async = 0, .file = kiocb->ki_filp };
+ struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(kiocb->ki_filp);
loff_t pos = 0;
return fuse_direct_io(&io, to, &pos, FUSE_DIO_CUSE);
@@ -98,7 +98,7 @@ static ssize_t cuse_read_iter(struct kiocb *kiocb, struct iov_iter *to)
static ssize_t cuse_write_iter(struct kiocb *kiocb, struct iov_iter *from)
{
- struct fuse_io_priv io = { .async = 0, .file = kiocb->ki_filp };
+ struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(kiocb->ki_filp);
loff_t pos = 0;
/*
* No locking or generic_write_checks(), the server is
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index b03d253ece15..9dde38f12c07 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -528,6 +528,11 @@ static void fuse_release_user_pages(struct fuse_req *req, int write)
}
}
+static void fuse_io_release(struct kref *kref)
+{
+ kfree(container_of(kref, struct fuse_io_priv, refcnt));
+}
+
static ssize_t fuse_get_res_by_io(struct fuse_io_priv *io)
{
if (io->err)
@@ -585,8 +590,9 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
}
io->iocb->ki_complete(io->iocb, res, 0);
- kfree(io);
}
+
+ kref_put(&io->refcnt, fuse_io_release);
}
static void fuse_aio_complete_req(struct fuse_conn *fc, struct fuse_req *req)
@@ -613,6 +619,7 @@ static size_t fuse_async_req_send(struct fuse_conn *fc, struct fuse_req *req,
size_t num_bytes, struct fuse_io_priv *io)
{
spin_lock(&io->lock);
+ kref_get(&io->refcnt);
io->size += num_bytes;
io->reqs++;
spin_unlock(&io->lock);
@@ -691,7 +698,7 @@ static void fuse_short_read(struct fuse_req *req, struct inode *inode,
static int fuse_do_readpage(struct file *file, struct page *page)
{
- struct fuse_io_priv io = { .async = 0, .file = file };
+ struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(file);
struct inode *inode = page->mapping->host;
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_req *req;
@@ -984,7 +991,7 @@ static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file,
size_t res;
unsigned offset;
unsigned i;
- struct fuse_io_priv io = { .async = 0, .file = file };
+ struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(file);
for (i = 0; i < req->num_pages; i++)
fuse_wait_on_page_writeback(inode, req->pages[i]->index);
@@ -1240,6 +1247,7 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii,
size_t *nbytesp, int write)
{
size_t nbytes = 0; /* # bytes already packed in req */
+ ssize_t ret = 0;
/* Special case for kernel I/O: can copy directly into the buffer */
if (ii->type & ITER_KVEC) {
@@ -1259,13 +1267,12 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii,
while (nbytes < *nbytesp && req->num_pages < req->max_pages) {
unsigned npages;
size_t start;
- ssize_t ret = iov_iter_get_pages(ii,
- &req->pages[req->num_pages],
+ ret = iov_iter_get_pages(ii, &req->pages[req->num_pages],
*nbytesp - nbytes,
req->max_pages - req->num_pages,
&start);
if (ret < 0)
- return ret;
+ break;
iov_iter_advance(ii, ret);
nbytes += ret;
@@ -1288,7 +1295,7 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii,
*nbytesp = nbytes;
- return 0;
+ return ret;
}
static inline int fuse_iter_npages(const struct iov_iter *ii_p)
@@ -1312,6 +1319,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
pgoff_t idx_to = (pos + count - 1) >> PAGE_CACHE_SHIFT;
ssize_t res = 0;
struct fuse_req *req;
+ int err = 0;
if (io->async)
req = fuse_get_req_for_background(fc, fuse_iter_npages(iter));
@@ -1332,11 +1340,9 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
size_t nres;
fl_owner_t owner = current->files;
size_t nbytes = min(count, nmax);
- int err = fuse_get_user_pages(req, iter, &nbytes, write);
- if (err) {
- res = err;
+ err = fuse_get_user_pages(req, iter, &nbytes, write);
+ if (err && !nbytes)
break;
- }
if (write)
nres = fuse_send_write(req, io, pos, nbytes, owner);
@@ -1346,11 +1352,11 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
if (!io->async)
fuse_release_user_pages(req, !write);
if (req->out.h.error) {
- if (!res)
- res = req->out.h.error;
+ err = req->out.h.error;
break;
} else if (nres > nbytes) {
- res = -EIO;
+ res = 0;
+ err = -EIO;
break;
}
count -= nres;
@@ -1374,7 +1380,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
if (res > 0)
*ppos = pos;
- return res;
+ return res > 0 ? res : err;
}
EXPORT_SYMBOL_GPL(fuse_direct_io);
@@ -1398,7 +1404,7 @@ static ssize_t __fuse_direct_read(struct fuse_io_priv *io,
static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
- struct fuse_io_priv io = { .async = 0, .file = iocb->ki_filp };
+ struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb->ki_filp);
return __fuse_direct_read(&io, to, &iocb->ki_pos);
}
@@ -1406,7 +1412,7 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
- struct fuse_io_priv io = { .async = 0, .file = file };
+ struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(file);
ssize_t res;
if (is_bad_inode(inode))
@@ -2843,6 +2849,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
loff_t i_size;
size_t count = iov_iter_count(iter);
struct fuse_io_priv *io;
+ bool is_sync = is_sync_kiocb(iocb);
pos = offset;
inode = file->f_mapping->host;
@@ -2863,6 +2870,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
if (!io)
return -ENOMEM;
spin_lock_init(&io->lock);
+ kref_init(&io->refcnt);
io->reqs = 1;
io->bytes = -1;
io->size = 0;
@@ -2882,12 +2890,18 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
* to wait on real async I/O requests, so we must submit this request
* synchronously.
*/
- if (!is_sync_kiocb(iocb) && (offset + count > i_size) &&
+ if (!is_sync && (offset + count > i_size) &&
iov_iter_rw(iter) == WRITE)
io->async = false;
- if (io->async && is_sync_kiocb(iocb))
+ if (io->async && is_sync) {
+ /*
+ * Additional reference to keep io around after
+ * calling fuse_aio_complete()
+ */
+ kref_get(&io->refcnt);
io->done = &wait;
+ }
if (iov_iter_rw(iter) == WRITE) {
ret = fuse_direct_io(io, iter, &pos, FUSE_DIO_WRITE);
@@ -2900,14 +2914,14 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
fuse_aio_complete(io, ret < 0 ? ret : 0, -1);
/* we have a non-extending, async request, so return */
- if (!is_sync_kiocb(iocb))
+ if (!is_sync)
return -EIOCBQUEUED;
wait_for_completion(&wait);
ret = fuse_get_res_by_io(io);
}
- kfree(io);
+ kref_put(&io->refcnt, fuse_io_release);
if (iov_iter_rw(iter) == WRITE) {
if (ret > 0)
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index ce394b5fe6b4..eddbe02c4028 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -22,6 +22,7 @@
#include <linux/rbtree.h>
#include <linux/poll.h>
#include <linux/workqueue.h>
+#include <linux/kref.h>
/** Max number of pages that can be used in a single read request */
#define FUSE_MAX_PAGES_PER_REQ 32
@@ -243,6 +244,7 @@ struct fuse_args {
/** The request IO state (for asynchronous processing) */
struct fuse_io_priv {
+ struct kref refcnt;
int async;
spinlock_t lock;
unsigned reqs;
@@ -256,6 +258,13 @@ struct fuse_io_priv {
struct completion *done;
};
+#define FUSE_IO_PRIV_SYNC(f) \
+{ \
+ .refcnt = { ATOMIC_INIT(1) }, \
+ .async = 0, \
+ .file = f, \
+}
+
/**
* Request flags
*
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 93f07465e5a6..aa016e4b8bec 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -1082,7 +1082,7 @@ static ssize_t gfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
* the first place, mapping->nr_pages will always be zero.
*/
if (mapping->nrpages) {
- loff_t lstart = offset & (PAGE_CACHE_SIZE - 1);
+ loff_t lstart = offset & ~(PAGE_CACHE_SIZE - 1);
loff_t len = iov_iter_count(iter);
loff_t end = PAGE_ALIGN(offset + len) - 1;
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 6a92592304fb..4a01f30e9995 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -798,7 +798,7 @@ static int get_first_leaf(struct gfs2_inode *dip, u32 index,
int error;
error = get_leaf_nr(dip, index, &leaf_no);
- if (!error)
+ if (!IS_ERR_VALUE(error))
error = get_leaf(dip, leaf_no, bh_out);
return error;
@@ -1014,7 +1014,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
index = name->hash >> (32 - dip->i_depth);
error = get_leaf_nr(dip, index, &leaf_no);
- if (error)
+ if (IS_ERR_VALUE(error))
return error;
/* Get the old leaf block */
@@ -1660,7 +1660,7 @@ struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name,
brelse(bh);
if (fail_on_exist)
return ERR_PTR(-EEXIST);
- inode = gfs2_inode_lookup(dir->i_sb, dtype, addr, formal_ino, 0);
+ inode = gfs2_inode_lookup(dir->i_sb, dtype, addr, formal_ino);
if (!IS_ERR(inode))
GFS2_I(inode)->i_rahead = rahead;
return inode;
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index 5d15e9498b48..d5bda8513457 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -137,7 +137,7 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
struct gfs2_sbd *sdp = sb->s_fs_info;
struct inode *inode;
- inode = gfs2_ilookup(sb, inum->no_addr, 0);
+ inode = gfs2_ilookup(sb, inum->no_addr);
if (inode) {
if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) {
iput(inode);
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index a4ff7b56f5cd..6539131c52a2 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -572,17 +572,24 @@ static void delete_work_func(struct work_struct *work)
struct inode *inode;
u64 no_addr = gl->gl_name.ln_number;
+ /* If someone's using this glock to create a new dinode, the block must
+ have been freed by another node, then re-used, in which case our
+ iopen callback is too late after the fact. Ignore it. */
+ if (test_bit(GLF_INODE_CREATING, &gl->gl_flags))
+ goto out;
+
ip = gl->gl_object;
/* Note: Unsafe to dereference ip as we don't hold right refs/locks */
if (ip)
- inode = gfs2_ilookup(sdp->sd_vfs, no_addr, 1);
+ inode = gfs2_ilookup(sdp->sd_vfs, no_addr);
else
inode = gfs2_lookup_by_inum(sdp, no_addr, NULL, GFS2_BLKST_UNLINKED);
if (inode && !IS_ERR(inode)) {
d_prune_aliases(inode);
iput(inode);
}
+out:
gfs2_glock_put(gl);
}
@@ -1015,6 +1022,7 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
handle_callback(gl, LM_ST_UNLOCKED, 0, false);
list_del_init(&gh->gh_list);
+ clear_bit(HIF_HOLDER, &gh->gh_iflags);
if (find_first_holder(gl) == NULL) {
if (glops->go_unlock) {
GLOCK_BUG_ON(gl, test_and_set_bit(GLF_LOCK, &gl->gl_flags));
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 845fb09cc606..a6a3389a07fc 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -328,6 +328,7 @@ enum {
GLF_LRU = 13,
GLF_OBJECT = 14, /* Used only for tracing */
GLF_BLOCKING = 15,
+ GLF_INODE_CREATING = 16, /* Inode creation occurring */
};
struct gfs2_glock {
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 352f958769e1..bb30f9a72c65 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -37,61 +37,9 @@
#include "super.h"
#include "glops.h"
-struct gfs2_skip_data {
- u64 no_addr;
- int skipped;
- int non_block;
-};
-
-static int iget_test(struct inode *inode, void *opaque)
-{
- struct gfs2_inode *ip = GFS2_I(inode);
- struct gfs2_skip_data *data = opaque;
-
- if (ip->i_no_addr == data->no_addr) {
- if (data->non_block &&
- inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)) {
- data->skipped = 1;
- return 0;
- }
- return 1;
- }
- return 0;
-}
-
-static int iget_set(struct inode *inode, void *opaque)
-{
- struct gfs2_inode *ip = GFS2_I(inode);
- struct gfs2_skip_data *data = opaque;
-
- if (data->skipped)
- return -ENOENT;
- inode->i_ino = (unsigned long)(data->no_addr);
- ip->i_no_addr = data->no_addr;
- return 0;
-}
-
-struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr, int non_block)
+struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr)
{
- unsigned long hash = (unsigned long)no_addr;
- struct gfs2_skip_data data;
-
- data.no_addr = no_addr;
- data.skipped = 0;
- data.non_block = non_block;
- return ilookup5(sb, hash, iget_test, &data);
-}
-
-static struct inode *gfs2_iget(struct super_block *sb, u64 no_addr,
- int non_block)
-{
- struct gfs2_skip_data data;
- unsigned long hash = (unsigned long)no_addr;
-
- data.no_addr = no_addr;
- data.skipped = 0;
- data.non_block = non_block;
- return iget5_locked(sb, hash, iget_test, iget_set, &data);
+ return ilookup(sb, (unsigned long)no_addr);
}
/**
@@ -132,21 +80,21 @@ static void gfs2_set_iop(struct inode *inode)
* @sb: The super block
* @no_addr: The inode number
* @type: The type of the inode
- * non_block: Can we block on inodes that are being freed?
*
* Returns: A VFS inode, or an error
*/
struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
- u64 no_addr, u64 no_formal_ino, int non_block)
+ u64 no_addr, u64 no_formal_ino)
{
struct inode *inode;
struct gfs2_inode *ip;
struct gfs2_glock *io_gl = NULL;
int error;
- inode = gfs2_iget(sb, no_addr, non_block);
+ inode = iget_locked(sb, (unsigned long)no_addr);
ip = GFS2_I(inode);
+ ip->i_no_addr = no_addr;
if (!inode)
return ERR_PTR(-ENOMEM);
@@ -221,7 +169,7 @@ struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
if (error)
goto fail;
- inode = gfs2_inode_lookup(sb, DT_UNKNOWN, no_addr, 0, 1);
+ inode = gfs2_inode_lookup(sb, DT_UNKNOWN, no_addr, 0);
if (IS_ERR(inode))
goto fail;
@@ -592,7 +540,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
struct inode *inode = NULL;
struct gfs2_inode *dip = GFS2_I(dir), *ip;
struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
- struct gfs2_glock *io_gl;
+ struct gfs2_glock *io_gl = NULL;
int error, free_vfs_inode = 1;
u32 aflags = 0;
unsigned blocks = 1;
@@ -729,6 +677,8 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
if (error)
goto fail_gunlock2;
+ BUG_ON(test_and_set_bit(GLF_INODE_CREATING, &io_gl->gl_flags));
+
error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh);
if (error)
goto fail_gunlock2;
@@ -771,12 +721,15 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
}
gfs2_glock_dq_uninit(ghs);
gfs2_glock_dq_uninit(ghs + 1);
+ clear_bit(GLF_INODE_CREATING, &io_gl->gl_flags);
return error;
fail_gunlock3:
gfs2_glock_dq_uninit(&ip->i_iopen_gh);
gfs2_glock_put(io_gl);
fail_gunlock2:
+ if (io_gl)
+ clear_bit(GLF_INODE_CREATING, &io_gl->gl_flags);
gfs2_glock_dq_uninit(ghs + 1);
fail_free_inode:
if (ip->i_gl)
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index ba4d9492d422..e1af0d4aa308 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -94,12 +94,11 @@ err:
}
extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type,
- u64 no_addr, u64 no_formal_ino,
- int non_block);
+ u64 no_addr, u64 no_formal_ino);
extern struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
u64 *no_formal_ino,
unsigned int blktype);
-extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr, int nonblock);
+extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr);
extern int gfs2_inode_refresh(struct gfs2_inode *ip);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index dbed9e243ea2..49b0bff18fe3 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -454,7 +454,7 @@ static int gfs2_lookup_root(struct super_block *sb, struct dentry **dptr,
struct dentry *dentry;
struct inode *inode;
- inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0, 0);
+ inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0);
if (IS_ERR(inode)) {
fs_err(sdp, "can't read in %s inode: %ld\n", name, PTR_ERR(inode));
return PTR_ERR(inode);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 8f960a51a9a0..f8a0cd821290 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1551,12 +1551,16 @@ static void gfs2_evict_inode(struct inode *inode)
goto out_truncate;
}
- ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
- gfs2_glock_dq_wait(&ip->i_iopen_gh);
- gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh);
- error = gfs2_glock_nq(&ip->i_iopen_gh);
- if (error)
- goto out_truncate;
+ if (ip->i_iopen_gh.gh_gl &&
+ test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) {
+ ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
+ gfs2_glock_dq_wait(&ip->i_iopen_gh);
+ gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE,
+ &ip->i_iopen_gh);
+ error = gfs2_glock_nq(&ip->i_iopen_gh);
+ if (error)
+ goto out_truncate;
+ }
/* Case 1 starts here */
@@ -1606,11 +1610,13 @@ out_unlock:
if (gfs2_rs_active(&ip->i_res))
gfs2_rs_deltree(&ip->i_res);
- if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) {
- ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
- gfs2_glock_dq_wait(&ip->i_iopen_gh);
+ if (ip->i_iopen_gh.gh_gl) {
+ if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) {
+ ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
+ gfs2_glock_dq_wait(&ip->i_iopen_gh);
+ }
+ gfs2_holder_uninit(&ip->i_iopen_gh);
}
- gfs2_holder_uninit(&ip->i_iopen_gh);
gfs2_glock_dq_uninit(&gh);
if (error && error != GLR_TRYFAILED && error != -EROFS)
fs_warn(sdp, "gfs2_evict_inode: %d\n", error);
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 36345fefa3ff..517f2de784cf 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -131,14 +131,12 @@ static int journal_submit_commit_record(journal_t *journal,
if (is_journal_aborted(journal))
return 0;
- bh = jbd2_journal_get_descriptor_buffer(journal);
+ bh = jbd2_journal_get_descriptor_buffer(commit_transaction,
+ JBD2_COMMIT_BLOCK);
if (!bh)
return 1;
tmp = (struct commit_header *)bh->b_data;
- tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
- tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
- tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
@@ -222,7 +220,7 @@ static int journal_submit_data_buffers(journal_t *journal,
spin_lock(&journal->j_list_lock);
list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
mapping = jinode->i_vfs_inode->i_mapping;
- set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
+ jinode->i_flags |= JI_COMMIT_RUNNING;
spin_unlock(&journal->j_list_lock);
/*
* submit the inode data buffers. We use writepage
@@ -236,8 +234,8 @@ static int journal_submit_data_buffers(journal_t *journal,
ret = err;
spin_lock(&journal->j_list_lock);
J_ASSERT(jinode->i_transaction == commit_transaction);
- clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
- smp_mb__after_atomic();
+ jinode->i_flags &= ~JI_COMMIT_RUNNING;
+ smp_mb();
wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
}
spin_unlock(&journal->j_list_lock);
@@ -258,7 +256,7 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
/* For locking, see the comment in journal_submit_data_buffers() */
spin_lock(&journal->j_list_lock);
list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
- set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
+ jinode->i_flags |= JI_COMMIT_RUNNING;
spin_unlock(&journal->j_list_lock);
err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
if (err) {
@@ -274,8 +272,8 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
ret = err;
}
spin_lock(&journal->j_list_lock);
- clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
- smp_mb__after_atomic();
+ jinode->i_flags &= ~JI_COMMIT_RUNNING;
+ smp_mb();
wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
}
@@ -319,22 +317,6 @@ static void write_tag_block(journal_t *j, journal_block_tag_t *tag,
tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
}
-static void jbd2_descr_block_csum_set(journal_t *j,
- struct buffer_head *bh)
-{
- struct jbd2_journal_block_tail *tail;
- __u32 csum;
-
- if (!jbd2_journal_has_csum_v2or3(j))
- return;
-
- tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize -
- sizeof(struct jbd2_journal_block_tail));
- tail->t_checksum = 0;
- csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
- tail->t_checksum = cpu_to_be32(csum);
-}
-
static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
struct buffer_head *bh, __u32 sequence)
{
@@ -379,7 +361,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
ktime_t start_time;
u64 commit_time;
char *tagp = NULL;
- journal_header_t *header;
journal_block_tag_t *tag = NULL;
int space_left = 0;
int first_tag = 0;
@@ -554,8 +535,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
jbd2_journal_abort(journal, err);
blk_start_plug(&plug);
- jbd2_journal_write_revoke_records(journal, commit_transaction,
- &log_bufs, WRITE_SYNC);
+ jbd2_journal_write_revoke_records(commit_transaction, &log_bufs);
jbd_debug(3, "JBD2: commit phase 2b\n");
@@ -616,7 +596,9 @@ void jbd2_journal_commit_transaction(journal_t *journal)
jbd_debug(4, "JBD2: get descriptor\n");
- descriptor = jbd2_journal_get_descriptor_buffer(journal);
+ descriptor = jbd2_journal_get_descriptor_buffer(
+ commit_transaction,
+ JBD2_DESCRIPTOR_BLOCK);
if (!descriptor) {
jbd2_journal_abort(journal, -EIO);
continue;
@@ -625,11 +607,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
jbd_debug(4, "JBD2: got buffer %llu (%p)\n",
(unsigned long long)descriptor->b_blocknr,
descriptor->b_data);
- header = (journal_header_t *)descriptor->b_data;
- header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
- header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
- header->h_sequence = cpu_to_be32(commit_transaction->t_tid);
-
tagp = &descriptor->b_data[sizeof(journal_header_t)];
space_left = descriptor->b_size -
sizeof(journal_header_t);
@@ -721,7 +698,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG);
- jbd2_descr_block_csum_set(journal, descriptor);
+ jbd2_descriptor_block_csum_set(journal, descriptor);
start_journal_io:
for (i = 0; i < bufs; i++) {
struct buffer_head *bh = wbuf[i];
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 81e622681c82..de73a9516a54 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -805,10 +805,13 @@ int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr,
* But we don't bother doing that, so there will be coherency problems with
* mmaps of blockdevs which hold live JBD-controlled filesystems.
*/
-struct buffer_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)
+struct buffer_head *
+jbd2_journal_get_descriptor_buffer(transaction_t *transaction, int type)
{
+ journal_t *journal = transaction->t_journal;
struct buffer_head *bh;
unsigned long long blocknr;
+ journal_header_t *header;
int err;
err = jbd2_journal_next_log_block(journal, &blocknr);
@@ -821,12 +824,31 @@ struct buffer_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)
return NULL;
lock_buffer(bh);
memset(bh->b_data, 0, journal->j_blocksize);
+ header = (journal_header_t *)bh->b_data;
+ header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
+ header->h_blocktype = cpu_to_be32(type);
+ header->h_sequence = cpu_to_be32(transaction->t_tid);
set_buffer_uptodate(bh);
unlock_buffer(bh);
BUFFER_TRACE(bh, "return this buffer");
return bh;
}
+void jbd2_descriptor_block_csum_set(journal_t *j, struct buffer_head *bh)
+{
+ struct jbd2_journal_block_tail *tail;
+ __u32 csum;
+
+ if (!jbd2_journal_has_csum_v2or3(j))
+ return;
+
+ tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize -
+ sizeof(struct jbd2_journal_block_tail));
+ tail->t_checksum = 0;
+ csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
+ tail->t_checksum = cpu_to_be32(csum);
+}
+
/*
* Return tid of the oldest transaction in the journal and block in the journal
* where the transaction starts.
@@ -1408,11 +1430,12 @@ out:
/**
* jbd2_mark_journal_empty() - Mark on disk journal as empty.
* @journal: The journal to update.
+ * @write_op: With which operation should we write the journal sb
*
* Update a journal's dynamic superblock fields to show that journal is empty.
* Write updated superblock to disk waiting for IO to complete.
*/
-static void jbd2_mark_journal_empty(journal_t *journal)
+static void jbd2_mark_journal_empty(journal_t *journal, int write_op)
{
journal_superblock_t *sb = journal->j_superblock;
@@ -1430,7 +1453,7 @@ static void jbd2_mark_journal_empty(journal_t *journal)
sb->s_start = cpu_to_be32(0);
read_unlock(&journal->j_state_lock);
- jbd2_write_superblock(journal, WRITE_FUA);
+ jbd2_write_superblock(journal, write_op);
/* Log is no longer empty */
write_lock(&journal->j_state_lock);
@@ -1716,7 +1739,13 @@ int jbd2_journal_destroy(journal_t *journal)
if (journal->j_sb_buffer) {
if (!is_journal_aborted(journal)) {
mutex_lock(&journal->j_checkpoint_mutex);
- jbd2_mark_journal_empty(journal);
+
+ write_lock(&journal->j_state_lock);
+ journal->j_tail_sequence =
+ ++journal->j_transaction_sequence;
+ write_unlock(&journal->j_state_lock);
+
+ jbd2_mark_journal_empty(journal, WRITE_FLUSH_FUA);
mutex_unlock(&journal->j_checkpoint_mutex);
} else
err = -EIO;
@@ -1975,7 +2004,7 @@ int jbd2_journal_flush(journal_t *journal)
* the magic code for a fully-recovered superblock. Any future
* commits of data to the journal will restore the current
* s_start value. */
- jbd2_mark_journal_empty(journal);
+ jbd2_mark_journal_empty(journal, WRITE_FUA);
mutex_unlock(&journal->j_checkpoint_mutex);
write_lock(&journal->j_state_lock);
J_ASSERT(!journal->j_running_transaction);
@@ -2021,7 +2050,7 @@ int jbd2_journal_wipe(journal_t *journal, int write)
if (write) {
/* Lock to make assertions happy... */
mutex_lock(&journal->j_checkpoint_mutex);
- jbd2_mark_journal_empty(journal);
+ jbd2_mark_journal_empty(journal, WRITE_FUA);
mutex_unlock(&journal->j_checkpoint_mutex);
}
@@ -2565,7 +2594,7 @@ void jbd2_journal_release_jbd_inode(journal_t *journal,
restart:
spin_lock(&journal->j_list_lock);
/* Is commit writing out inode - we have to wait */
- if (test_bit(__JI_COMMIT_RUNNING, &jinode->i_flags)) {
+ if (jinode->i_flags & JI_COMMIT_RUNNING) {
wait_queue_head_t *wq;
DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING);
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 7f277e49fe88..08a456b96e4e 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -174,8 +174,7 @@ static int jread(struct buffer_head **bhp, journal_t *journal,
return 0;
}
-static int jbd2_descr_block_csum_verify(journal_t *j,
- void *buf)
+static int jbd2_descriptor_block_csum_verify(journal_t *j, void *buf)
{
struct jbd2_journal_block_tail *tail;
__be32 provided;
@@ -522,8 +521,8 @@ static int do_one_pass(journal_t *journal,
descr_csum_size =
sizeof(struct jbd2_journal_block_tail);
if (descr_csum_size > 0 &&
- !jbd2_descr_block_csum_verify(journal,
- bh->b_data)) {
+ !jbd2_descriptor_block_csum_verify(journal,
+ bh->b_data)) {
printk(KERN_ERR "JBD2: Invalid checksum "
"recovering block %lu in log\n",
next_log_block);
@@ -811,26 +810,6 @@ static int do_one_pass(journal_t *journal,
return err;
}
-static int jbd2_revoke_block_csum_verify(journal_t *j,
- void *buf)
-{
- struct jbd2_journal_revoke_tail *tail;
- __be32 provided;
- __u32 calculated;
-
- if (!jbd2_journal_has_csum_v2or3(j))
- return 1;
-
- tail = (struct jbd2_journal_revoke_tail *)(buf + j->j_blocksize -
- sizeof(struct jbd2_journal_revoke_tail));
- provided = tail->r_checksum;
- tail->r_checksum = 0;
- calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize);
- tail->r_checksum = provided;
-
- return provided == cpu_to_be32(calculated);
-}
-
/* Scan a revoke record, marking all blocks mentioned as revoked. */
static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
@@ -846,11 +825,11 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
offset = sizeof(jbd2_journal_revoke_header_t);
rcount = be32_to_cpu(header->r_count);
- if (!jbd2_revoke_block_csum_verify(journal, header))
+ if (!jbd2_descriptor_block_csum_verify(journal, header))
return -EFSBADCRC;
if (jbd2_journal_has_csum_v2or3(journal))
- csum_size = sizeof(struct jbd2_journal_revoke_tail);
+ csum_size = sizeof(struct jbd2_journal_block_tail);
if (rcount > journal->j_blocksize - csum_size)
return -EINVAL;
max = rcount;
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index 705ae577882b..91171dc352cb 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -122,11 +122,11 @@ struct jbd2_revoke_table_s
#ifdef __KERNEL__
-static void write_one_revoke_record(journal_t *, transaction_t *,
+static void write_one_revoke_record(transaction_t *,
struct list_head *,
struct buffer_head **, int *,
- struct jbd2_revoke_record_s *, int);
-static void flush_descriptor(journal_t *, struct buffer_head *, int, int);
+ struct jbd2_revoke_record_s *);
+static void flush_descriptor(journal_t *, struct buffer_head *, int);
#endif
/* Utility functions to maintain the revoke table */
@@ -519,11 +519,10 @@ void jbd2_journal_switch_revoke_table(journal_t *journal)
* Write revoke records to the journal for all entries in the current
* revoke hash, deleting the entries as we go.
*/
-void jbd2_journal_write_revoke_records(journal_t *journal,
- transaction_t *transaction,
- struct list_head *log_bufs,
- int write_op)
+void jbd2_journal_write_revoke_records(transaction_t *transaction,
+ struct list_head *log_bufs)
{
+ journal_t *journal = transaction->t_journal;
struct buffer_head *descriptor;
struct jbd2_revoke_record_s *record;
struct jbd2_revoke_table_s *revoke;
@@ -544,16 +543,15 @@ void jbd2_journal_write_revoke_records(journal_t *journal,
while (!list_empty(hash_list)) {
record = (struct jbd2_revoke_record_s *)
hash_list->next;
- write_one_revoke_record(journal, transaction, log_bufs,
- &descriptor, &offset,
- record, write_op);
+ write_one_revoke_record(transaction, log_bufs,
+ &descriptor, &offset, record);
count++;
list_del(&record->hash);
kmem_cache_free(jbd2_revoke_record_cache, record);
}
}
if (descriptor)
- flush_descriptor(journal, descriptor, offset, write_op);
+ flush_descriptor(journal, descriptor, offset);
jbd_debug(1, "Wrote %d revoke records\n", count);
}
@@ -562,18 +560,16 @@ void jbd2_journal_write_revoke_records(journal_t *journal,
* block if the old one is full or if we have not already created one.
*/
-static void write_one_revoke_record(journal_t *journal,
- transaction_t *transaction,
+static void write_one_revoke_record(transaction_t *transaction,
struct list_head *log_bufs,
struct buffer_head **descriptorp,
int *offsetp,
- struct jbd2_revoke_record_s *record,
- int write_op)
+ struct jbd2_revoke_record_s *record)
{
+ journal_t *journal = transaction->t_journal;
int csum_size = 0;
struct buffer_head *descriptor;
int sz, offset;
- journal_header_t *header;
/* If we are already aborting, this all becomes a noop. We
still need to go round the loop in
@@ -587,7 +583,7 @@ static void write_one_revoke_record(journal_t *journal,
/* Do we need to leave space at the end for a checksum? */
if (jbd2_journal_has_csum_v2or3(journal))
- csum_size = sizeof(struct jbd2_journal_revoke_tail);
+ csum_size = sizeof(struct jbd2_journal_block_tail);
if (jbd2_has_feature_64bit(journal))
sz = 8;
@@ -597,19 +593,16 @@ static void write_one_revoke_record(journal_t *journal,
/* Make sure we have a descriptor with space left for the record */
if (descriptor) {
if (offset + sz > journal->j_blocksize - csum_size) {
- flush_descriptor(journal, descriptor, offset, write_op);
+ flush_descriptor(journal, descriptor, offset);
descriptor = NULL;
}
}
if (!descriptor) {
- descriptor = jbd2_journal_get_descriptor_buffer(journal);
+ descriptor = jbd2_journal_get_descriptor_buffer(transaction,
+ JBD2_REVOKE_BLOCK);
if (!descriptor)
return;
- header = (journal_header_t *)descriptor->b_data;
- header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
- header->h_blocktype = cpu_to_be32(JBD2_REVOKE_BLOCK);
- header->h_sequence = cpu_to_be32(transaction->t_tid);
/* Record it so that we can wait for IO completion later */
BUFFER_TRACE(descriptor, "file in log_bufs");
@@ -630,21 +623,6 @@ static void write_one_revoke_record(journal_t *journal,
*offsetp = offset;
}
-static void jbd2_revoke_csum_set(journal_t *j, struct buffer_head *bh)
-{
- struct jbd2_journal_revoke_tail *tail;
- __u32 csum;
-
- if (!jbd2_journal_has_csum_v2or3(j))
- return;
-
- tail = (struct jbd2_journal_revoke_tail *)(bh->b_data + j->j_blocksize -
- sizeof(struct jbd2_journal_revoke_tail));
- tail->r_checksum = 0;
- csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
- tail->r_checksum = cpu_to_be32(csum);
-}
-
/*
* Flush a revoke descriptor out to the journal. If we are aborting,
* this is a noop; otherwise we are generating a buffer which needs to
@@ -654,7 +632,7 @@ static void jbd2_revoke_csum_set(journal_t *j, struct buffer_head *bh)
static void flush_descriptor(journal_t *journal,
struct buffer_head *descriptor,
- int offset, int write_op)
+ int offset)
{
jbd2_journal_revoke_header_t *header;
@@ -665,12 +643,12 @@ static void flush_descriptor(journal_t *journal,
header = (jbd2_journal_revoke_header_t *)descriptor->b_data;
header->r_count = cpu_to_be32(offset);
- jbd2_revoke_csum_set(journal, descriptor);
+ jbd2_descriptor_block_csum_set(journal, descriptor);
set_buffer_jwrite(descriptor);
BUFFER_TRACE(descriptor, "write");
set_buffer_dirty(descriptor);
- write_dirty_buffer(descriptor, write_op);
+ write_dirty_buffer(descriptor, WRITE_SYNC);
}
#endif
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 081dff087fc0..01e4652d88f6 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -966,14 +966,8 @@ repeat:
if (!frozen_buffer) {
JBUFFER_TRACE(jh, "allocate memory for buffer");
jbd_unlock_bh_state(bh);
- frozen_buffer = jbd2_alloc(jh2bh(jh)->b_size, GFP_NOFS);
- if (!frozen_buffer) {
- printk(KERN_ERR "%s: OOM for frozen_buffer\n",
- __func__);
- JBUFFER_TRACE(jh, "oom!");
- error = -ENOMEM;
- goto out;
- }
+ frozen_buffer = jbd2_alloc(jh2bh(jh)->b_size,
+ GFP_NOFS | __GFP_NOFAIL);
goto repeat;
}
jh->b_frozen_data = frozen_buffer;
@@ -1226,15 +1220,9 @@ int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh)
goto out;
repeat:
- if (!jh->b_committed_data) {
- committed_data = jbd2_alloc(jh2bh(jh)->b_size, GFP_NOFS);
- if (!committed_data) {
- printk(KERN_ERR "%s: No memory for committed data\n",
- __func__);
- err = -ENOMEM;
- goto out;
- }
- }
+ if (!jh->b_committed_data)
+ committed_data = jbd2_alloc(jh2bh(jh)->b_size,
+ GFP_NOFS|__GFP_NOFAIL);
jbd_lock_bh_state(bh);
if (!jh->b_committed_data) {
diff --git a/fs/jffs2/README.Locking b/fs/jffs2/README.Locking
index 3ea36554107f..8918ac905a3b 100644
--- a/fs/jffs2/README.Locking
+++ b/fs/jffs2/README.Locking
@@ -2,10 +2,6 @@
JFFS2 LOCKING DOCUMENTATION
---------------------------
-At least theoretically, JFFS2 does not require the Big Kernel Lock
-(BKL), which was always helpfully obtained for it by Linux 2.4 VFS
-code. It has its own locking, as described below.
-
This document attempts to describe the existing locking rules for
JFFS2. It is not expected to remain perfectly up to date, but ought to
be fairly close.
@@ -69,6 +65,7 @@ Ordering constraints:
any f->sem held.
2. Never attempt to lock two file mutexes in one thread.
No ordering rules have been made for doing so.
+ 3. Never lock a page cache page with f->sem held.
erase_completion_lock spinlock
diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c
index 0ae91ad6df2d..b288c8ae1236 100644
--- a/fs/jffs2/build.c
+++ b/fs/jffs2/build.c
@@ -50,7 +50,8 @@ next_inode(int *i, struct jffs2_inode_cache *ic, struct jffs2_sb_info *c)
static void jffs2_build_inode_pass1(struct jffs2_sb_info *c,
- struct jffs2_inode_cache *ic)
+ struct jffs2_inode_cache *ic,
+ int *dir_hardlinks)
{
struct jffs2_full_dirent *fd;
@@ -69,19 +70,21 @@ static void jffs2_build_inode_pass1(struct jffs2_sb_info *c,
dbg_fsbuild("child \"%s\" (ino #%u) of dir ino #%u doesn't exist!\n",
fd->name, fd->ino, ic->ino);
jffs2_mark_node_obsolete(c, fd->raw);
+ /* Clear the ic/raw union so it doesn't cause problems later. */
+ fd->ic = NULL;
continue;
}
+ /* From this point, fd->raw is no longer used so we can set fd->ic */
+ fd->ic = child_ic;
+ child_ic->pino_nlink++;
+ /* If we appear (at this stage) to have hard-linked directories,
+ * set a flag to trigger a scan later */
if (fd->type == DT_DIR) {
- if (child_ic->pino_nlink) {
- JFFS2_ERROR("child dir \"%s\" (ino #%u) of dir ino #%u appears to be a hard link\n",
- fd->name, fd->ino, ic->ino);
- /* TODO: What do we do about it? */
- } else {
- child_ic->pino_nlink = ic->ino;
- }
- } else
- child_ic->pino_nlink++;
+ child_ic->flags |= INO_FLAGS_IS_DIR;
+ if (child_ic->pino_nlink > 1)
+ *dir_hardlinks = 1;
+ }
dbg_fsbuild("increased nlink for child \"%s\" (ino #%u)\n", fd->name, fd->ino);
/* Can't free scan_dents so far. We might need them in pass 2 */
@@ -95,8 +98,7 @@ static void jffs2_build_inode_pass1(struct jffs2_sb_info *c,
*/
static int jffs2_build_filesystem(struct jffs2_sb_info *c)
{
- int ret;
- int i;
+ int ret, i, dir_hardlinks = 0;
struct jffs2_inode_cache *ic;
struct jffs2_full_dirent *fd;
struct jffs2_full_dirent *dead_fds = NULL;
@@ -120,7 +122,7 @@ static int jffs2_build_filesystem(struct jffs2_sb_info *c)
/* Now scan the directory tree, increasing nlink according to every dirent found. */
for_each_inode(i, c, ic) {
if (ic->scan_dents) {
- jffs2_build_inode_pass1(c, ic);
+ jffs2_build_inode_pass1(c, ic, &dir_hardlinks);
cond_resched();
}
}
@@ -156,6 +158,20 @@ static int jffs2_build_filesystem(struct jffs2_sb_info *c)
}
dbg_fsbuild("pass 2a complete\n");
+
+ if (dir_hardlinks) {
+ /* If we detected directory hardlinks earlier, *hopefully*
+ * they are gone now because some of the links were from
+ * dead directories which still had some old dirents lying
+ * around and not yet garbage-collected, but which have
+ * been discarded above. So clear the pino_nlink field
+ * in each directory, so that the final scan below can
+ * print appropriate warnings. */
+ for_each_inode(i, c, ic) {
+ if (ic->flags & INO_FLAGS_IS_DIR)
+ ic->pino_nlink = 0;
+ }
+ }
dbg_fsbuild("freeing temporary data structures\n");
/* Finally, we can scan again and free the dirent structs */
@@ -163,6 +179,33 @@ static int jffs2_build_filesystem(struct jffs2_sb_info *c)
while(ic->scan_dents) {
fd = ic->scan_dents;
ic->scan_dents = fd->next;
+ /* We do use the pino_nlink field to count nlink of
+ * directories during fs build, so set it to the
+ * parent ino# now. Now that there's hopefully only
+ * one. */
+ if (fd->type == DT_DIR) {
+ if (!fd->ic) {
+ /* We'll have complained about it and marked the coresponding
+ raw node obsolete already. Just skip it. */
+ continue;
+ }
+
+ /* We *have* to have set this in jffs2_build_inode_pass1() */
+ BUG_ON(!(fd->ic->flags & INO_FLAGS_IS_DIR));
+
+ /* We clear ic->pino_nlink ∀ directories' ic *only* if dir_hardlinks
+ * is set. Otherwise, we know this should never trigger anyway, so
+ * we don't do the check. And ic->pino_nlink still contains the nlink
+ * value (which is 1). */
+ if (dir_hardlinks && fd->ic->pino_nlink) {
+ JFFS2_ERROR("child dir \"%s\" (ino #%u) of dir ino #%u is also hard linked from dir ino #%u\n",
+ fd->name, fd->ino, ic->ino, fd->ic->pino_nlink);
+ /* Should we unlink it from its previous parent? */
+ }
+
+ /* For directories, ic->pino_nlink holds that parent inode # */
+ fd->ic->pino_nlink = ic->ino;
+ }
jffs2_free_full_dirent(fd);
}
ic->scan_dents = NULL;
@@ -241,11 +284,7 @@ static void jffs2_build_remove_unlinked_inode(struct jffs2_sb_info *c,
/* Reduce nlink of the child. If it's now zero, stick it on the
dead_fds list to be cleaned up later. Else just free the fd */
-
- if (fd->type == DT_DIR)
- child_ic->pino_nlink = 0;
- else
- child_ic->pino_nlink--;
+ child_ic->pino_nlink--;
if (!child_ic->pino_nlink) {
dbg_fsbuild("inode #%u (\"%s\") now has no links; adding to dead_fds list.\n",
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index d211b8e18566..30c4c9ebb693 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -843,9 +843,14 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
pr_notice("%s(): Link succeeded, unlink failed (err %d). You now have a hard link\n",
__func__, ret);
- /* Might as well let the VFS know */
- d_instantiate(new_dentry, d_inode(old_dentry));
- ihold(d_inode(old_dentry));
+ /*
+ * We can't keep the target in dcache after that.
+ * For one thing, we can't afford dentry aliases for directories.
+ * For another, if there was a victim, we _can't_ set new inode
+ * for that sucker and we have to trigger mount eviction - the
+ * caller won't do it on its own since we are returning an error.
+ */
+ d_invalidate(new_dentry);
new_dir_i->i_mtime = new_dir_i->i_ctime = ITIME(now);
return ret;
}
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index c5ac5944bc1b..cad86bac3453 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -137,39 +137,33 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
struct page *pg;
struct inode *inode = mapping->host;
struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
- struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
- struct jffs2_raw_inode ri;
- uint32_t alloc_len = 0;
pgoff_t index = pos >> PAGE_CACHE_SHIFT;
uint32_t pageofs = index << PAGE_CACHE_SHIFT;
int ret = 0;
- jffs2_dbg(1, "%s()\n", __func__);
-
- if (pageofs > inode->i_size) {
- ret = jffs2_reserve_space(c, sizeof(ri), &alloc_len,
- ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE);
- if (ret)
- return ret;
- }
-
- mutex_lock(&f->sem);
pg = grab_cache_page_write_begin(mapping, index, flags);
- if (!pg) {
- if (alloc_len)
- jffs2_complete_reservation(c);
- mutex_unlock(&f->sem);
+ if (!pg)
return -ENOMEM;
- }
*pagep = pg;
- if (alloc_len) {
+ jffs2_dbg(1, "%s()\n", __func__);
+
+ if (pageofs > inode->i_size) {
/* Make new hole frag from old EOF to new page */
+ struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
+ struct jffs2_raw_inode ri;
struct jffs2_full_dnode *fn;
+ uint32_t alloc_len;
jffs2_dbg(1, "Writing new hole frag 0x%x-0x%x between current EOF and new page\n",
(unsigned int)inode->i_size, pageofs);
+ ret = jffs2_reserve_space(c, sizeof(ri), &alloc_len,
+ ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE);
+ if (ret)
+ goto out_page;
+
+ mutex_lock(&f->sem);
memset(&ri, 0, sizeof(ri));
ri.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK);
@@ -196,6 +190,7 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
if (IS_ERR(fn)) {
ret = PTR_ERR(fn);
jffs2_complete_reservation(c);
+ mutex_unlock(&f->sem);
goto out_page;
}
ret = jffs2_add_full_dnode_to_inode(c, f, fn);
@@ -210,10 +205,12 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
jffs2_mark_node_obsolete(c, fn->raw);
jffs2_free_full_dnode(fn);
jffs2_complete_reservation(c);
+ mutex_unlock(&f->sem);
goto out_page;
}
jffs2_complete_reservation(c);
inode->i_size = pageofs;
+ mutex_unlock(&f->sem);
}
/*
@@ -222,18 +219,18 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
* case of a short-copy.
*/
if (!PageUptodate(pg)) {
+ mutex_lock(&f->sem);
ret = jffs2_do_readpage_nolock(inode, pg);
+ mutex_unlock(&f->sem);
if (ret)
goto out_page;
}
- mutex_unlock(&f->sem);
jffs2_dbg(1, "end write_begin(). pg->flags %lx\n", pg->flags);
return ret;
out_page:
unlock_page(pg);
page_cache_release(pg);
- mutex_unlock(&f->sem);
return ret;
}
diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c
index 5a2dec2b064c..7e553f286775 100644
--- a/fs/jffs2/gc.c
+++ b/fs/jffs2/gc.c
@@ -134,37 +134,59 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
if (mutex_lock_interruptible(&c->alloc_sem))
return -EINTR;
+
for (;;) {
+ /* We can't start doing GC until we've finished checking
+ the node CRCs etc. */
+ int bucket, want_ino;
+
spin_lock(&c->erase_completion_lock);
if (!c->unchecked_size)
break;
-
- /* We can't start doing GC yet. We haven't finished checking
- the node CRCs etc. Do it now. */
-
- /* checked_ino is protected by the alloc_sem */
- if (c->checked_ino > c->highest_ino && xattr) {
- pr_crit("Checked all inodes but still 0x%x bytes of unchecked space?\n",
- c->unchecked_size);
- jffs2_dbg_dump_block_lists_nolock(c);
- spin_unlock(&c->erase_completion_lock);
- mutex_unlock(&c->alloc_sem);
- return -ENOSPC;
- }
-
spin_unlock(&c->erase_completion_lock);
if (!xattr)
xattr = jffs2_verify_xattr(c);
spin_lock(&c->inocache_lock);
+ /* Instead of doing the inodes in numeric order, doing a lookup
+ * in the hash for each possible number, just walk the hash
+ * buckets of *existing* inodes. This means that we process
+ * them out-of-order, but it can be a lot faster if there's
+ * a sparse inode# space. Which there often is. */
+ want_ino = c->check_ino;
+ for (bucket = c->check_ino % c->inocache_hashsize ; bucket < c->inocache_hashsize; bucket++) {
+ for (ic = c->inocache_list[bucket]; ic; ic = ic->next) {
+ if (ic->ino < want_ino)
+ continue;
+
+ if (ic->state != INO_STATE_CHECKEDABSENT &&
+ ic->state != INO_STATE_PRESENT)
+ goto got_next; /* with inocache_lock held */
+
+ jffs2_dbg(1, "Skipping ino #%u already checked\n",
+ ic->ino);
+ }
+ want_ino = 0;
+ }
- ic = jffs2_get_ino_cache(c, c->checked_ino++);
+ /* Point c->check_ino past the end of the last bucket. */
+ c->check_ino = ((c->highest_ino + c->inocache_hashsize + 1) &
+ ~c->inocache_hashsize) - 1;
- if (!ic) {
- spin_unlock(&c->inocache_lock);
- continue;
- }
+ spin_unlock(&c->inocache_lock);
+
+ pr_crit("Checked all inodes but still 0x%x bytes of unchecked space?\n",
+ c->unchecked_size);
+ jffs2_dbg_dump_block_lists_nolock(c);
+ mutex_unlock(&c->alloc_sem);
+ return -ENOSPC;
+
+ got_next:
+ /* For next time round the loop, we want c->checked_ino to indicate
+ * the *next* one we want to check. And since we're walking the
+ * buckets rather than doing it sequentially, it's: */
+ c->check_ino = ic->ino + c->inocache_hashsize;
if (!ic->pino_nlink) {
jffs2_dbg(1, "Skipping check of ino #%d with nlink/pino zero\n",
@@ -176,8 +198,6 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
switch(ic->state) {
case INO_STATE_CHECKEDABSENT:
case INO_STATE_PRESENT:
- jffs2_dbg(1, "Skipping ino #%u already checked\n",
- ic->ino);
spin_unlock(&c->inocache_lock);
continue;
@@ -196,7 +216,7 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
ic->ino);
/* We need to come back again for the _same_ inode. We've
made no progress in this case, but that should be OK */
- c->checked_ino--;
+ c->check_ino = ic->ino;
mutex_unlock(&c->alloc_sem);
sleep_on_spinunlock(&c->inocache_wq, &c->inocache_lock);
@@ -1296,14 +1316,17 @@ static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_era
BUG_ON(start > orig_start);
}
- /* First, use readpage() to read the appropriate page into the page cache */
- /* Q: What happens if we actually try to GC the _same_ page for which commit_write()
- * triggered garbage collection in the first place?
- * A: I _think_ it's OK. read_cache_page shouldn't deadlock, we'll write out the
- * page OK. We'll actually write it out again in commit_write, which is a little
- * suboptimal, but at least we're correct.
- */
+ /* The rules state that we must obtain the page lock *before* f->sem, so
+ * drop f->sem temporarily. Since we also hold c->alloc_sem, nothing's
+ * actually going to *change* so we're safe; we only allow reading.
+ *
+ * It is important to note that jffs2_write_begin() will ensure that its
+ * page is marked Uptodate before allocating space. That means that if we
+ * end up here trying to GC the *same* page that jffs2_write_begin() is
+ * trying to write out, read_cache_page() will not deadlock. */
+ mutex_unlock(&f->sem);
pg_ptr = jffs2_gc_fetch_page(c, f, start, &pg);
+ mutex_lock(&f->sem);
if (IS_ERR(pg_ptr)) {
pr_warn("read_cache_page() returned error: %ld\n",
diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h
index 046fee8b6e9b..778275f48a87 100644
--- a/fs/jffs2/jffs2_fs_sb.h
+++ b/fs/jffs2/jffs2_fs_sb.h
@@ -49,7 +49,7 @@ struct jffs2_sb_info {
struct mtd_info *mtd;
uint32_t highest_ino;
- uint32_t checked_ino;
+ uint32_t check_ino; /* *NEXT* inode to be checked */
unsigned int flags;
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index fa35ff79ab35..0637271f3770 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -194,6 +194,7 @@ struct jffs2_inode_cache {
#define INO_STATE_CLEARING 6 /* In clear_inode() */
#define INO_FLAGS_XATTR_CHECKED 0x01 /* has no duplicate xattr_ref */
+#define INO_FLAGS_IS_DIR 0x02 /* is a directory */
#define RAWNODE_CLASS_INODE_CACHE 0
#define RAWNODE_CLASS_XATTR_DATUM 1
@@ -249,7 +250,10 @@ struct jffs2_readinode_info
struct jffs2_full_dirent
{
- struct jffs2_raw_node_ref *raw;
+ union {
+ struct jffs2_raw_node_ref *raw;
+ struct jffs2_inode_cache *ic; /* Just during part of build */
+ };
struct jffs2_full_dirent *next;
uint32_t version;
uint32_t ino; /* == zero for unlink */
diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c
index b6bd4affd9ad..cda0774c2c9c 100644
--- a/fs/jffs2/nodemgmt.c
+++ b/fs/jffs2/nodemgmt.c
@@ -846,8 +846,8 @@ int jffs2_thread_should_wake(struct jffs2_sb_info *c)
return 1;
if (c->unchecked_size) {
- jffs2_dbg(1, "jffs2_thread_should_wake(): unchecked_size %d, checked_ino #%d\n",
- c->unchecked_size, c->checked_ino);
+ jffs2_dbg(1, "jffs2_thread_should_wake(): unchecked_size %d, check_ino #%d\n",
+ c->unchecked_size, c->check_ino);
return 1;
}
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 5a3da3f52908..b25d28a21212 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -1183,22 +1183,20 @@ void jffs2_dirty_trigger(struct jffs2_sb_info *c)
int jffs2_nand_flash_setup(struct jffs2_sb_info *c)
{
- struct nand_ecclayout *oinfo = c->mtd->ecclayout;
-
if (!c->mtd->oobsize)
return 0;
/* Cleanmarker is out-of-band, so inline size zero */
c->cleanmarker_size = 0;
- if (!oinfo || oinfo->oobavail == 0) {
+ if (c->mtd->oobavail == 0) {
pr_err("inconsistent device description\n");
return -EINVAL;
}
jffs2_dbg(1, "using OOB on NAND\n");
- c->oobavail = oinfo->oobavail;
+ c->oobavail = c->mtd->oobavail;
/* Initialise write buffer */
init_rwsem(&c->wbuf_sem);
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 996b7742c90b..03b688d19f69 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -44,28 +44,122 @@ static int kernfs_name_locked(struct kernfs_node *kn, char *buf, size_t buflen)
return strlcpy(buf, kn->parent ? kn->name : "/", buflen);
}
-static char * __must_check kernfs_path_locked(struct kernfs_node *kn, char *buf,
- size_t buflen)
+/* kernfs_node_depth - compute depth from @from to @to */
+static size_t kernfs_depth(struct kernfs_node *from, struct kernfs_node *to)
{
- char *p = buf + buflen;
- int len;
+ size_t depth = 0;
- *--p = '\0';
+ while (to->parent && to != from) {
+ depth++;
+ to = to->parent;
+ }
+ return depth;
+}
- do {
- len = strlen(kn->name);
- if (p - buf < len + 1) {
- buf[0] = '\0';
- p = NULL;
- break;
- }
- p -= len;
- memcpy(p, kn->name, len);
- *--p = '/';
- kn = kn->parent;
- } while (kn && kn->parent);
+static struct kernfs_node *kernfs_common_ancestor(struct kernfs_node *a,
+ struct kernfs_node *b)
+{
+ size_t da, db;
+ struct kernfs_root *ra = kernfs_root(a), *rb = kernfs_root(b);
+
+ if (ra != rb)
+ return NULL;
+
+ da = kernfs_depth(ra->kn, a);
+ db = kernfs_depth(rb->kn, b);
+
+ while (da > db) {
+ a = a->parent;
+ da--;
+ }
+ while (db > da) {
+ b = b->parent;
+ db--;
+ }
+
+ /* worst case b and a will be the same at root */
+ while (b != a) {
+ b = b->parent;
+ a = a->parent;
+ }
+
+ return a;
+}
+
+/**
+ * kernfs_path_from_node_locked - find a pseudo-absolute path to @kn_to,
+ * where kn_from is treated as root of the path.
+ * @kn_from: kernfs node which should be treated as root for the path
+ * @kn_to: kernfs node to which path is needed
+ * @buf: buffer to copy the path into
+ * @buflen: size of @buf
+ *
+ * We need to handle couple of scenarios here:
+ * [1] when @kn_from is an ancestor of @kn_to at some level
+ * kn_from: /n1/n2/n3
+ * kn_to: /n1/n2/n3/n4/n5
+ * result: /n4/n5
+ *
+ * [2] when @kn_from is on a different hierarchy and we need to find common
+ * ancestor between @kn_from and @kn_to.
+ * kn_from: /n1/n2/n3/n4
+ * kn_to: /n1/n2/n5
+ * result: /../../n5
+ * OR
+ * kn_from: /n1/n2/n3/n4/n5 [depth=5]
+ * kn_to: /n1/n2/n3 [depth=3]
+ * result: /../..
+ *
+ * return value: length of the string. If greater than buflen,
+ * then contents of buf are undefined. On error, -1 is returned.
+ */
+static int kernfs_path_from_node_locked(struct kernfs_node *kn_to,
+ struct kernfs_node *kn_from,
+ char *buf, size_t buflen)
+{
+ struct kernfs_node *kn, *common;
+ const char parent_str[] = "/..";
+ size_t depth_from, depth_to, len = 0, nlen = 0;
+ char *p;
+ int i;
+
+ if (!kn_from)
+ kn_from = kernfs_root(kn_to)->kn;
+
+ if (kn_from == kn_to)
+ return strlcpy(buf, "/", buflen);
+
+ common = kernfs_common_ancestor(kn_from, kn_to);
+ if (WARN_ON(!common))
+ return -1;
+
+ depth_to = kernfs_depth(common, kn_to);
+ depth_from = kernfs_depth(common, kn_from);
+
+ if (buf)
+ buf[0] = '\0';
+
+ for (i = 0; i < depth_from; i++)
+ len += strlcpy(buf + len, parent_str,
+ len < buflen ? buflen - len : 0);
+
+ /* Calculate how many bytes we need for the rest */
+ for (kn = kn_to; kn != common; kn = kn->parent)
+ nlen += strlen(kn->name) + 1;
+
+ if (len + nlen >= buflen)
+ return len + nlen;
+
+ p = buf + len + nlen;
+ *p = '\0';
+ for (kn = kn_to; kn != common; kn = kn->parent) {
+ nlen = strlen(kn->name);
+ p -= nlen;
+ memcpy(p, kn->name, nlen);
+ *(--p) = '/';
+ }
- return p;
+ return len + nlen;
}
/**
@@ -115,6 +209,34 @@ size_t kernfs_path_len(struct kernfs_node *kn)
}
/**
+ * kernfs_path_from_node - build path of node @to relative to @from.
+ * @from: parent kernfs_node relative to which we need to build the path
+ * @to: kernfs_node of interest
+ * @buf: buffer to copy @to's path into
+ * @buflen: size of @buf
+ *
+ * Builds @to's path relative to @from in @buf. @from and @to must
+ * be on the same kernfs-root. If @from is not parent of @to, then a relative
+ * path (which includes '..'s) as needed to reach from @from to @to is
+ * returned.
+ *
+ * If @buf isn't long enough, the return value will be greater than @buflen
+ * and @buf contents are undefined.
+ */
+int kernfs_path_from_node(struct kernfs_node *to, struct kernfs_node *from,
+ char *buf, size_t buflen)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&kernfs_rename_lock, flags);
+ ret = kernfs_path_from_node_locked(to, from, buf, buflen);
+ spin_unlock_irqrestore(&kernfs_rename_lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(kernfs_path_from_node);
+
+/**
* kernfs_path - build full path of a given node
* @kn: kernfs_node of interest
* @buf: buffer to copy @kn's name into
@@ -127,13 +249,12 @@ size_t kernfs_path_len(struct kernfs_node *kn)
*/
char *kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen)
{
- unsigned long flags;
- char *p;
+ int ret;
- spin_lock_irqsave(&kernfs_rename_lock, flags);
- p = kernfs_path_locked(kn, buf, buflen);
- spin_unlock_irqrestore(&kernfs_rename_lock, flags);
- return p;
+ ret = kernfs_path_from_node(kn, NULL, buf, buflen);
+ if (ret < 0 || ret >= buflen)
+ return NULL;
+ return buf;
}
EXPORT_SYMBOL_GPL(kernfs_path);
@@ -164,17 +285,25 @@ void pr_cont_kernfs_name(struct kernfs_node *kn)
void pr_cont_kernfs_path(struct kernfs_node *kn)
{
unsigned long flags;
- char *p;
+ int sz;
spin_lock_irqsave(&kernfs_rename_lock, flags);
- p = kernfs_path_locked(kn, kernfs_pr_cont_buf,
- sizeof(kernfs_pr_cont_buf));
- if (p)
- pr_cont("%s", p);
- else
- pr_cont("<name too long>");
+ sz = kernfs_path_from_node_locked(kn, NULL, kernfs_pr_cont_buf,
+ sizeof(kernfs_pr_cont_buf));
+ if (sz < 0) {
+ pr_cont("(error)");
+ goto out;
+ }
+
+ if (sz >= sizeof(kernfs_pr_cont_buf)) {
+ pr_cont("(name too long)");
+ goto out;
+ }
+ pr_cont("%s", kernfs_pr_cont_buf);
+
+out:
spin_unlock_irqrestore(&kernfs_rename_lock, flags);
}
@@ -691,15 +820,22 @@ static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent,
const unsigned char *path,
const void *ns)
{
- static char path_buf[PATH_MAX]; /* protected by kernfs_mutex */
- size_t len = strlcpy(path_buf, path, PATH_MAX);
- char *p = path_buf;
- char *name;
+ size_t len;
+ char *p, *name;
lockdep_assert_held(&kernfs_mutex);
- if (len >= PATH_MAX)
+ /* grab kernfs_rename_lock to piggy back on kernfs_pr_cont_buf */
+ spin_lock_irq(&kernfs_rename_lock);
+
+ len = strlcpy(kernfs_pr_cont_buf, path, sizeof(kernfs_pr_cont_buf));
+
+ if (len >= sizeof(kernfs_pr_cont_buf)) {
+ spin_unlock_irq(&kernfs_rename_lock);
return NULL;
+ }
+
+ p = kernfs_pr_cont_buf;
while ((name = strsep(&p, "/")) && parent) {
if (*name == '\0')
@@ -707,6 +843,8 @@ static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent,
parent = kernfs_find_ns(parent, name, ns);
}
+ spin_unlock_irq(&kernfs_rename_lock);
+
return parent;
}
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index 8eaf417187f1..b67dbccdaf88 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -14,6 +14,7 @@
#include <linux/magic.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
+#include <linux/namei.h>
#include "kernfs-internal.h"
@@ -62,6 +63,74 @@ struct kernfs_root *kernfs_root_from_sb(struct super_block *sb)
return NULL;
}
+/*
+ * find the next ancestor in the path down to @child, where @parent was the
+ * ancestor whose descendant we want to find.
+ *
+ * Say the path is /a/b/c/d. @child is d, @parent is NULL. We return the root
+ * node. If @parent is b, then we return the node for c.
+ * Passing in d as @parent is not ok.
+ */
+static struct kernfs_node *find_next_ancestor(struct kernfs_node *child,
+ struct kernfs_node *parent)
+{
+ if (child == parent) {
+ pr_crit_once("BUG in find_next_ancestor: called with parent == child");
+ return NULL;
+ }
+
+ while (child->parent != parent) {
+ if (!child->parent)
+ return NULL;
+ child = child->parent;
+ }
+
+ return child;
+}
+
+/**
+ * kernfs_node_dentry - get a dentry for the given kernfs_node
+ * @kn: kernfs_node for which a dentry is needed
+ * @sb: the kernfs super_block
+ */
+struct dentry *kernfs_node_dentry(struct kernfs_node *kn,
+ struct super_block *sb)
+{
+ struct dentry *dentry;
+ struct kernfs_node *knparent = NULL;
+
+ BUG_ON(sb->s_op != &kernfs_sops);
+
+ dentry = dget(sb->s_root);
+
+ /* Check if this is the root kernfs_node */
+ if (!kn->parent)
+ return dentry;
+
+ knparent = find_next_ancestor(kn, NULL);
+ if (WARN_ON(!knparent))
+ return ERR_PTR(-EINVAL);
+
+ do {
+ struct dentry *dtmp;
+ struct kernfs_node *kntmp;
+
+ if (kn == knparent)
+ return dentry;
+ kntmp = find_next_ancestor(kn, knparent);
+ if (WARN_ON(!kntmp))
+ return ERR_PTR(-EINVAL);
+ mutex_lock(&d_inode(dentry)->i_mutex);
+ dtmp = lookup_one_len(kntmp->name, dentry, strlen(kntmp->name));
+ mutex_unlock(&d_inode(dentry)->i_mutex);
+ dput(dentry);
+ if (IS_ERR(dtmp))
+ return dtmp;
+ knparent = kntmp;
+ dentry = dtmp;
+ } while (true);
+}
+
static int kernfs_fill_super(struct super_block *sb, unsigned long magic)
{
struct kernfs_super_info *info = kernfs_info(sb);
diff --git a/fs/mbcache.c b/fs/mbcache.c
index 187477ded6b3..eccda3a02de6 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -1,858 +1,433 @@
-/*
- * linux/fs/mbcache.c
- * (C) 2001-2002 Andreas Gruenbacher, <a.gruenbacher@computer.org>
- */
-
-/*
- * Filesystem Meta Information Block Cache (mbcache)
- *
- * The mbcache caches blocks of block devices that need to be located
- * by their device/block number, as well as by other criteria (such
- * as the block's contents).
- *
- * There can only be one cache entry in a cache per device and block number.
- * Additional indexes need not be unique in this sense. The number of
- * additional indexes (=other criteria) can be hardwired at compile time
- * or specified at cache create time.
- *
- * Each cache entry is of fixed size. An entry may be `valid' or `invalid'
- * in the cache. A valid entry is in the main hash tables of the cache,
- * and may also be in the lru list. An invalid entry is not in any hashes
- * or lists.
- *
- * A valid cache entry is only in the lru list if no handles refer to it.
- * Invalid cache entries will be freed when the last handle to the cache
- * entry is released. Entries that cannot be freed immediately are put
- * back on the lru list.
- */
-
-/*
- * Lock descriptions and usage:
- *
- * Each hash chain of both the block and index hash tables now contains
- * a built-in lock used to serialize accesses to the hash chain.
- *
- * Accesses to global data structures mb_cache_list and mb_cache_lru_list
- * are serialized via the global spinlock mb_cache_spinlock.
- *
- * Each mb_cache_entry contains a spinlock, e_entry_lock, to serialize
- * accesses to its local data, such as e_used and e_queued.
- *
- * Lock ordering:
- *
- * Each block hash chain's lock has the highest lock order, followed by an
- * index hash chain's lock, mb_cache_bg_lock (used to implement mb_cache_entry's
- * lock), and mb_cach_spinlock, with the lowest order. While holding
- * either a block or index hash chain lock, a thread can acquire an
- * mc_cache_bg_lock, which in turn can also acquire mb_cache_spinlock.
- *
- * Synchronization:
- *
- * Since both mb_cache_entry_get and mb_cache_entry_find scan the block and
- * index hash chian, it needs to lock the corresponding hash chain. For each
- * mb_cache_entry within the chain, it needs to lock the mb_cache_entry to
- * prevent either any simultaneous release or free on the entry and also
- * to serialize accesses to either the e_used or e_queued member of the entry.
- *
- * To avoid having a dangling reference to an already freed
- * mb_cache_entry, an mb_cache_entry is only freed when it is not on a
- * block hash chain and also no longer being referenced, both e_used,
- * and e_queued are 0's. When an mb_cache_entry is explicitly freed it is
- * first removed from a block hash chain.
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-#include <linux/hash.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
+#include <linux/spinlock.h>
#include <linux/slab.h>
-#include <linux/sched.h>
+#include <linux/list.h>
#include <linux/list_bl.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/workqueue.h>
#include <linux/mbcache.h>
-#include <linux/init.h>
-#include <linux/blockgroup_lock.h>
-#include <linux/log2.h>
-
-#ifdef MB_CACHE_DEBUG
-# define mb_debug(f...) do { \
- printk(KERN_DEBUG f); \
- printk("\n"); \
- } while (0)
-#define mb_assert(c) do { if (!(c)) \
- printk(KERN_ERR "assertion " #c " failed\n"); \
- } while(0)
-#else
-# define mb_debug(f...) do { } while(0)
-# define mb_assert(c) do { } while(0)
-#endif
-#define mb_error(f...) do { \
- printk(KERN_ERR f); \
- printk("\n"); \
- } while(0)
-
-#define MB_CACHE_WRITER ((unsigned short)~0U >> 1)
-
-#define MB_CACHE_ENTRY_LOCK_BITS ilog2(NR_BG_LOCKS)
-#define MB_CACHE_ENTRY_LOCK_INDEX(ce) \
- (hash_long((unsigned long)ce, MB_CACHE_ENTRY_LOCK_BITS))
-
-static DECLARE_WAIT_QUEUE_HEAD(mb_cache_queue);
-static struct blockgroup_lock *mb_cache_bg_lock;
-static struct kmem_cache *mb_cache_kmem_cache;
-
-MODULE_AUTHOR("Andreas Gruenbacher <a.gruenbacher@computer.org>");
-MODULE_DESCRIPTION("Meta block cache (for extended attributes)");
-MODULE_LICENSE("GPL");
-
-EXPORT_SYMBOL(mb_cache_create);
-EXPORT_SYMBOL(mb_cache_shrink);
-EXPORT_SYMBOL(mb_cache_destroy);
-EXPORT_SYMBOL(mb_cache_entry_alloc);
-EXPORT_SYMBOL(mb_cache_entry_insert);
-EXPORT_SYMBOL(mb_cache_entry_release);
-EXPORT_SYMBOL(mb_cache_entry_free);
-EXPORT_SYMBOL(mb_cache_entry_get);
-#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0)
-EXPORT_SYMBOL(mb_cache_entry_find_first);
-EXPORT_SYMBOL(mb_cache_entry_find_next);
-#endif
/*
- * Global data: list of all mbcache's, lru list, and a spinlock for
- * accessing cache data structures on SMP machines. The lru list is
- * global across all mbcaches.
+ * Mbcache is a simple key-value store. Keys need not be unique, however
+ * key-value pairs are expected to be unique (we use this fact in
+ * mb_cache_entry_delete_block()).
+ *
+ * Ext2 and ext4 use this cache for deduplication of extended attribute blocks.
+ * They use hash of a block contents as a key and block number as a value.
+ * That's why keys need not be unique (different xattr blocks may end up having
+ * the same hash). However block number always uniquely identifies a cache
+ * entry.
+ *
+ * We provide functions for creation and removal of entries, search by key,
+ * and a special "delete entry with given key-value pair" operation. Fixed
+ * size hash table is used for fast key lookups.
*/
-static LIST_HEAD(mb_cache_list);
-static LIST_HEAD(mb_cache_lru_list);
-static DEFINE_SPINLOCK(mb_cache_spinlock);
-
-static inline void
-__spin_lock_mb_cache_entry(struct mb_cache_entry *ce)
-{
- spin_lock(bgl_lock_ptr(mb_cache_bg_lock,
- MB_CACHE_ENTRY_LOCK_INDEX(ce)));
-}
-
-static inline void
-__spin_unlock_mb_cache_entry(struct mb_cache_entry *ce)
-{
- spin_unlock(bgl_lock_ptr(mb_cache_bg_lock,
- MB_CACHE_ENTRY_LOCK_INDEX(ce)));
-}
-
-static inline int
-__mb_cache_entry_is_block_hashed(struct mb_cache_entry *ce)
-{
- return !hlist_bl_unhashed(&ce->e_block_list);
-}
+struct mb_cache {
+ /* Hash table of entries */
+ struct hlist_bl_head *c_hash;
+ /* log2 of hash table size */
+ int c_bucket_bits;
+ /* Maximum entries in cache to avoid degrading hash too much */
+ int c_max_entries;
+ /* Protects c_list, c_entry_count */
+ spinlock_t c_list_lock;
+ struct list_head c_list;
+ /* Number of entries in cache */
+ unsigned long c_entry_count;
+ struct shrinker c_shrink;
+ /* Work for shrinking when the cache has too many entries */
+ struct work_struct c_shrink_work;
+};
+static struct kmem_cache *mb_entry_cache;
-static inline void
-__mb_cache_entry_unhash_block(struct mb_cache_entry *ce)
-{
- if (__mb_cache_entry_is_block_hashed(ce))
- hlist_bl_del_init(&ce->e_block_list);
-}
+static unsigned long mb_cache_shrink(struct mb_cache *cache,
+ unsigned int nr_to_scan);
-static inline int
-__mb_cache_entry_is_index_hashed(struct mb_cache_entry *ce)
+static inline struct hlist_bl_head *mb_cache_entry_head(struct mb_cache *cache,
+ u32 key)
{
- return !hlist_bl_unhashed(&ce->e_index.o_list);
+ return &cache->c_hash[hash_32(key, cache->c_bucket_bits)];
}
-static inline void
-__mb_cache_entry_unhash_index(struct mb_cache_entry *ce)
-{
- if (__mb_cache_entry_is_index_hashed(ce))
- hlist_bl_del_init(&ce->e_index.o_list);
-}
+/*
+ * Number of entries to reclaim synchronously when there are too many entries
+ * in cache
+ */
+#define SYNC_SHRINK_BATCH 64
/*
- * __mb_cache_entry_unhash_unlock()
- *
- * This function is called to unhash both the block and index hash
- * chain.
- * It assumes both the block and index hash chain is locked upon entry.
- * It also unlock both hash chains both exit
+ * mb_cache_entry_create - create entry in cache
+ * @cache - cache where the entry should be created
+ * @mask - gfp mask with which the entry should be allocated
+ * @key - key of the entry
+ * @block - block that contains data
+ * @reusable - is the block reusable by other inodes?
+ *
+ * Creates entry in @cache with key @key and records that data is stored in
+ * block @block. The function returns -EBUSY if entry with the same key
+ * and for the same block already exists in cache. Otherwise 0 is returned.
*/
-static inline void
-__mb_cache_entry_unhash_unlock(struct mb_cache_entry *ce)
+int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
+ sector_t block, bool reusable)
{
- __mb_cache_entry_unhash_index(ce);
- hlist_bl_unlock(ce->e_index_hash_p);
- __mb_cache_entry_unhash_block(ce);
- hlist_bl_unlock(ce->e_block_hash_p);
+ struct mb_cache_entry *entry, *dup;
+ struct hlist_bl_node *dup_node;
+ struct hlist_bl_head *head;
+
+ /* Schedule background reclaim if there are too many entries */
+ if (cache->c_entry_count >= cache->c_max_entries)
+ schedule_work(&cache->c_shrink_work);
+ /* Do some sync reclaim if background reclaim cannot keep up */
+ if (cache->c_entry_count >= 2*cache->c_max_entries)
+ mb_cache_shrink(cache, SYNC_SHRINK_BATCH);
+
+ entry = kmem_cache_alloc(mb_entry_cache, mask);
+ if (!entry)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&entry->e_list);
+ /* One ref for hash, one ref returned */
+ atomic_set(&entry->e_refcnt, 1);
+ entry->e_key = key;
+ entry->e_block = block;
+ entry->e_reusable = reusable;
+ head = mb_cache_entry_head(cache, key);
+ hlist_bl_lock(head);
+ hlist_bl_for_each_entry(dup, dup_node, head, e_hash_list) {
+ if (dup->e_key == key && dup->e_block == block) {
+ hlist_bl_unlock(head);
+ kmem_cache_free(mb_entry_cache, entry);
+ return -EBUSY;
+ }
+ }
+ hlist_bl_add_head(&entry->e_hash_list, head);
+ hlist_bl_unlock(head);
+
+ spin_lock(&cache->c_list_lock);
+ list_add_tail(&entry->e_list, &cache->c_list);
+ /* Grab ref for LRU list */
+ atomic_inc(&entry->e_refcnt);
+ cache->c_entry_count++;
+ spin_unlock(&cache->c_list_lock);
+
+ return 0;
}
+EXPORT_SYMBOL(mb_cache_entry_create);
-static void
-__mb_cache_entry_forget(struct mb_cache_entry *ce, gfp_t gfp_mask)
+void __mb_cache_entry_free(struct mb_cache_entry *entry)
{
- struct mb_cache *cache = ce->e_cache;
-
- mb_assert(!(ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt)));
- kmem_cache_free(cache->c_entry_cache, ce);
- atomic_dec(&cache->c_entry_count);
+ kmem_cache_free(mb_entry_cache, entry);
}
+EXPORT_SYMBOL(__mb_cache_entry_free);
-static void
-__mb_cache_entry_release(struct mb_cache_entry *ce)
+static struct mb_cache_entry *__entry_find(struct mb_cache *cache,
+ struct mb_cache_entry *entry,
+ u32 key)
{
- /* First lock the entry to serialize access to its local data. */
- __spin_lock_mb_cache_entry(ce);
- /* Wake up all processes queuing for this cache entry. */
- if (ce->e_queued)
- wake_up_all(&mb_cache_queue);
- if (ce->e_used >= MB_CACHE_WRITER)
- ce->e_used -= MB_CACHE_WRITER;
- /*
- * Make sure that all cache entries on lru_list have
- * both e_used and e_qued of 0s.
- */
- ce->e_used--;
- if (!(ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt))) {
- if (!__mb_cache_entry_is_block_hashed(ce)) {
- __spin_unlock_mb_cache_entry(ce);
- goto forget;
+ struct mb_cache_entry *old_entry = entry;
+ struct hlist_bl_node *node;
+ struct hlist_bl_head *head;
+
+ head = mb_cache_entry_head(cache, key);
+ hlist_bl_lock(head);
+ if (entry && !hlist_bl_unhashed(&entry->e_hash_list))
+ node = entry->e_hash_list.next;
+ else
+ node = hlist_bl_first(head);
+ while (node) {
+ entry = hlist_bl_entry(node, struct mb_cache_entry,
+ e_hash_list);
+ if (entry->e_key == key && entry->e_reusable) {
+ atomic_inc(&entry->e_refcnt);
+ goto out;
}
- /*
- * Need access to lru list, first drop entry lock,
- * then reacquire the lock in the proper order.
- */
- spin_lock(&mb_cache_spinlock);
- if (list_empty(&ce->e_lru_list))
- list_add_tail(&ce->e_lru_list, &mb_cache_lru_list);
- spin_unlock(&mb_cache_spinlock);
+ node = node->next;
}
- __spin_unlock_mb_cache_entry(ce);
- return;
-forget:
- mb_assert(list_empty(&ce->e_lru_list));
- __mb_cache_entry_forget(ce, GFP_KERNEL);
+ entry = NULL;
+out:
+ hlist_bl_unlock(head);
+ if (old_entry)
+ mb_cache_entry_put(cache, old_entry);
+
+ return entry;
}
/*
- * mb_cache_shrink_scan() memory pressure callback
- *
- * This function is called by the kernel memory management when memory
- * gets low.
+ * mb_cache_entry_find_first - find the first entry in cache with given key
+ * @cache: cache where we should search
+ * @key: key to look for
*
- * @shrink: (ignored)
- * @sc: shrink_control passed from reclaim
- *
- * Returns the number of objects freed.
+ * Search in @cache for entry with key @key. Grabs reference to the first
+ * entry found and returns the entry.
*/
-static unsigned long
-mb_cache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
+struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache,
+ u32 key)
{
- LIST_HEAD(free_list);
- struct mb_cache_entry *entry, *tmp;
- int nr_to_scan = sc->nr_to_scan;
- gfp_t gfp_mask = sc->gfp_mask;
- unsigned long freed = 0;
-
- mb_debug("trying to free %d entries", nr_to_scan);
- spin_lock(&mb_cache_spinlock);
- while ((nr_to_scan-- > 0) && !list_empty(&mb_cache_lru_list)) {
- struct mb_cache_entry *ce =
- list_entry(mb_cache_lru_list.next,
- struct mb_cache_entry, e_lru_list);
- list_del_init(&ce->e_lru_list);
- if (ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt))
- continue;
- spin_unlock(&mb_cache_spinlock);
- /* Prevent any find or get operation on the entry */
- hlist_bl_lock(ce->e_block_hash_p);
- hlist_bl_lock(ce->e_index_hash_p);
- /* Ignore if it is touched by a find/get */
- if (ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt) ||
- !list_empty(&ce->e_lru_list)) {
- hlist_bl_unlock(ce->e_index_hash_p);
- hlist_bl_unlock(ce->e_block_hash_p);
- spin_lock(&mb_cache_spinlock);
- continue;
- }
- __mb_cache_entry_unhash_unlock(ce);
- list_add_tail(&ce->e_lru_list, &free_list);
- spin_lock(&mb_cache_spinlock);
- }
- spin_unlock(&mb_cache_spinlock);
-
- list_for_each_entry_safe(entry, tmp, &free_list, e_lru_list) {
- __mb_cache_entry_forget(entry, gfp_mask);
- freed++;
- }
- return freed;
+ return __entry_find(cache, NULL, key);
}
+EXPORT_SYMBOL(mb_cache_entry_find_first);
-static unsigned long
-mb_cache_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
+/*
+ * mb_cache_entry_find_next - find next entry in cache with the same
+ * @cache: cache where we should search
+ * @entry: entry to start search from
+ *
+ * Finds next entry in the hash chain which has the same key as @entry.
+ * If @entry is unhashed (which can happen when deletion of entry races
+ * with the search), finds the first entry in the hash chain. The function
+ * drops reference to @entry and returns with a reference to the found entry.
+ */
+struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache,
+ struct mb_cache_entry *entry)
{
- struct mb_cache *cache;
- unsigned long count = 0;
-
- spin_lock(&mb_cache_spinlock);
- list_for_each_entry(cache, &mb_cache_list, c_cache_list) {
- mb_debug("cache %s (%d)", cache->c_name,
- atomic_read(&cache->c_entry_count));
- count += atomic_read(&cache->c_entry_count);
- }
- spin_unlock(&mb_cache_spinlock);
-
- return vfs_pressure_ratio(count);
+ return __entry_find(cache, entry, entry->e_key);
}
-
-static struct shrinker mb_cache_shrinker = {
- .count_objects = mb_cache_shrink_count,
- .scan_objects = mb_cache_shrink_scan,
- .seeks = DEFAULT_SEEKS,
-};
+EXPORT_SYMBOL(mb_cache_entry_find_next);
/*
- * mb_cache_create() create a new cache
- *
- * All entries in one cache are equal size. Cache entries may be from
- * multiple devices. If this is the first mbcache created, registers
- * the cache with kernel memory management. Returns NULL if no more
- * memory was available.
- *
- * @name: name of the cache (informal)
- * @bucket_bits: log2(number of hash buckets)
+ * mb_cache_entry_get - get a cache entry by block number (and key)
+ * @cache - cache we work with
+ * @key - key of block number @block
+ * @block - block number
*/
-struct mb_cache *
-mb_cache_create(const char *name, int bucket_bits)
+struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
+ sector_t block)
{
- int n, bucket_count = 1 << bucket_bits;
- struct mb_cache *cache = NULL;
-
- if (!mb_cache_bg_lock) {
- mb_cache_bg_lock = kmalloc(sizeof(struct blockgroup_lock),
- GFP_KERNEL);
- if (!mb_cache_bg_lock)
- return NULL;
- bgl_lock_init(mb_cache_bg_lock);
- }
-
- cache = kmalloc(sizeof(struct mb_cache), GFP_KERNEL);
- if (!cache)
- return NULL;
- cache->c_name = name;
- atomic_set(&cache->c_entry_count, 0);
- cache->c_bucket_bits = bucket_bits;
- cache->c_block_hash = kmalloc(bucket_count *
- sizeof(struct hlist_bl_head), GFP_KERNEL);
- if (!cache->c_block_hash)
- goto fail;
- for (n=0; n<bucket_count; n++)
- INIT_HLIST_BL_HEAD(&cache->c_block_hash[n]);
- cache->c_index_hash = kmalloc(bucket_count *
- sizeof(struct hlist_bl_head), GFP_KERNEL);
- if (!cache->c_index_hash)
- goto fail;
- for (n=0; n<bucket_count; n++)
- INIT_HLIST_BL_HEAD(&cache->c_index_hash[n]);
- if (!mb_cache_kmem_cache) {
- mb_cache_kmem_cache = kmem_cache_create(name,
- sizeof(struct mb_cache_entry), 0,
- SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL);
- if (!mb_cache_kmem_cache)
- goto fail2;
+ struct hlist_bl_node *node;
+ struct hlist_bl_head *head;
+ struct mb_cache_entry *entry;
+
+ head = mb_cache_entry_head(cache, key);
+ hlist_bl_lock(head);
+ hlist_bl_for_each_entry(entry, node, head, e_hash_list) {
+ if (entry->e_key == key && entry->e_block == block) {
+ atomic_inc(&entry->e_refcnt);
+ goto out;
+ }
}
- cache->c_entry_cache = mb_cache_kmem_cache;
-
- /*
- * Set an upper limit on the number of cache entries so that the hash
- * chains won't grow too long.
- */
- cache->c_max_entries = bucket_count << 4;
-
- spin_lock(&mb_cache_spinlock);
- list_add(&cache->c_cache_list, &mb_cache_list);
- spin_unlock(&mb_cache_spinlock);
- return cache;
-
-fail2:
- kfree(cache->c_index_hash);
-
-fail:
- kfree(cache->c_block_hash);
- kfree(cache);
- return NULL;
+ entry = NULL;
+out:
+ hlist_bl_unlock(head);
+ return entry;
}
+EXPORT_SYMBOL(mb_cache_entry_get);
-
-/*
- * mb_cache_shrink()
- *
- * Removes all cache entries of a device from the cache. All cache entries
- * currently in use cannot be freed, and thus remain in the cache. All others
- * are freed.
+/* mb_cache_entry_delete_block - remove information about block from cache
+ * @cache - cache we work with
+ * @key - key of block @block
+ * @block - block number
*
- * @bdev: which device's cache entries to shrink
+ * Remove entry from cache @cache with key @key with data stored in @block.
*/
-void
-mb_cache_shrink(struct block_device *bdev)
+void mb_cache_entry_delete_block(struct mb_cache *cache, u32 key,
+ sector_t block)
{
- LIST_HEAD(free_list);
- struct list_head *l;
- struct mb_cache_entry *ce, *tmp;
-
- l = &mb_cache_lru_list;
- spin_lock(&mb_cache_spinlock);
- while (!list_is_last(l, &mb_cache_lru_list)) {
- l = l->next;
- ce = list_entry(l, struct mb_cache_entry, e_lru_list);
- if (ce->e_bdev == bdev) {
- list_del_init(&ce->e_lru_list);
- if (ce->e_used || ce->e_queued ||
- atomic_read(&ce->e_refcnt))
- continue;
- spin_unlock(&mb_cache_spinlock);
- /*
- * Prevent any find or get operation on the entry.
- */
- hlist_bl_lock(ce->e_block_hash_p);
- hlist_bl_lock(ce->e_index_hash_p);
- /* Ignore if it is touched by a find/get */
- if (ce->e_used || ce->e_queued ||
- atomic_read(&ce->e_refcnt) ||
- !list_empty(&ce->e_lru_list)) {
- hlist_bl_unlock(ce->e_index_hash_p);
- hlist_bl_unlock(ce->e_block_hash_p);
- l = &mb_cache_lru_list;
- spin_lock(&mb_cache_spinlock);
- continue;
+ struct hlist_bl_node *node;
+ struct hlist_bl_head *head;
+ struct mb_cache_entry *entry;
+
+ head = mb_cache_entry_head(cache, key);
+ hlist_bl_lock(head);
+ hlist_bl_for_each_entry(entry, node, head, e_hash_list) {
+ if (entry->e_key == key && entry->e_block == block) {
+ /* We keep hash list reference to keep entry alive */
+ hlist_bl_del_init(&entry->e_hash_list);
+ hlist_bl_unlock(head);
+ spin_lock(&cache->c_list_lock);
+ if (!list_empty(&entry->e_list)) {
+ list_del_init(&entry->e_list);
+ cache->c_entry_count--;
+ atomic_dec(&entry->e_refcnt);
}
- __mb_cache_entry_unhash_unlock(ce);
- mb_assert(!(ce->e_used || ce->e_queued ||
- atomic_read(&ce->e_refcnt)));
- list_add_tail(&ce->e_lru_list, &free_list);
- l = &mb_cache_lru_list;
- spin_lock(&mb_cache_spinlock);
+ spin_unlock(&cache->c_list_lock);
+ mb_cache_entry_put(cache, entry);
+ return;
}
}
- spin_unlock(&mb_cache_spinlock);
-
- list_for_each_entry_safe(ce, tmp, &free_list, e_lru_list) {
- __mb_cache_entry_forget(ce, GFP_KERNEL);
- }
+ hlist_bl_unlock(head);
}
+EXPORT_SYMBOL(mb_cache_entry_delete_block);
-
-/*
- * mb_cache_destroy()
+/* mb_cache_entry_touch - cache entry got used
+ * @cache - cache the entry belongs to
+ * @entry - entry that got used
*
- * Shrinks the cache to its minimum possible size (hopefully 0 entries),
- * and then destroys it. If this was the last mbcache, un-registers the
- * mbcache from kernel memory management.
+ * Marks entry as used to give hit higher chances of surviving in cache.
*/
-void
-mb_cache_destroy(struct mb_cache *cache)
+void mb_cache_entry_touch(struct mb_cache *cache,
+ struct mb_cache_entry *entry)
{
- LIST_HEAD(free_list);
- struct mb_cache_entry *ce, *tmp;
-
- spin_lock(&mb_cache_spinlock);
- list_for_each_entry_safe(ce, tmp, &mb_cache_lru_list, e_lru_list) {
- if (ce->e_cache == cache)
- list_move_tail(&ce->e_lru_list, &free_list);
- }
- list_del(&cache->c_cache_list);
- spin_unlock(&mb_cache_spinlock);
-
- list_for_each_entry_safe(ce, tmp, &free_list, e_lru_list) {
- list_del_init(&ce->e_lru_list);
- /*
- * Prevent any find or get operation on the entry.
- */
- hlist_bl_lock(ce->e_block_hash_p);
- hlist_bl_lock(ce->e_index_hash_p);
- mb_assert(!(ce->e_used || ce->e_queued ||
- atomic_read(&ce->e_refcnt)));
- __mb_cache_entry_unhash_unlock(ce);
- __mb_cache_entry_forget(ce, GFP_KERNEL);
- }
-
- if (atomic_read(&cache->c_entry_count) > 0) {
- mb_error("cache %s: %d orphaned entries",
- cache->c_name,
- atomic_read(&cache->c_entry_count));
- }
-
- if (list_empty(&mb_cache_list)) {
- kmem_cache_destroy(mb_cache_kmem_cache);
- mb_cache_kmem_cache = NULL;
- }
- kfree(cache->c_index_hash);
- kfree(cache->c_block_hash);
- kfree(cache);
+ entry->e_referenced = 1;
}
+EXPORT_SYMBOL(mb_cache_entry_touch);
-/*
- * mb_cache_entry_alloc()
- *
- * Allocates a new cache entry. The new entry will not be valid initially,
- * and thus cannot be looked up yet. It should be filled with data, and
- * then inserted into the cache using mb_cache_entry_insert(). Returns NULL
- * if no more memory was available.
- */
-struct mb_cache_entry *
-mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags)
+static unsigned long mb_cache_count(struct shrinker *shrink,
+ struct shrink_control *sc)
{
- struct mb_cache_entry *ce;
-
- if (atomic_read(&cache->c_entry_count) >= cache->c_max_entries) {
- struct list_head *l;
-
- l = &mb_cache_lru_list;
- spin_lock(&mb_cache_spinlock);
- while (!list_is_last(l, &mb_cache_lru_list)) {
- l = l->next;
- ce = list_entry(l, struct mb_cache_entry, e_lru_list);
- if (ce->e_cache == cache) {
- list_del_init(&ce->e_lru_list);
- if (ce->e_used || ce->e_queued ||
- atomic_read(&ce->e_refcnt))
- continue;
- spin_unlock(&mb_cache_spinlock);
- /*
- * Prevent any find or get operation on the
- * entry.
- */
- hlist_bl_lock(ce->e_block_hash_p);
- hlist_bl_lock(ce->e_index_hash_p);
- /* Ignore if it is touched by a find/get */
- if (ce->e_used || ce->e_queued ||
- atomic_read(&ce->e_refcnt) ||
- !list_empty(&ce->e_lru_list)) {
- hlist_bl_unlock(ce->e_index_hash_p);
- hlist_bl_unlock(ce->e_block_hash_p);
- l = &mb_cache_lru_list;
- spin_lock(&mb_cache_spinlock);
- continue;
- }
- mb_assert(list_empty(&ce->e_lru_list));
- mb_assert(!(ce->e_used || ce->e_queued ||
- atomic_read(&ce->e_refcnt)));
- __mb_cache_entry_unhash_unlock(ce);
- goto found;
- }
- }
- spin_unlock(&mb_cache_spinlock);
- }
+ struct mb_cache *cache = container_of(shrink, struct mb_cache,
+ c_shrink);
- ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags);
- if (!ce)
- return NULL;
- atomic_inc(&cache->c_entry_count);
- INIT_LIST_HEAD(&ce->e_lru_list);
- INIT_HLIST_BL_NODE(&ce->e_block_list);
- INIT_HLIST_BL_NODE(&ce->e_index.o_list);
- ce->e_cache = cache;
- ce->e_queued = 0;
- atomic_set(&ce->e_refcnt, 0);
-found:
- ce->e_block_hash_p = &cache->c_block_hash[0];
- ce->e_index_hash_p = &cache->c_index_hash[0];
- ce->e_used = 1 + MB_CACHE_WRITER;
- return ce;
+ return cache->c_entry_count;
}
-
-/*
- * mb_cache_entry_insert()
- *
- * Inserts an entry that was allocated using mb_cache_entry_alloc() into
- * the cache. After this, the cache entry can be looked up, but is not yet
- * in the lru list as the caller still holds a handle to it. Returns 0 on
- * success, or -EBUSY if a cache entry for that device + inode exists
- * already (this may happen after a failed lookup, but when another process
- * has inserted the same cache entry in the meantime).
- *
- * @bdev: device the cache entry belongs to
- * @block: block number
- * @key: lookup key
- */
-int
-mb_cache_entry_insert(struct mb_cache_entry *ce, struct block_device *bdev,
- sector_t block, unsigned int key)
+/* Shrink number of entries in cache */
+static unsigned long mb_cache_shrink(struct mb_cache *cache,
+ unsigned int nr_to_scan)
{
- struct mb_cache *cache = ce->e_cache;
- unsigned int bucket;
- struct hlist_bl_node *l;
- struct hlist_bl_head *block_hash_p;
- struct hlist_bl_head *index_hash_p;
- struct mb_cache_entry *lce;
-
- mb_assert(ce);
- bucket = hash_long((unsigned long)bdev + (block & 0xffffffff),
- cache->c_bucket_bits);
- block_hash_p = &cache->c_block_hash[bucket];
- hlist_bl_lock(block_hash_p);
- hlist_bl_for_each_entry(lce, l, block_hash_p, e_block_list) {
- if (lce->e_bdev == bdev && lce->e_block == block) {
- hlist_bl_unlock(block_hash_p);
- return -EBUSY;
+ struct mb_cache_entry *entry;
+ struct hlist_bl_head *head;
+ unsigned int shrunk = 0;
+
+ spin_lock(&cache->c_list_lock);
+ while (nr_to_scan-- && !list_empty(&cache->c_list)) {
+ entry = list_first_entry(&cache->c_list,
+ struct mb_cache_entry, e_list);
+ if (entry->e_referenced) {
+ entry->e_referenced = 0;
+ list_move_tail(&cache->c_list, &entry->e_list);
+ continue;
}
+ list_del_init(&entry->e_list);
+ cache->c_entry_count--;
+ /*
+ * We keep LRU list reference so that entry doesn't go away
+ * from under us.
+ */
+ spin_unlock(&cache->c_list_lock);
+ head = mb_cache_entry_head(cache, entry->e_key);
+ hlist_bl_lock(head);
+ if (!hlist_bl_unhashed(&entry->e_hash_list)) {
+ hlist_bl_del_init(&entry->e_hash_list);
+ atomic_dec(&entry->e_refcnt);
+ }
+ hlist_bl_unlock(head);
+ if (mb_cache_entry_put(cache, entry))
+ shrunk++;
+ cond_resched();
+ spin_lock(&cache->c_list_lock);
}
- mb_assert(!__mb_cache_entry_is_block_hashed(ce));
- __mb_cache_entry_unhash_block(ce);
- __mb_cache_entry_unhash_index(ce);
- ce->e_bdev = bdev;
- ce->e_block = block;
- ce->e_block_hash_p = block_hash_p;
- ce->e_index.o_key = key;
- hlist_bl_add_head(&ce->e_block_list, block_hash_p);
- hlist_bl_unlock(block_hash_p);
- bucket = hash_long(key, cache->c_bucket_bits);
- index_hash_p = &cache->c_index_hash[bucket];
- hlist_bl_lock(index_hash_p);
- ce->e_index_hash_p = index_hash_p;
- hlist_bl_add_head(&ce->e_index.o_list, index_hash_p);
- hlist_bl_unlock(index_hash_p);
- return 0;
-}
+ spin_unlock(&cache->c_list_lock);
+ return shrunk;
+}
-/*
- * mb_cache_entry_release()
- *
- * Release a handle to a cache entry. When the last handle to a cache entry
- * is released it is either freed (if it is invalid) or otherwise inserted
- * in to the lru list.
- */
-void
-mb_cache_entry_release(struct mb_cache_entry *ce)
+static unsigned long mb_cache_scan(struct shrinker *shrink,
+ struct shrink_control *sc)
{
- __mb_cache_entry_release(ce);
+ int nr_to_scan = sc->nr_to_scan;
+ struct mb_cache *cache = container_of(shrink, struct mb_cache,
+ c_shrink);
+ return mb_cache_shrink(cache, nr_to_scan);
}
+/* We shrink 1/X of the cache when we have too many entries in it */
+#define SHRINK_DIVISOR 16
-/*
- * mb_cache_entry_free()
- *
- */
-void
-mb_cache_entry_free(struct mb_cache_entry *ce)
+static void mb_cache_shrink_worker(struct work_struct *work)
{
- mb_assert(ce);
- mb_assert(list_empty(&ce->e_lru_list));
- hlist_bl_lock(ce->e_index_hash_p);
- __mb_cache_entry_unhash_index(ce);
- hlist_bl_unlock(ce->e_index_hash_p);
- hlist_bl_lock(ce->e_block_hash_p);
- __mb_cache_entry_unhash_block(ce);
- hlist_bl_unlock(ce->e_block_hash_p);
- __mb_cache_entry_release(ce);
+ struct mb_cache *cache = container_of(work, struct mb_cache,
+ c_shrink_work);
+ mb_cache_shrink(cache, cache->c_max_entries / SHRINK_DIVISOR);
}
-
/*
- * mb_cache_entry_get()
+ * mb_cache_create - create cache
+ * @bucket_bits: log2 of the hash table size
*
- * Get a cache entry by device / block number. (There can only be one entry
- * in the cache per device and block.) Returns NULL if no such cache entry
- * exists. The returned cache entry is locked for exclusive access ("single
- * writer").
+ * Create cache for keys with 2^bucket_bits hash entries.
*/
-struct mb_cache_entry *
-mb_cache_entry_get(struct mb_cache *cache, struct block_device *bdev,
- sector_t block)
+struct mb_cache *mb_cache_create(int bucket_bits)
{
- unsigned int bucket;
- struct hlist_bl_node *l;
- struct mb_cache_entry *ce;
- struct hlist_bl_head *block_hash_p;
-
- bucket = hash_long((unsigned long)bdev + (block & 0xffffffff),
- cache->c_bucket_bits);
- block_hash_p = &cache->c_block_hash[bucket];
- /* First serialize access to the block corresponding hash chain. */
- hlist_bl_lock(block_hash_p);
- hlist_bl_for_each_entry(ce, l, block_hash_p, e_block_list) {
- mb_assert(ce->e_block_hash_p == block_hash_p);
- if (ce->e_bdev == bdev && ce->e_block == block) {
- /*
- * Prevent a free from removing the entry.
- */
- atomic_inc(&ce->e_refcnt);
- hlist_bl_unlock(block_hash_p);
- __spin_lock_mb_cache_entry(ce);
- atomic_dec(&ce->e_refcnt);
- if (ce->e_used > 0) {
- DEFINE_WAIT(wait);
- while (ce->e_used > 0) {
- ce->e_queued++;
- prepare_to_wait(&mb_cache_queue, &wait,
- TASK_UNINTERRUPTIBLE);
- __spin_unlock_mb_cache_entry(ce);
- schedule();
- __spin_lock_mb_cache_entry(ce);
- ce->e_queued--;
- }
- finish_wait(&mb_cache_queue, &wait);
- }
- ce->e_used += 1 + MB_CACHE_WRITER;
- __spin_unlock_mb_cache_entry(ce);
+ struct mb_cache *cache;
+ int bucket_count = 1 << bucket_bits;
+ int i;
- if (!list_empty(&ce->e_lru_list)) {
- spin_lock(&mb_cache_spinlock);
- list_del_init(&ce->e_lru_list);
- spin_unlock(&mb_cache_spinlock);
- }
- if (!__mb_cache_entry_is_block_hashed(ce)) {
- __mb_cache_entry_release(ce);
- return NULL;
- }
- return ce;
- }
+ if (!try_module_get(THIS_MODULE))
+ return NULL;
+
+ cache = kzalloc(sizeof(struct mb_cache), GFP_KERNEL);
+ if (!cache)
+ goto err_out;
+ cache->c_bucket_bits = bucket_bits;
+ cache->c_max_entries = bucket_count << 4;
+ INIT_LIST_HEAD(&cache->c_list);
+ spin_lock_init(&cache->c_list_lock);
+ cache->c_hash = kmalloc(bucket_count * sizeof(struct hlist_bl_head),
+ GFP_KERNEL);
+ if (!cache->c_hash) {
+ kfree(cache);
+ goto err_out;
}
- hlist_bl_unlock(block_hash_p);
- return NULL;
-}
+ for (i = 0; i < bucket_count; i++)
+ INIT_HLIST_BL_HEAD(&cache->c_hash[i]);
-#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0)
+ cache->c_shrink.count_objects = mb_cache_count;
+ cache->c_shrink.scan_objects = mb_cache_scan;
+ cache->c_shrink.seeks = DEFAULT_SEEKS;
+ register_shrinker(&cache->c_shrink);
-static struct mb_cache_entry *
-__mb_cache_entry_find(struct hlist_bl_node *l, struct hlist_bl_head *head,
- struct block_device *bdev, unsigned int key)
-{
+ INIT_WORK(&cache->c_shrink_work, mb_cache_shrink_worker);
- /* The index hash chain is alredy acquire by caller. */
- while (l != NULL) {
- struct mb_cache_entry *ce =
- hlist_bl_entry(l, struct mb_cache_entry,
- e_index.o_list);
- mb_assert(ce->e_index_hash_p == head);
- if (ce->e_bdev == bdev && ce->e_index.o_key == key) {
- /*
- * Prevent a free from removing the entry.
- */
- atomic_inc(&ce->e_refcnt);
- hlist_bl_unlock(head);
- __spin_lock_mb_cache_entry(ce);
- atomic_dec(&ce->e_refcnt);
- ce->e_used++;
- /* Incrementing before holding the lock gives readers
- priority over writers. */
- if (ce->e_used >= MB_CACHE_WRITER) {
- DEFINE_WAIT(wait);
-
- while (ce->e_used >= MB_CACHE_WRITER) {
- ce->e_queued++;
- prepare_to_wait(&mb_cache_queue, &wait,
- TASK_UNINTERRUPTIBLE);
- __spin_unlock_mb_cache_entry(ce);
- schedule();
- __spin_lock_mb_cache_entry(ce);
- ce->e_queued--;
- }
- finish_wait(&mb_cache_queue, &wait);
- }
- __spin_unlock_mb_cache_entry(ce);
- if (!list_empty(&ce->e_lru_list)) {
- spin_lock(&mb_cache_spinlock);
- list_del_init(&ce->e_lru_list);
- spin_unlock(&mb_cache_spinlock);
- }
- if (!__mb_cache_entry_is_block_hashed(ce)) {
- __mb_cache_entry_release(ce);
- return ERR_PTR(-EAGAIN);
- }
- return ce;
- }
- l = l->next;
- }
- hlist_bl_unlock(head);
+ return cache;
+
+err_out:
+ module_put(THIS_MODULE);
return NULL;
}
-
+EXPORT_SYMBOL(mb_cache_create);
/*
- * mb_cache_entry_find_first()
- *
- * Find the first cache entry on a given device with a certain key in
- * an additional index. Additional matches can be found with
- * mb_cache_entry_find_next(). Returns NULL if no match was found. The
- * returned cache entry is locked for shared access ("multiple readers").
+ * mb_cache_destroy - destroy cache
+ * @cache: the cache to destroy
*
- * @cache: the cache to search
- * @bdev: the device the cache entry should belong to
- * @key: the key in the index
+ * Free all entries in cache and cache itself. Caller must make sure nobody
+ * (except shrinker) can reach @cache when calling this.
*/
-struct mb_cache_entry *
-mb_cache_entry_find_first(struct mb_cache *cache, struct block_device *bdev,
- unsigned int key)
+void mb_cache_destroy(struct mb_cache *cache)
{
- unsigned int bucket = hash_long(key, cache->c_bucket_bits);
- struct hlist_bl_node *l;
- struct mb_cache_entry *ce = NULL;
- struct hlist_bl_head *index_hash_p;
-
- index_hash_p = &cache->c_index_hash[bucket];
- hlist_bl_lock(index_hash_p);
- if (!hlist_bl_empty(index_hash_p)) {
- l = hlist_bl_first(index_hash_p);
- ce = __mb_cache_entry_find(l, index_hash_p, bdev, key);
- } else
- hlist_bl_unlock(index_hash_p);
- return ce;
-}
+ struct mb_cache_entry *entry, *next;
+ unregister_shrinker(&cache->c_shrink);
-/*
- * mb_cache_entry_find_next()
- *
- * Find the next cache entry on a given device with a certain key in an
- * additional index. Returns NULL if no match could be found. The previous
- * entry is atomatically released, so that mb_cache_entry_find_next() can
- * be called like this:
- *
- * entry = mb_cache_entry_find_first();
- * while (entry) {
- * ...
- * entry = mb_cache_entry_find_next(entry, ...);
- * }
- *
- * @prev: The previous match
- * @bdev: the device the cache entry should belong to
- * @key: the key in the index
- */
-struct mb_cache_entry *
-mb_cache_entry_find_next(struct mb_cache_entry *prev,
- struct block_device *bdev, unsigned int key)
-{
- struct mb_cache *cache = prev->e_cache;
- unsigned int bucket = hash_long(key, cache->c_bucket_bits);
- struct hlist_bl_node *l;
- struct mb_cache_entry *ce;
- struct hlist_bl_head *index_hash_p;
-
- index_hash_p = &cache->c_index_hash[bucket];
- mb_assert(prev->e_index_hash_p == index_hash_p);
- hlist_bl_lock(index_hash_p);
- mb_assert(!hlist_bl_empty(index_hash_p));
- l = prev->e_index.o_list.next;
- ce = __mb_cache_entry_find(l, index_hash_p, bdev, key);
- __mb_cache_entry_release(prev);
- return ce;
+ /*
+ * We don't bother with any locking. Cache must not be used at this
+ * point.
+ */
+ list_for_each_entry_safe(entry, next, &cache->c_list, e_list) {
+ if (!hlist_bl_unhashed(&entry->e_hash_list)) {
+ hlist_bl_del_init(&entry->e_hash_list);
+ atomic_dec(&entry->e_refcnt);
+ } else
+ WARN_ON(1);
+ list_del(&entry->e_list);
+ WARN_ON(atomic_read(&entry->e_refcnt) != 1);
+ mb_cache_entry_put(cache, entry);
+ }
+ kfree(cache->c_hash);
+ kfree(cache);
+ module_put(THIS_MODULE);
}
+EXPORT_SYMBOL(mb_cache_destroy);
-#endif /* !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) */
-
-static int __init init_mbcache(void)
+static int __init mbcache_init(void)
{
- register_shrinker(&mb_cache_shrinker);
+ mb_entry_cache = kmem_cache_create("mbcache",
+ sizeof(struct mb_cache_entry), 0,
+ SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL);
+ BUG_ON(!mb_entry_cache);
return 0;
}
-static void __exit exit_mbcache(void)
+static void __exit mbcache_exit(void)
{
- unregister_shrinker(&mb_cache_shrinker);
+ kmem_cache_destroy(mb_entry_cache);
}
-module_init(init_mbcache)
-module_exit(exit_mbcache)
+module_init(mbcache_init)
+module_exit(mbcache_exit)
+MODULE_AUTHOR("Jan Kara <jack@suse.cz>");
+MODULE_DESCRIPTION("Meta block cache (for extended attributes)");
+MODULE_LICENSE("GPL");
diff --git a/fs/mpage.c b/fs/mpage.c
index 1480d3a18037..6bd9fd90964e 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -24,6 +24,7 @@
#include <linux/highmem.h>
#include <linux/prefetch.h>
#include <linux/mpage.h>
+#include <linux/mm_inline.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
@@ -366,7 +367,7 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages,
map_bh.b_state = 0;
map_bh.b_size = 0;
for (page_idx = 0; page_idx < nr_pages; page_idx++) {
- struct page *page = list_entry(pages->prev, struct page, lru);
+ struct page *page = lru_to_page(pages);
prefetchw(&page->flags);
list_del(&page->lru);
diff --git a/fs/namei.c b/fs/namei.c
index 9c590e0f66e9..1d9ca2d5dff6 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1220,8 +1220,8 @@ static int follow_managed(struct path *path, struct nameidata *nd)
if (need_mntput && path->mnt == mnt)
mntput(path->mnt);
- if (ret == -EISDIR)
- ret = 0;
+ if (ret == -EISDIR || !ret)
+ ret = 1;
if (need_mntput)
nd->flags |= LOOKUP_JUMPED;
if (unlikely(ret < 0))
@@ -1444,40 +1444,26 @@ static int follow_dotdot(struct nameidata *nd)
* This looks up the name in dcache, possibly revalidates the old dentry and
* allocates a new one if not found or not valid. In the need_lookup argument
* returns whether i_op->lookup is necessary.
- *
- * dir->d_inode->i_mutex must be held
*/
-static struct dentry *lookup_dcache(struct qstr *name, struct dentry *dir,
- unsigned int flags, bool *need_lookup)
+static struct dentry *lookup_dcache(const struct qstr *name,
+ struct dentry *dir,
+ unsigned int flags)
{
struct dentry *dentry;
int error;
- *need_lookup = false;
dentry = d_lookup(dir, name);
if (dentry) {
if (dentry->d_flags & DCACHE_OP_REVALIDATE) {
error = d_revalidate(dentry, flags);
if (unlikely(error <= 0)) {
- if (error < 0) {
- dput(dentry);
- return ERR_PTR(error);
- } else {
+ if (!error)
d_invalidate(dentry);
- dput(dentry);
- dentry = NULL;
- }
+ dput(dentry);
+ return ERR_PTR(error);
}
}
}
-
- if (!dentry) {
- dentry = d_alloc(dir, name);
- if (unlikely(!dentry))
- return ERR_PTR(-ENOMEM);
-
- *need_lookup = true;
- }
return dentry;
}
@@ -1506,45 +1492,44 @@ static struct dentry *lookup_real(struct inode *dir, struct dentry *dentry,
return dentry;
}
-static struct dentry *__lookup_hash(struct qstr *name,
+static struct dentry *__lookup_hash(const struct qstr *name,
struct dentry *base, unsigned int flags)
{
- bool need_lookup;
- struct dentry *dentry;
+ struct dentry *dentry = lookup_dcache(name, base, flags);
- dentry = lookup_dcache(name, base, flags, &need_lookup);
- if (!need_lookup)
+ if (dentry)
return dentry;
+ dentry = d_alloc(base, name);
+ if (unlikely(!dentry))
+ return ERR_PTR(-ENOMEM);
+
return lookup_real(base->d_inode, dentry, flags);
}
-/*
- * It's more convoluted than I'd like it to be, but... it's still fairly
- * small and for now I'd prefer to have fast path as straight as possible.
- * It _is_ time-critical.
- */
static int lookup_fast(struct nameidata *nd,
struct path *path, struct inode **inode,
unsigned *seqp)
{
struct vfsmount *mnt = nd->path.mnt;
struct dentry *dentry, *parent = nd->path.dentry;
- int need_reval = 1;
int status = 1;
int err;
/*
* Rename seqlock is not required here because in the off chance
- * of a false negative due to a concurrent rename, we're going to
- * do the non-racy lookup, below.
+ * of a false negative due to a concurrent rename, the caller is
+ * going to fall back to non-racy lookup.
*/
if (nd->flags & LOOKUP_RCU) {
unsigned seq;
bool negative;
dentry = __d_lookup_rcu(parent, &nd->last, &seq);
- if (!dentry)
- goto unlazy;
+ if (unlikely(!dentry)) {
+ if (unlazy_walk(nd, NULL, 0))
+ return -ECHILD;
+ return 0;
+ }
/*
* This sequence count validates that the inode matches
@@ -1552,7 +1537,7 @@ static int lookup_fast(struct nameidata *nd,
*/
*inode = d_backing_inode(dentry);
negative = d_is_negative(dentry);
- if (read_seqcount_retry(&dentry->d_seq, seq))
+ if (unlikely(read_seqcount_retry(&dentry->d_seq, seq)))
return -ECHILD;
/*
@@ -1562,81 +1547,89 @@ static int lookup_fast(struct nameidata *nd,
* The memory barrier in read_seqcount_begin of child is
* enough, we can use __read_seqcount_retry here.
*/
- if (__read_seqcount_retry(&parent->d_seq, nd->seq))
+ if (unlikely(__read_seqcount_retry(&parent->d_seq, nd->seq)))
return -ECHILD;
*seqp = seq;
- if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) {
+ if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE))
status = d_revalidate(dentry, nd->flags);
- if (unlikely(status <= 0)) {
- if (status != -ECHILD)
- need_reval = 0;
- goto unlazy;
- }
+ if (unlikely(status <= 0)) {
+ if (unlazy_walk(nd, dentry, seq))
+ return -ECHILD;
+ if (status == -ECHILD)
+ status = d_revalidate(dentry, nd->flags);
+ } else {
+ /*
+ * Note: do negative dentry check after revalidation in
+ * case that drops it.
+ */
+ if (unlikely(negative))
+ return -ENOENT;
+ path->mnt = mnt;
+ path->dentry = dentry;
+ if (likely(__follow_mount_rcu(nd, path, inode, seqp)))
+ return 1;
+ if (unlazy_walk(nd, dentry, seq))
+ return -ECHILD;
}
- /*
- * Note: do negative dentry check after revalidation in
- * case that drops it.
- */
- if (negative)
- return -ENOENT;
- path->mnt = mnt;
- path->dentry = dentry;
- if (likely(__follow_mount_rcu(nd, path, inode, seqp)))
- return 0;
-unlazy:
- if (unlazy_walk(nd, dentry, seq))
- return -ECHILD;
} else {
dentry = __d_lookup(parent, &nd->last);
+ if (unlikely(!dentry))
+ return 0;
+ if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE))
+ status = d_revalidate(dentry, nd->flags);
}
-
- if (unlikely(!dentry))
- goto need_lookup;
-
- if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval)
- status = d_revalidate(dentry, nd->flags);
if (unlikely(status <= 0)) {
- if (status < 0) {
- dput(dentry);
- return status;
- }
- d_invalidate(dentry);
+ if (!status)
+ d_invalidate(dentry);
dput(dentry);
- goto need_lookup;
+ return status;
}
-
if (unlikely(d_is_negative(dentry))) {
dput(dentry);
return -ENOENT;
}
+
path->mnt = mnt;
path->dentry = dentry;
err = follow_managed(path, nd);
- if (likely(!err))
+ if (likely(err > 0))
*inode = d_backing_inode(path->dentry);
return err;
-
-need_lookup:
- return 1;
}
/* Fast lookup failed, do it the slow way */
-static int lookup_slow(struct nameidata *nd, struct path *path)
+static struct dentry *lookup_slow(const struct qstr *name,
+ struct dentry *dir,
+ unsigned int flags)
{
- struct dentry *dentry, *parent;
-
- parent = nd->path.dentry;
- BUG_ON(nd->inode != parent->d_inode);
-
- inode_lock(parent->d_inode);
- dentry = __lookup_hash(&nd->last, parent, nd->flags);
- inode_unlock(parent->d_inode);
- if (IS_ERR(dentry))
- return PTR_ERR(dentry);
- path->mnt = nd->path.mnt;
- path->dentry = dentry;
- return follow_managed(path, nd);
+ struct dentry *dentry;
+ inode_lock(dir->d_inode);
+ dentry = d_lookup(dir, name);
+ if (unlikely(dentry)) {
+ if ((dentry->d_flags & DCACHE_OP_REVALIDATE) &&
+ !(flags & LOOKUP_NO_REVAL)) {
+ int error = d_revalidate(dentry, flags);
+ if (unlikely(error <= 0)) {
+ if (!error)
+ d_invalidate(dentry);
+ dput(dentry);
+ dentry = ERR_PTR(error);
+ }
+ }
+ if (dentry) {
+ inode_unlock(dir->d_inode);
+ return dentry;
+ }
+ }
+ dentry = d_alloc(dir, name);
+ if (unlikely(!dentry)) {
+ inode_unlock(dir->d_inode);
+ return ERR_PTR(-ENOMEM);
+ }
+ dentry = lookup_real(dir->d_inode, dentry, flags);
+ inode_unlock(dir->d_inode);
+ return dentry;
}
static inline int may_lookup(struct nameidata *nd)
@@ -1740,18 +1733,25 @@ static int walk_component(struct nameidata *nd, int flags)
return err;
}
err = lookup_fast(nd, &path, &inode, &seq);
- if (unlikely(err)) {
+ if (unlikely(err <= 0)) {
if (err < 0)
return err;
-
- err = lookup_slow(nd, &path);
- if (err < 0)
+ path.dentry = lookup_slow(&nd->last, nd->path.dentry,
+ nd->flags);
+ if (IS_ERR(path.dentry))
+ return PTR_ERR(path.dentry);
+
+ path.mnt = nd->path.mnt;
+ err = follow_managed(&path, nd);
+ if (unlikely(err < 0))
return err;
+ if (unlikely(d_is_negative(path.dentry))) {
+ path_to_nameidata(&path, nd);
+ return -ENOENT;
+ }
+
seq = 0; /* we are already out of RCU mode */
- err = -ENOENT;
- if (d_is_negative(path.dentry))
- goto out_path_put;
inode = d_backing_inode(path.dentry);
}
@@ -1764,10 +1764,6 @@ static int walk_component(struct nameidata *nd, int flags)
nd->inode = inode;
nd->seq = seq;
return 0;
-
-out_path_put:
- path_to_nameidata(&path, nd);
- return err;
}
/*
@@ -2373,21 +2369,9 @@ struct dentry *lookup_one_len_unlocked(const char *name,
if (err)
return ERR_PTR(err);
- /*
- * __d_lookup() is used to try to get a quick answer and avoid the
- * mutex. A false-negative does no harm.
- */
- ret = __d_lookup(base, &this);
- if (ret && unlikely(ret->d_flags & DCACHE_OP_REVALIDATE)) {
- dput(ret);
- ret = NULL;
- }
- if (ret)
- return ret;
-
- inode_lock(base->d_inode);
- ret = __lookup_hash(&this, base, 0);
- inode_unlock(base->d_inode);
+ ret = lookup_dcache(&this, base, 0);
+ if (!ret)
+ ret = lookup_slow(&this, base, 0);
return ret;
}
EXPORT_SYMBOL(lookup_one_len_unlocked);
@@ -2465,31 +2449,21 @@ mountpoint_last(struct nameidata *nd, struct path *path)
if (error)
return error;
dentry = dget(nd->path.dentry);
- goto done;
- }
-
- inode_lock(dir->d_inode);
- dentry = d_lookup(dir, &nd->last);
- if (!dentry) {
- /*
- * No cached dentry. Mounted dentries are pinned in the cache,
- * so that means that this dentry is probably a symlink or the
- * path doesn't actually point to a mounted dentry.
- */
- dentry = d_alloc(dir, &nd->last);
+ } else {
+ dentry = d_lookup(dir, &nd->last);
if (!dentry) {
- inode_unlock(dir->d_inode);
- return -ENOMEM;
- }
- dentry = lookup_real(dir->d_inode, dentry, nd->flags);
- if (IS_ERR(dentry)) {
- inode_unlock(dir->d_inode);
- return PTR_ERR(dentry);
+ /*
+ * No cached dentry. Mounted dentries are pinned in the
+ * cache, so that means that this dentry is probably
+ * a symlink or the path doesn't actually point
+ * to a mounted dentry.
+ */
+ dentry = lookup_slow(&nd->last, dir,
+ nd->flags | LOOKUP_NO_REVAL);
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
}
}
- inode_unlock(dir->d_inode);
-
-done:
if (d_is_negative(dentry)) {
dput(dentry);
return -ENOENT;
@@ -3018,16 +2992,22 @@ static int lookup_open(struct nameidata *nd, struct path *path,
struct inode *dir_inode = dir->d_inode;
struct dentry *dentry;
int error;
- bool need_lookup;
+ bool need_lookup = false;
*opened &= ~FILE_CREATED;
- dentry = lookup_dcache(&nd->last, dir, nd->flags, &need_lookup);
+ dentry = lookup_dcache(&nd->last, dir, nd->flags);
if (IS_ERR(dentry))
return PTR_ERR(dentry);
- /* Cached positive dentry: will open in f_op->open */
- if (!need_lookup && dentry->d_inode)
+ if (!dentry) {
+ dentry = d_alloc(dir, &nd->last);
+ if (unlikely(!dentry))
+ return -ENOMEM;
+ need_lookup = true;
+ } else if (dentry->d_inode) {
+ /* Cached positive dentry: will open in f_op->open */
goto out_no_open;
+ }
if ((nd->flags & LOOKUP_OPEN) && dir_inode->i_op->atomic_open) {
return atomic_open(nd, dentry, path, file, op, got_write,
@@ -3111,13 +3091,14 @@ static int do_last(struct nameidata *nd,
nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
/* we _can_ be in RCU mode here */
error = lookup_fast(nd, &path, &inode, &seq);
- if (likely(!error))
+ if (likely(error > 0))
goto finish_lookup;
if (error < 0)
return error;
BUG_ON(nd->inode != dir->d_inode);
+ BUG_ON(nd->flags & LOOKUP_RCU);
} else {
/* create side of things */
/*
@@ -3172,12 +3153,6 @@ retry_lookup:
}
/*
- * create/update audit record if it already exists.
- */
- if (d_is_positive(path.dentry))
- audit_inode(nd->name, path.dentry, 0);
-
- /*
* If atomic_open() acquired write access it is dropped now due to
* possible mount and symlink following (this might be optimized away if
* necessary...)
@@ -3187,6 +3162,16 @@ retry_lookup:
got_write = false;
}
+ if (unlikely(d_is_negative(path.dentry))) {
+ path_to_nameidata(&path, nd);
+ return -ENOENT;
+ }
+
+ /*
+ * create/update audit record if it already exists.
+ */
+ audit_inode(nd->name, path.dentry, 0);
+
if (unlikely((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT))) {
path_to_nameidata(&path, nd);
return -EEXIST;
@@ -3196,12 +3181,7 @@ retry_lookup:
if (unlikely(error < 0))
return error;
- BUG_ON(nd->flags & LOOKUP_RCU);
seq = 0; /* out of RCU mode, so the value doesn't matter */
- if (unlikely(d_is_negative(path.dentry))) {
- path_to_nameidata(&path, nd);
- return -ENOENT;
- }
inode = d_backing_inode(path.dentry);
finish_lookup:
if (nd->depth)
@@ -3707,31 +3687,6 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
return sys_mkdirat(AT_FDCWD, pathname, mode);
}
-/*
- * The dentry_unhash() helper will try to drop the dentry early: we
- * should have a usage count of 1 if we're the only user of this
- * dentry, and if that is true (possibly after pruning the dcache),
- * then we drop the dentry now.
- *
- * A low-level filesystem can, if it choses, legally
- * do a
- *
- * if (!d_unhashed(dentry))
- * return -EBUSY;
- *
- * if it cannot handle the case of removing a directory
- * that is still in use by something else..
- */
-void dentry_unhash(struct dentry *dentry)
-{
- shrink_dcache_parent(dentry);
- spin_lock(&dentry->d_lock);
- if (dentry->d_lockref.count == 1)
- __d_drop(dentry);
- spin_unlock(&dentry->d_lock);
-}
-EXPORT_SYMBOL(dentry_unhash);
-
int vfs_rmdir(struct inode *dir, struct dentry *dentry)
{
int error = may_delete(dir, dentry, 1);
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 26c2de2de13f..b7f8eaeea5d8 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -633,7 +633,7 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx,
d_rehash(newdent);
} else {
spin_lock(&dentry->d_lock);
- NCP_FINFO(inode)->flags &= ~NCPI_DIR_CACHE;
+ NCP_FINFO(dir)->flags &= ~NCPI_DIR_CACHE;
spin_unlock(&dentry->d_lock);
}
} else {
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index ddd0138f410c..02e4d87d2ed3 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -446,8 +446,8 @@ static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo)
kfree(bl);
}
-static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
- gfp_t gfp_flags)
+static struct pnfs_layout_hdr *__bl_alloc_layout_hdr(struct inode *inode,
+ gfp_t gfp_flags, bool is_scsi_layout)
{
struct pnfs_block_layout *bl;
@@ -460,9 +460,22 @@ static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
bl->bl_ext_ro = RB_ROOT;
spin_lock_init(&bl->bl_ext_lock);
+ bl->bl_scsi_layout = is_scsi_layout;
return &bl->bl_layout;
}
+static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
+ gfp_t gfp_flags)
+{
+ return __bl_alloc_layout_hdr(inode, gfp_flags, false);
+}
+
+static struct pnfs_layout_hdr *sl_alloc_layout_hdr(struct inode *inode,
+ gfp_t gfp_flags)
+{
+ return __bl_alloc_layout_hdr(inode, gfp_flags, true);
+}
+
static void bl_free_lseg(struct pnfs_layout_segment *lseg)
{
dprintk("%s enter\n", __func__);
@@ -743,7 +756,7 @@ bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh)
static bool
is_aligned_req(struct nfs_pageio_descriptor *pgio,
- struct nfs_page *req, unsigned int alignment)
+ struct nfs_page *req, unsigned int alignment, bool is_write)
{
/*
* Always accept buffered writes, higher layers take care of the
@@ -758,7 +771,8 @@ is_aligned_req(struct nfs_pageio_descriptor *pgio,
if (IS_ALIGNED(req->wb_bytes, alignment))
return true;
- if (req_offset(req) + req->wb_bytes == i_size_read(pgio->pg_inode)) {
+ if (is_write &&
+ (req_offset(req) + req->wb_bytes == i_size_read(pgio->pg_inode))) {
/*
* If the write goes up to the inode size, just write
* the full page. Data past the inode size is
@@ -775,7 +789,7 @@ is_aligned_req(struct nfs_pageio_descriptor *pgio,
static void
bl_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
{
- if (!is_aligned_req(pgio, req, SECTOR_SIZE)) {
+ if (!is_aligned_req(pgio, req, SECTOR_SIZE, false)) {
nfs_pageio_reset_read_mds(pgio);
return;
}
@@ -791,7 +805,7 @@ static size_t
bl_pg_test_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
struct nfs_page *req)
{
- if (!is_aligned_req(pgio, req, SECTOR_SIZE))
+ if (!is_aligned_req(pgio, req, SECTOR_SIZE, false))
return 0;
return pnfs_generic_pg_test(pgio, prev, req);
}
@@ -824,7 +838,7 @@ bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
{
u64 wb_size;
- if (!is_aligned_req(pgio, req, PAGE_SIZE)) {
+ if (!is_aligned_req(pgio, req, PAGE_SIZE, true)) {
nfs_pageio_reset_write_mds(pgio);
return;
}
@@ -846,7 +860,7 @@ static size_t
bl_pg_test_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
struct nfs_page *req)
{
- if (!is_aligned_req(pgio, req, PAGE_SIZE))
+ if (!is_aligned_req(pgio, req, PAGE_SIZE, true))
return 0;
return pnfs_generic_pg_test(pgio, prev, req);
}
@@ -888,22 +902,53 @@ static struct pnfs_layoutdriver_type blocklayout_type = {
.sync = pnfs_generic_sync,
};
+static struct pnfs_layoutdriver_type scsilayout_type = {
+ .id = LAYOUT_SCSI,
+ .name = "LAYOUT_SCSI",
+ .owner = THIS_MODULE,
+ .flags = PNFS_LAYOUTRET_ON_SETATTR |
+ PNFS_READ_WHOLE_PAGE,
+ .read_pagelist = bl_read_pagelist,
+ .write_pagelist = bl_write_pagelist,
+ .alloc_layout_hdr = sl_alloc_layout_hdr,
+ .free_layout_hdr = bl_free_layout_hdr,
+ .alloc_lseg = bl_alloc_lseg,
+ .free_lseg = bl_free_lseg,
+ .return_range = bl_return_range,
+ .prepare_layoutcommit = bl_prepare_layoutcommit,
+ .cleanup_layoutcommit = bl_cleanup_layoutcommit,
+ .set_layoutdriver = bl_set_layoutdriver,
+ .alloc_deviceid_node = bl_alloc_deviceid_node,
+ .free_deviceid_node = bl_free_deviceid_node,
+ .pg_read_ops = &bl_pg_read_ops,
+ .pg_write_ops = &bl_pg_write_ops,
+ .sync = pnfs_generic_sync,
+};
+
+
static int __init nfs4blocklayout_init(void)
{
int ret;
dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__);
- ret = pnfs_register_layoutdriver(&blocklayout_type);
+ ret = bl_init_pipefs();
if (ret)
goto out;
- ret = bl_init_pipefs();
+
+ ret = pnfs_register_layoutdriver(&blocklayout_type);
if (ret)
- goto out_unregister;
+ goto out_cleanup_pipe;
+
+ ret = pnfs_register_layoutdriver(&scsilayout_type);
+ if (ret)
+ goto out_unregister_block;
return 0;
-out_unregister:
+out_unregister_block:
pnfs_unregister_layoutdriver(&blocklayout_type);
+out_cleanup_pipe:
+ bl_cleanup_pipefs();
out:
return ret;
}
@@ -913,8 +958,9 @@ static void __exit nfs4blocklayout_exit(void)
dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n",
__func__);
- bl_cleanup_pipefs();
+ pnfs_unregister_layoutdriver(&scsilayout_type);
pnfs_unregister_layoutdriver(&blocklayout_type);
+ bl_cleanup_pipefs();
}
MODULE_ALIAS("nfs-layouttype4-3");
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index c556640dcf3b..bc21205309e0 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -55,7 +55,6 @@ struct pnfs_block_dev;
*/
#define PNFS_BLOCK_UUID_LEN 128
-
struct pnfs_block_volume {
enum pnfs_block_volume_type type;
union {
@@ -82,6 +81,13 @@ struct pnfs_block_volume {
u32 volumes_count;
u32 volumes[PNFS_BLOCK_MAX_DEVICES];
} stripe;
+ struct {
+ enum scsi_code_set code_set;
+ enum scsi_designator_type designator_type;
+ int designator_len;
+ u8 designator[256];
+ u64 pr_key;
+ } scsi;
};
};
@@ -106,6 +112,9 @@ struct pnfs_block_dev {
struct block_device *bdev;
u64 disk_offset;
+ u64 pr_key;
+ bool pr_registered;
+
bool (*map)(struct pnfs_block_dev *dev, u64 offset,
struct pnfs_block_dev_map *map);
};
@@ -131,6 +140,7 @@ struct pnfs_block_layout {
struct rb_root bl_ext_rw;
struct rb_root bl_ext_ro;
spinlock_t bl_ext_lock; /* Protects list manipulation */
+ bool bl_scsi_layout;
};
static inline struct pnfs_block_layout *
@@ -182,6 +192,6 @@ void ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status);
dev_t bl_resolve_deviceid(struct nfs_server *server,
struct pnfs_block_volume *b, gfp_t gfp_mask);
int __init bl_init_pipefs(void);
-void __exit bl_cleanup_pipefs(void);
+void bl_cleanup_pipefs(void);
#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
index a861bbdfe577..e5b89675263e 100644
--- a/fs/nfs/blocklayout/dev.c
+++ b/fs/nfs/blocklayout/dev.c
@@ -1,11 +1,12 @@
/*
- * Copyright (c) 2014 Christoph Hellwig.
+ * Copyright (c) 2014-2016 Christoph Hellwig.
*/
#include <linux/sunrpc/svc.h>
#include <linux/blkdev.h>
#include <linux/nfs4.h>
#include <linux/nfs_fs.h>
#include <linux/nfs_xdr.h>
+#include <linux/pr.h>
#include "blocklayout.h"
@@ -21,6 +22,17 @@ bl_free_device(struct pnfs_block_dev *dev)
bl_free_device(&dev->children[i]);
kfree(dev->children);
} else {
+ if (dev->pr_registered) {
+ const struct pr_ops *ops =
+ dev->bdev->bd_disk->fops->pr_ops;
+ int error;
+
+ error = ops->pr_register(dev->bdev, dev->pr_key, 0,
+ false);
+ if (error)
+ pr_err("failed to unregister PR key.\n");
+ }
+
if (dev->bdev)
blkdev_put(dev->bdev, FMODE_READ | FMODE_WRITE);
}
@@ -113,6 +125,24 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
for (i = 0; i < b->stripe.volumes_count; i++)
b->stripe.volumes[i] = be32_to_cpup(p++);
break;
+ case PNFS_BLOCK_VOLUME_SCSI:
+ p = xdr_inline_decode(xdr, 4 + 4 + 4);
+ if (!p)
+ return -EIO;
+ b->scsi.code_set = be32_to_cpup(p++);
+ b->scsi.designator_type = be32_to_cpup(p++);
+ b->scsi.designator_len = be32_to_cpup(p++);
+ p = xdr_inline_decode(xdr, b->scsi.designator_len);
+ if (!p)
+ return -EIO;
+ if (b->scsi.designator_len > 256)
+ return -EIO;
+ memcpy(&b->scsi.designator, p, b->scsi.designator_len);
+ p = xdr_inline_decode(xdr, 8);
+ if (!p)
+ return -EIO;
+ p = xdr_decode_hyper(p, &b->scsi.pr_key);
+ break;
default:
dprintk("unknown volume type!\n");
return -EIO;
@@ -216,6 +246,116 @@ bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d,
return 0;
}
+static bool
+bl_validate_designator(struct pnfs_block_volume *v)
+{
+ switch (v->scsi.designator_type) {
+ case PS_DESIGNATOR_EUI64:
+ if (v->scsi.code_set != PS_CODE_SET_BINARY)
+ return false;
+
+ if (v->scsi.designator_len != 8 &&
+ v->scsi.designator_len != 10 &&
+ v->scsi.designator_len != 16)
+ return false;
+
+ return true;
+ case PS_DESIGNATOR_NAA:
+ if (v->scsi.code_set != PS_CODE_SET_BINARY)
+ return false;
+
+ if (v->scsi.designator_len != 8 &&
+ v->scsi.designator_len != 16)
+ return false;
+
+ return true;
+ case PS_DESIGNATOR_T10:
+ case PS_DESIGNATOR_NAME:
+ pr_err("pNFS: unsupported designator "
+ "(code set %d, type %d, len %d.\n",
+ v->scsi.code_set,
+ v->scsi.designator_type,
+ v->scsi.designator_len);
+ return false;
+ default:
+ pr_err("pNFS: invalid designator "
+ "(code set %d, type %d, len %d.\n",
+ v->scsi.code_set,
+ v->scsi.designator_type,
+ v->scsi.designator_len);
+ return false;
+ }
+}
+
+static int
+bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
+ struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
+{
+ struct pnfs_block_volume *v = &volumes[idx];
+ const struct pr_ops *ops;
+ const char *devname;
+ int error;
+
+ if (!bl_validate_designator(v))
+ return -EINVAL;
+
+ switch (v->scsi.designator_len) {
+ case 8:
+ devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%8phN",
+ v->scsi.designator);
+ break;
+ case 12:
+ devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%12phN",
+ v->scsi.designator);
+ break;
+ case 16:
+ devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%16phN",
+ v->scsi.designator);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ d->bdev = blkdev_get_by_path(devname, FMODE_READ, NULL);
+ if (IS_ERR(d->bdev)) {
+ pr_warn("pNFS: failed to open device %s (%ld)\n",
+ devname, PTR_ERR(d->bdev));
+ kfree(devname);
+ return PTR_ERR(d->bdev);
+ }
+
+ kfree(devname);
+
+ d->len = i_size_read(d->bdev->bd_inode);
+ d->map = bl_map_simple;
+ d->pr_key = v->scsi.pr_key;
+
+ pr_info("pNFS: using block device %s (reservation key 0x%llx)\n",
+ d->bdev->bd_disk->disk_name, d->pr_key);
+
+ ops = d->bdev->bd_disk->fops->pr_ops;
+ if (!ops) {
+ pr_err("pNFS: block device %s does not support reservations.",
+ d->bdev->bd_disk->disk_name);
+ error = -EINVAL;
+ goto out_blkdev_put;
+ }
+
+ error = ops->pr_register(d->bdev, 0, d->pr_key, true);
+ if (error) {
+ pr_err("pNFS: failed to register key for block device %s.",
+ d->bdev->bd_disk->disk_name);
+ goto out_blkdev_put;
+ }
+
+ d->pr_registered = true;
+ return 0;
+
+out_blkdev_put:
+ blkdev_put(d->bdev, FMODE_READ);
+ return error;
+}
+
static int
bl_parse_slice(struct nfs_server *server, struct pnfs_block_dev *d,
struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
@@ -303,6 +443,8 @@ bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
return bl_parse_concat(server, d, volumes, idx, gfp_mask);
case PNFS_BLOCK_VOLUME_STRIPE:
return bl_parse_stripe(server, d, volumes, idx, gfp_mask);
+ case PNFS_BLOCK_VOLUME_SCSI:
+ return bl_parse_scsi(server, d, volumes, idx, gfp_mask);
default:
dprintk("unsupported volume type: %d\n", volumes[idx].type);
return -EIO;
diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c
index 35ab51c04814..720b3ff55fa9 100644
--- a/fs/nfs/blocklayout/extent_tree.c
+++ b/fs/nfs/blocklayout/extent_tree.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2014 Christoph Hellwig.
+ * Copyright (c) 2014-2016 Christoph Hellwig.
*/
#include <linux/vmalloc.h>
@@ -462,10 +462,12 @@ out:
return err;
}
-static size_t ext_tree_layoutupdate_size(size_t count)
+static size_t ext_tree_layoutupdate_size(struct pnfs_block_layout *bl, size_t count)
{
- return sizeof(__be32) /* number of entries */ +
- PNFS_BLOCK_EXTENT_SIZE * count;
+ if (bl->bl_scsi_layout)
+ return sizeof(__be32) + PNFS_SCSI_RANGE_SIZE * count;
+ else
+ return sizeof(__be32) + PNFS_BLOCK_EXTENT_SIZE * count;
}
static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
@@ -483,6 +485,23 @@ static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
}
}
+static __be32 *encode_block_extent(struct pnfs_block_extent *be, __be32 *p)
+{
+ p = xdr_encode_opaque_fixed(p, be->be_device->deviceid.data,
+ NFS4_DEVICEID4_SIZE);
+ p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
+ p = xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
+ p = xdr_encode_hyper(p, 0LL);
+ *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
+ return p;
+}
+
+static __be32 *encode_scsi_range(struct pnfs_block_extent *be, __be32 *p)
+{
+ p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
+ return xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
+}
+
static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
size_t buffer_size, size_t *count)
{
@@ -496,19 +515,16 @@ static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
continue;
(*count)++;
- if (ext_tree_layoutupdate_size(*count) > buffer_size) {
+ if (ext_tree_layoutupdate_size(bl, *count) > buffer_size) {
/* keep counting.. */
ret = -ENOSPC;
continue;
}
- p = xdr_encode_opaque_fixed(p, be->be_device->deviceid.data,
- NFS4_DEVICEID4_SIZE);
- p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
- p = xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
- p = xdr_encode_hyper(p, 0LL);
- *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
-
+ if (bl->bl_scsi_layout)
+ p = encode_scsi_range(be, p);
+ else
+ p = encode_block_extent(be, p);
be->be_tag = EXTENT_COMMITTING;
}
spin_unlock(&bl->bl_ext_lock);
@@ -537,7 +553,7 @@ retry:
if (unlikely(ret)) {
ext_tree_free_commitdata(arg, buffer_size);
- buffer_size = ext_tree_layoutupdate_size(count);
+ buffer_size = ext_tree_layoutupdate_size(bl, count);
count = 0;
arg->layoutupdate_pages =
@@ -556,7 +572,7 @@ retry:
}
*start_p = cpu_to_be32(count);
- arg->layoutupdate_len = ext_tree_layoutupdate_size(count);
+ arg->layoutupdate_len = ext_tree_layoutupdate_size(bl, count);
if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) {
void *p = start_p, *end = p + arg->layoutupdate_len;
diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c
index dbe5839cdeba..9fb067a6f7e0 100644
--- a/fs/nfs/blocklayout/rpc_pipefs.c
+++ b/fs/nfs/blocklayout/rpc_pipefs.c
@@ -281,7 +281,7 @@ out:
return ret;
}
-void __exit bl_cleanup_pipefs(void)
+void bl_cleanup_pipefs(void)
{
rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
unregister_pernet_subsys(&nfs4blocklayout_net_ops);
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index ff8195bd75ea..5fe1cecbf9f0 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -37,10 +37,11 @@ enum nfs4_callback_opnum {
OP_CB_ILLEGAL = 10044,
};
+struct nfs4_slot;
struct cb_process_state {
__be32 drc_status;
struct nfs_client *clp;
- u32 slotid;
+ struct nfs4_slot *slot;
u32 minorversion;
struct net *net;
};
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index f0939d097406..618ced381a14 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -354,47 +354,38 @@ out:
* a single outstanding callback request at a time.
*/
static __be32
-validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args)
+validate_seqid(const struct nfs4_slot_table *tbl, const struct nfs4_slot *slot,
+ const struct cb_sequenceargs * args)
{
- struct nfs4_slot *slot;
-
- dprintk("%s enter. slotid %u seqid %u\n",
- __func__, args->csa_slotid, args->csa_sequenceid);
+ dprintk("%s enter. slotid %u seqid %u, slot table seqid: %u\n",
+ __func__, args->csa_slotid, args->csa_sequenceid, slot->seq_nr);
- if (args->csa_slotid >= NFS41_BC_MAX_CALLBACKS)
+ if (args->csa_slotid > tbl->server_highest_slotid)
return htonl(NFS4ERR_BADSLOT);
- slot = tbl->slots + args->csa_slotid;
- dprintk("%s slot table seqid: %u\n", __func__, slot->seq_nr);
-
- /* Normal */
- if (likely(args->csa_sequenceid == slot->seq_nr + 1))
- goto out_ok;
-
/* Replay */
if (args->csa_sequenceid == slot->seq_nr) {
dprintk("%s seqid %u is a replay\n",
__func__, args->csa_sequenceid);
+ if (nfs4_test_locked_slot(tbl, slot->slot_nr))
+ return htonl(NFS4ERR_DELAY);
/* Signal process_op to set this error on next op */
if (args->csa_cachethis == 0)
return htonl(NFS4ERR_RETRY_UNCACHED_REP);
- /* The ca_maxresponsesize_cached is 0 with no DRC */
- else if (args->csa_cachethis == 1)
- return htonl(NFS4ERR_REP_TOO_BIG_TO_CACHE);
+ /* Liar! We never allowed you to set csa_cachethis != 0 */
+ return htonl(NFS4ERR_SEQ_FALSE_RETRY);
}
/* Wraparound */
- if (args->csa_sequenceid == 1 && (slot->seq_nr + 1) == 0) {
- slot->seq_nr = 1;
- goto out_ok;
- }
+ if (unlikely(slot->seq_nr == 0xFFFFFFFFU)) {
+ if (args->csa_sequenceid == 1)
+ return htonl(NFS4_OK);
+ } else if (likely(args->csa_sequenceid == slot->seq_nr + 1))
+ return htonl(NFS4_OK);
/* Misordered request */
return htonl(NFS4ERR_SEQ_MISORDERED);
-out_ok:
- tbl->highest_used_slotid = args->csa_slotid;
- return htonl(NFS4_OK);
}
/*
@@ -473,6 +464,12 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
tbl = &clp->cl_session->bc_slot_table;
slot = tbl->slots + args->csa_slotid;
+ /* Set up res before grabbing the spinlock */
+ memcpy(&res->csr_sessionid, &args->csa_sessionid,
+ sizeof(res->csr_sessionid));
+ res->csr_sequenceid = args->csa_sequenceid;
+ res->csr_slotid = args->csa_slotid;
+
spin_lock(&tbl->slot_tbl_lock);
/* state manager is resetting the session */
if (test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state)) {
@@ -485,18 +482,26 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
goto out_unlock;
}
- memcpy(&res->csr_sessionid, &args->csa_sessionid,
- sizeof(res->csr_sessionid));
- res->csr_sequenceid = args->csa_sequenceid;
- res->csr_slotid = args->csa_slotid;
- res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
- res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
+ status = htonl(NFS4ERR_BADSLOT);
+ slot = nfs4_lookup_slot(tbl, args->csa_slotid);
+ if (IS_ERR(slot))
+ goto out_unlock;
+
+ res->csr_highestslotid = tbl->server_highest_slotid;
+ res->csr_target_highestslotid = tbl->target_highest_slotid;
- status = validate_seqid(tbl, args);
+ status = validate_seqid(tbl, slot, args);
if (status)
goto out_unlock;
+ if (!nfs4_try_to_lock_slot(tbl, slot)) {
+ status = htonl(NFS4ERR_DELAY);
+ goto out_unlock;
+ }
+ cps->slot = slot;
- cps->slotid = args->csa_slotid;
+ /* The ca_maxresponsesize_cached is 0 with no DRC */
+ if (args->csa_cachethis != 0)
+ return htonl(NFS4ERR_REP_TOO_BIG_TO_CACHE);
/*
* Check for pending referring calls. If a match is found, a
@@ -513,7 +518,7 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
* If CB_SEQUENCE returns an error, then the state of the slot
* (sequence ID, cached reply) MUST NOT change.
*/
- slot->seq_nr++;
+ slot->seq_nr = args->csa_sequenceid;
out_unlock:
spin_unlock(&tbl->slot_tbl_lock);
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 646cdac73488..976c90608e56 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -752,7 +752,8 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
return htonl(NFS_OK);
}
-static void nfs4_callback_free_slot(struct nfs4_session *session)
+static void nfs4_callback_free_slot(struct nfs4_session *session,
+ struct nfs4_slot *slot)
{
struct nfs4_slot_table *tbl = &session->bc_slot_table;
@@ -761,15 +762,17 @@ static void nfs4_callback_free_slot(struct nfs4_session *session)
* Let the state manager know callback processing done.
* A single slot, so highest used slotid is either 0 or -1
*/
- tbl->highest_used_slotid = NFS4_NO_SLOT;
+ nfs4_free_slot(tbl, slot);
nfs4_slot_tbl_drain_complete(tbl);
spin_unlock(&tbl->slot_tbl_lock);
}
static void nfs4_cb_free_slot(struct cb_process_state *cps)
{
- if (cps->slotid != NFS4_NO_SLOT)
- nfs4_callback_free_slot(cps->clp->cl_session);
+ if (cps->slot) {
+ nfs4_callback_free_slot(cps->clp->cl_session, cps->slot);
+ cps->slot = NULL;
+ }
}
#else /* CONFIG_NFS_V4_1 */
@@ -893,7 +896,6 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
struct cb_process_state cps = {
.drc_status = 0,
.clp = NULL,
- .slotid = NFS4_NO_SLOT,
.net = SVC_NET(rqstp),
};
unsigned int nops = 0;
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 9cce67043f92..4bfa7d8bcade 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1360,19 +1360,15 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in
dfprintk(VFS, "NFS: lookup(%pd2)\n", dentry);
nfs_inc_stats(dir, NFSIOS_VFSLOOKUP);
- res = ERR_PTR(-ENAMETOOLONG);
- if (dentry->d_name.len > NFS_SERVER(dir)->namelen)
- goto out;
+ if (unlikely(dentry->d_name.len > NFS_SERVER(dir)->namelen))
+ return ERR_PTR(-ENAMETOOLONG);
/*
* If we're doing an exclusive create, optimize away the lookup
* but don't hash the dentry.
*/
- if (nfs_is_exclusive_create(dir, flags)) {
- d_instantiate(dentry, NULL);
- res = NULL;
- goto out;
- }
+ if (nfs_is_exclusive_create(dir, flags))
+ return NULL;
res = ERR_PTR(-ENOMEM);
fhandle = nfs_alloc_fhandle();
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 748bb813b8ec..89bf093d342a 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -233,7 +233,7 @@ EXPORT_SYMBOL_GPL(nfs_file_mmap);
* nfs_file_write() that a write error occurred, and hence cause it to
* fall back to doing a synchronous write.
*/
-int
+static int
nfs_file_fsync_commit(struct file *file, loff_t start, loff_t end, int datasync)
{
struct nfs_open_context *ctx = nfs_file_open_context(file);
@@ -263,9 +263,8 @@ nfs_file_fsync_commit(struct file *file, loff_t start, loff_t end, int datasync)
out:
return ret;
}
-EXPORT_SYMBOL_GPL(nfs_file_fsync_commit);
-static int
+int
nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
int ret;
@@ -273,13 +272,15 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
trace_nfs_fsync_enter(inode);
- nfs_inode_dio_wait(inode);
+ inode_dio_wait(inode);
do {
ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
if (ret != 0)
break;
inode_lock(inode);
ret = nfs_file_fsync_commit(file, start, end, datasync);
+ if (!ret)
+ ret = pnfs_sync_inode(inode, !!datasync);
inode_unlock(inode);
/*
* If nfs_file_fsync_commit detected a server reboot, then
@@ -293,6 +294,7 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
trace_nfs_fsync_exit(inode, ret);
return ret;
}
+EXPORT_SYMBOL_GPL(nfs_file_fsync);
/*
* Decide whether a read/modify/write cycle may be more efficient
@@ -368,7 +370,7 @@ start:
/*
* Wait for O_DIRECT to complete
*/
- nfs_inode_dio_wait(mapping->host);
+ inode_dio_wait(mapping->host);
page = grab_cache_page_write_begin(mapping, index, flags);
if (!page)
diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
index eb370460ce20..add0e5a70bd6 100644
--- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -418,6 +418,8 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
pnfs_error_mark_layout_for_return(ino, lseg);
} else
pnfs_error_mark_layout_for_return(ino, lseg);
+ ds = NULL;
+ goto out;
}
out_update_creds:
if (ff_layout_update_mirror_cred(mirror, ds))
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 86faecf8f328..33d18c411905 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -141,7 +141,7 @@ void nfs_evict_inode(struct inode *inode)
int nfs_sync_inode(struct inode *inode)
{
- nfs_inode_dio_wait(inode);
+ inode_dio_wait(inode);
return nfs_wb_all(inode);
}
EXPORT_SYMBOL_GPL(nfs_sync_inode);
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 9a547aa3ec8e..565f8135ae1f 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -358,7 +358,7 @@ int nfs_mknod(struct inode *, struct dentry *, umode_t, dev_t);
int nfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *);
/* file.c */
-int nfs_file_fsync_commit(struct file *, loff_t, loff_t, int);
+int nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync);
loff_t nfs_file_llseek(struct file *, loff_t, int);
ssize_t nfs_file_read(struct kiocb *, struct iov_iter *);
ssize_t nfs_file_splice_read(struct file *, loff_t *, struct pipe_inode_info *,
@@ -515,10 +515,6 @@ extern int nfs_sillyrename(struct inode *dir, struct dentry *dentry);
/* direct.c */
void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo,
struct nfs_direct_req *dreq);
-static inline void nfs_inode_dio_wait(struct inode *inode)
-{
- inode_dio_wait(inode);
-}
extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq);
/* nfs4proc.c */
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 57ca1c8039c1..22c35abbee9d 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -128,37 +128,6 @@ nfs4_file_flush(struct file *file, fl_owner_t id)
return vfs_fsync(file, 0);
}
-static int
-nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
-{
- int ret;
- struct inode *inode = file_inode(file);
-
- trace_nfs_fsync_enter(inode);
-
- nfs_inode_dio_wait(inode);
- do {
- ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
- if (ret != 0)
- break;
- inode_lock(inode);
- ret = nfs_file_fsync_commit(file, start, end, datasync);
- if (!ret)
- ret = pnfs_sync_inode(inode, !!datasync);
- inode_unlock(inode);
- /*
- * If nfs_file_fsync_commit detected a server reboot, then
- * resend all dirty pages that might have been covered by
- * the NFS_CONTEXT_RESEND_WRITES flag
- */
- start = 0;
- end = LLONG_MAX;
- } while (ret == -EAGAIN);
-
- trace_nfs_fsync_exit(inode, ret);
- return ret;
-}
-
#ifdef CONFIG_NFS_V4_2
static loff_t nfs4_file_llseek(struct file *filep, loff_t offset, int whence)
{
@@ -266,7 +235,7 @@ const struct file_operations nfs4_file_operations = {
.open = nfs4_file_open,
.flush = nfs4_file_flush,
.release = nfs_file_release,
- .fsync = nfs4_file_fsync,
+ .fsync = nfs_file_fsync,
.lock = nfs_lock,
.flock = nfs_flock,
.splice_read = nfs_file_splice_read,
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 14881594dd07..327b8c34d360 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2461,14 +2461,15 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
dentry = opendata->dentry;
if (d_really_is_negative(dentry)) {
- /* FIXME: Is this d_drop() ever needed? */
+ struct dentry *alias;
d_drop(dentry);
- dentry = d_add_unique(dentry, igrab(state->inode));
- if (dentry == NULL) {
- dentry = opendata->dentry;
- } else {
+ alias = d_exact_alias(dentry, state->inode);
+ if (!alias)
+ alias = d_splice_alias(igrab(state->inode), dentry);
+ /* d_splice_alias() can't fail here - it's a non-directory */
+ if (alias) {
dput(ctx->dentry);
- ctx->dentry = dentry;
+ ctx->dentry = dentry = alias;
}
nfs_set_verifier(dentry,
nfs_save_change_attribute(d_inode(opendata->dir)));
@@ -6782,13 +6783,26 @@ nfs41_same_server_scope(struct nfs41_server_scope *a,
return false;
}
+static void
+nfs4_bind_one_conn_to_session_done(struct rpc_task *task, void *calldata)
+{
+}
+
+static const struct rpc_call_ops nfs4_bind_one_conn_to_session_ops = {
+ .rpc_call_done = &nfs4_bind_one_conn_to_session_done,
+};
+
/*
- * nfs4_proc_bind_conn_to_session()
+ * nfs4_proc_bind_one_conn_to_session()
*
* The 4.1 client currently uses the same TCP connection for the
* fore and backchannel.
*/
-int nfs4_proc_bind_conn_to_session(struct nfs_client *clp, struct rpc_cred *cred)
+static
+int nfs4_proc_bind_one_conn_to_session(struct rpc_clnt *clnt,
+ struct rpc_xprt *xprt,
+ struct nfs_client *clp,
+ struct rpc_cred *cred)
{
int status;
struct nfs41_bind_conn_to_session_args args = {
@@ -6803,6 +6817,14 @@ int nfs4_proc_bind_conn_to_session(struct nfs_client *clp, struct rpc_cred *cred
.rpc_resp = &res,
.rpc_cred = cred,
};
+ struct rpc_task_setup task_setup_data = {
+ .rpc_client = clnt,
+ .rpc_xprt = xprt,
+ .callback_ops = &nfs4_bind_one_conn_to_session_ops,
+ .rpc_message = &msg,
+ .flags = RPC_TASK_TIMEOUT,
+ };
+ struct rpc_task *task;
dprintk("--> %s\n", __func__);
@@ -6810,7 +6832,16 @@ int nfs4_proc_bind_conn_to_session(struct nfs_client *clp, struct rpc_cred *cred
if (!(clp->cl_session->flags & SESSION4_BACK_CHAN))
args.dir = NFS4_CDFC4_FORE;
- status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
+ /* Do not set the backchannel flag unless this is clnt->cl_xprt */
+ if (xprt != rcu_access_pointer(clnt->cl_xprt))
+ args.dir = NFS4_CDFC4_FORE;
+
+ task = rpc_run_task(&task_setup_data);
+ if (!IS_ERR(task)) {
+ status = task->tk_status;
+ rpc_put_task(task);
+ } else
+ status = PTR_ERR(task);
trace_nfs4_bind_conn_to_session(clp, status);
if (status == 0) {
if (memcmp(res.sessionid.data,
@@ -6837,6 +6868,31 @@ out:
return status;
}
+struct rpc_bind_conn_calldata {
+ struct nfs_client *clp;
+ struct rpc_cred *cred;
+};
+
+static int
+nfs4_proc_bind_conn_to_session_callback(struct rpc_clnt *clnt,
+ struct rpc_xprt *xprt,
+ void *calldata)
+{
+ struct rpc_bind_conn_calldata *p = calldata;
+
+ return nfs4_proc_bind_one_conn_to_session(clnt, xprt, p->clp, p->cred);
+}
+
+int nfs4_proc_bind_conn_to_session(struct nfs_client *clp, struct rpc_cred *cred)
+{
+ struct rpc_bind_conn_calldata data = {
+ .clp = clp,
+ .cred = cred,
+ };
+ return rpc_clnt_iterate_for_each_xprt(clp->cl_rpcclient,
+ nfs4_proc_bind_conn_to_session_callback, &data);
+}
+
/*
* Minimum set of SP4_MACH_CRED operations from RFC 5661 in the enforce map
* and operations we'd like to see to enable certain features in the allow map
@@ -7319,7 +7375,7 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)
args->bc_attrs.max_resp_sz = PAGE_SIZE;
args->bc_attrs.max_resp_sz_cached = 0;
args->bc_attrs.max_ops = NFS4_MAX_BACK_CHANNEL_OPS;
- args->bc_attrs.max_reqs = 1;
+ args->bc_attrs.max_reqs = NFS41_BC_MAX_CALLBACKS;
dprintk("%s: Back Channel : max_rqst_sz=%u max_resp_sz=%u "
"max_resp_sz_cached=%u max_ops=%u max_reqs=%u\n",
diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c
index e23366effcfb..332d06e64fa9 100644
--- a/fs/nfs/nfs4session.c
+++ b/fs/nfs/nfs4session.c
@@ -135,6 +135,43 @@ static struct nfs4_slot *nfs4_find_or_create_slot(struct nfs4_slot_table *tbl,
return ERR_PTR(-ENOMEM);
}
+static void nfs4_lock_slot(struct nfs4_slot_table *tbl,
+ struct nfs4_slot *slot)
+{
+ u32 slotid = slot->slot_nr;
+
+ __set_bit(slotid, tbl->used_slots);
+ if (slotid > tbl->highest_used_slotid ||
+ tbl->highest_used_slotid == NFS4_NO_SLOT)
+ tbl->highest_used_slotid = slotid;
+ slot->generation = tbl->generation;
+}
+
+/*
+ * nfs4_try_to_lock_slot - Given a slot try to allocate it
+ *
+ * Note: must be called with the slot_tbl_lock held.
+ */
+bool nfs4_try_to_lock_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot)
+{
+ if (nfs4_test_locked_slot(tbl, slot->slot_nr))
+ return false;
+ nfs4_lock_slot(tbl, slot);
+ return true;
+}
+
+/*
+ * nfs4_lookup_slot - Find a slot but don't allocate it
+ *
+ * Note: must be called with the slot_tbl_lock held.
+ */
+struct nfs4_slot *nfs4_lookup_slot(struct nfs4_slot_table *tbl, u32 slotid)
+{
+ if (slotid <= tbl->max_slotid)
+ return nfs4_find_or_create_slot(tbl, slotid, 1, GFP_NOWAIT);
+ return ERR_PTR(-E2BIG);
+}
+
/*
* nfs4_alloc_slot - efficiently look for a free slot
*
@@ -153,18 +190,11 @@ struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl)
__func__, tbl->used_slots[0], tbl->highest_used_slotid,
tbl->max_slotid + 1);
slotid = find_first_zero_bit(tbl->used_slots, tbl->max_slotid + 1);
- if (slotid > tbl->max_slotid)
- goto out;
- ret = nfs4_find_or_create_slot(tbl, slotid, 1, GFP_NOWAIT);
- if (IS_ERR(ret))
- goto out;
- __set_bit(slotid, tbl->used_slots);
- if (slotid > tbl->highest_used_slotid ||
- tbl->highest_used_slotid == NFS4_NO_SLOT)
- tbl->highest_used_slotid = slotid;
- ret->generation = tbl->generation;
-
-out:
+ if (slotid <= tbl->max_slotid) {
+ ret = nfs4_find_or_create_slot(tbl, slotid, 1, GFP_NOWAIT);
+ if (!IS_ERR(ret))
+ nfs4_lock_slot(tbl, ret);
+ }
dprintk("<-- %s used_slots=%04lx highest_used=%u slotid=%u\n",
__func__, tbl->used_slots[0], tbl->highest_used_slotid,
!IS_ERR(ret) ? ret->slot_nr : NFS4_NO_SLOT);
diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h
index e3ea2c5324d6..5b51298d1d03 100644
--- a/fs/nfs/nfs4session.h
+++ b/fs/nfs/nfs4session.h
@@ -77,6 +77,8 @@ extern int nfs4_setup_slot_table(struct nfs4_slot_table *tbl,
unsigned int max_reqs, const char *queue);
extern void nfs4_shutdown_slot_table(struct nfs4_slot_table *tbl);
extern struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl);
+extern struct nfs4_slot *nfs4_lookup_slot(struct nfs4_slot_table *tbl, u32 slotid);
+extern bool nfs4_try_to_lock_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot);
extern void nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot);
extern void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl);
bool nfs41_wake_and_assign_slot(struct nfs4_slot_table *tbl,
@@ -88,6 +90,12 @@ static inline bool nfs4_slot_tbl_draining(struct nfs4_slot_table *tbl)
return !!test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state);
}
+static inline bool nfs4_test_locked_slot(const struct nfs4_slot_table *tbl,
+ u32 slotid)
+{
+ return !!test_bit(slotid, tbl->used_slots);
+}
+
#if defined(CONFIG_NFS_V4_1)
extern void nfs41_set_target_slotid(struct nfs4_slot_table *tbl,
u32 target_highest_slotid);
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index 81ac6480f9e7..4aaed890048f 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -606,12 +606,22 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv,
dprintk("%s: DS %s: trying address %s\n",
__func__, ds->ds_remotestr, da->da_remotestr);
- clp = get_v3_ds_connect(mds_srv->nfs_client,
+ if (!IS_ERR(clp)) {
+ struct xprt_create xprt_args = {
+ .ident = XPRT_TRANSPORT_TCP,
+ .net = clp->cl_net,
+ .dstaddr = (struct sockaddr *)&da->da_addr,
+ .addrlen = da->da_addrlen,
+ .servername = clp->cl_hostname,
+ };
+ /* Add this address as an alias */
+ rpc_clnt_add_xprt(clp->cl_rpcclient, &xprt_args,
+ rpc_clnt_test_and_add_xprt, NULL);
+ } else
+ clp = get_v3_ds_connect(mds_srv->nfs_client,
(struct sockaddr *)&da->da_addr,
da->da_addrlen, IPPROTO_TCP,
timeo, retrans, au_flavor);
- if (!IS_ERR(clp))
- break;
}
if (IS_ERR(clp)) {
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index a0b77fc1bd39..c9f583d7bac8 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -84,12 +84,30 @@ config NFSD_V4
If unsure, say N.
config NFSD_PNFS
- bool "NFSv4.1 server support for Parallel NFS (pNFS)"
- depends on NFSD_V4
+ bool
+
+config NFSD_BLOCKLAYOUT
+ bool "NFSv4.1 server support for pNFS block layouts"
+ depends on NFSD_V4 && BLOCK
+ select NFSD_PNFS
+ help
+ This option enables support for the exporting pNFS block layouts
+ in the kernel's NFS server. The pNFS block layout enables NFS
+ clients to directly perform I/O to block devices accesible to both
+ the server and the clients. See RFC 5663 for more details.
+
+ If unsure, say N.
+
+config NFSD_SCSILAYOUT
+ bool "NFSv4.1 server support for pNFS SCSI layouts"
+ depends on NFSD_V4 && BLOCK
+ select NFSD_PNFS
help
- This option enables support for the parallel NFS features of the
- minor version 1 of the NFSv4 protocol (RFC5661) in the kernel's NFS
- server.
+ This option enables support for the exporting pNFS SCSI layouts
+ in the kernel's NFS server. The pNFS SCSI layout enables NFS
+ clients to directly perform I/O to SCSI devices accesible to both
+ the server and the clients. See draft-ietf-nfsv4-scsi-layout for
+ more details.
If unsure, say N.
diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
index 9a6028e120c6..3ae5f3c77e28 100644
--- a/fs/nfsd/Makefile
+++ b/fs/nfsd/Makefile
@@ -17,4 +17,6 @@ nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs3xdr.o
nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o
nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \
nfs4acl.o nfs4callback.o nfs4recover.o
-nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o blocklayout.o blocklayoutxdr.o
+nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o
+nfsd-$(CONFIG_NFSD_BLOCKLAYOUT) += blocklayout.o blocklayoutxdr.o
+nfsd-$(CONFIG_NFSD_SCSILAYOUT) += blocklayout.o blocklayoutxdr.o
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index c29d9421bd5e..e55b5242614d 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -1,11 +1,14 @@
/*
- * Copyright (c) 2014 Christoph Hellwig.
+ * Copyright (c) 2014-2016 Christoph Hellwig.
*/
#include <linux/exportfs.h>
#include <linux/genhd.h>
#include <linux/slab.h>
+#include <linux/pr.h>
#include <linux/nfsd/debug.h>
+#include <scsi/scsi_proto.h>
+#include <scsi/scsi_common.h>
#include "blocklayoutxdr.h"
#include "pnfs.h"
@@ -13,37 +16,6 @@
#define NFSDDBG_FACILITY NFSDDBG_PNFS
-static int
-nfsd4_block_get_device_info_simple(struct super_block *sb,
- struct nfsd4_getdeviceinfo *gdp)
-{
- struct pnfs_block_deviceaddr *dev;
- struct pnfs_block_volume *b;
-
- dev = kzalloc(sizeof(struct pnfs_block_deviceaddr) +
- sizeof(struct pnfs_block_volume), GFP_KERNEL);
- if (!dev)
- return -ENOMEM;
- gdp->gd_device = dev;
-
- dev->nr_volumes = 1;
- b = &dev->volumes[0];
-
- b->type = PNFS_BLOCK_VOLUME_SIMPLE;
- b->simple.sig_len = PNFS_BLOCK_UUID_LEN;
- return sb->s_export_op->get_uuid(sb, b->simple.sig, &b->simple.sig_len,
- &b->simple.offset);
-}
-
-static __be32
-nfsd4_block_proc_getdeviceinfo(struct super_block *sb,
- struct nfsd4_getdeviceinfo *gdp)
-{
- if (sb->s_bdev != sb->s_bdev->bd_contains)
- return nfserr_inval;
- return nfserrno(nfsd4_block_get_device_info_simple(sb, gdp));
-}
-
static __be32
nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp,
struct nfsd4_layoutget *args)
@@ -141,20 +113,13 @@ out_layoutunavailable:
}
static __be32
-nfsd4_block_proc_layoutcommit(struct inode *inode,
- struct nfsd4_layoutcommit *lcp)
+nfsd4_block_commit_blocks(struct inode *inode, struct nfsd4_layoutcommit *lcp,
+ struct iomap *iomaps, int nr_iomaps)
{
loff_t new_size = lcp->lc_last_wr + 1;
struct iattr iattr = { .ia_valid = 0 };
- struct iomap *iomaps;
- int nr_iomaps;
int error;
- nr_iomaps = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout,
- lcp->lc_up_len, &iomaps, 1 << inode->i_blkbits);
- if (nr_iomaps < 0)
- return nfserrno(nr_iomaps);
-
if (lcp->lc_mtime.tv_nsec == UTIME_NOW ||
timespec_compare(&lcp->lc_mtime, &inode->i_mtime) < 0)
lcp->lc_mtime = current_fs_time(inode->i_sb);
@@ -172,6 +137,54 @@ nfsd4_block_proc_layoutcommit(struct inode *inode,
return nfserrno(error);
}
+#ifdef CONFIG_NFSD_BLOCKLAYOUT
+static int
+nfsd4_block_get_device_info_simple(struct super_block *sb,
+ struct nfsd4_getdeviceinfo *gdp)
+{
+ struct pnfs_block_deviceaddr *dev;
+ struct pnfs_block_volume *b;
+
+ dev = kzalloc(sizeof(struct pnfs_block_deviceaddr) +
+ sizeof(struct pnfs_block_volume), GFP_KERNEL);
+ if (!dev)
+ return -ENOMEM;
+ gdp->gd_device = dev;
+
+ dev->nr_volumes = 1;
+ b = &dev->volumes[0];
+
+ b->type = PNFS_BLOCK_VOLUME_SIMPLE;
+ b->simple.sig_len = PNFS_BLOCK_UUID_LEN;
+ return sb->s_export_op->get_uuid(sb, b->simple.sig, &b->simple.sig_len,
+ &b->simple.offset);
+}
+
+static __be32
+nfsd4_block_proc_getdeviceinfo(struct super_block *sb,
+ struct nfs4_client *clp,
+ struct nfsd4_getdeviceinfo *gdp)
+{
+ if (sb->s_bdev != sb->s_bdev->bd_contains)
+ return nfserr_inval;
+ return nfserrno(nfsd4_block_get_device_info_simple(sb, gdp));
+}
+
+static __be32
+nfsd4_block_proc_layoutcommit(struct inode *inode,
+ struct nfsd4_layoutcommit *lcp)
+{
+ struct iomap *iomaps;
+ int nr_iomaps;
+
+ nr_iomaps = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout,
+ lcp->lc_up_len, &iomaps, 1 << inode->i_blkbits);
+ if (nr_iomaps < 0)
+ return nfserrno(nr_iomaps);
+
+ return nfsd4_block_commit_blocks(inode, lcp, iomaps, nr_iomaps);
+}
+
const struct nfsd4_layout_ops bl_layout_ops = {
/*
* Pretend that we send notification to the client. This is a blatant
@@ -190,3 +203,206 @@ const struct nfsd4_layout_ops bl_layout_ops = {
.encode_layoutget = nfsd4_block_encode_layoutget,
.proc_layoutcommit = nfsd4_block_proc_layoutcommit,
};
+#endif /* CONFIG_NFSD_BLOCKLAYOUT */
+
+#ifdef CONFIG_NFSD_SCSILAYOUT
+static int nfsd4_scsi_identify_device(struct block_device *bdev,
+ struct pnfs_block_volume *b)
+{
+ struct request_queue *q = bdev->bd_disk->queue;
+ struct request *rq;
+ size_t bufflen = 252, len, id_len;
+ u8 *buf, *d, type, assoc;
+ int error;
+
+ buf = kzalloc(bufflen, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ rq = blk_get_request(q, READ, GFP_KERNEL);
+ if (IS_ERR(rq)) {
+ error = -ENOMEM;
+ goto out_free_buf;
+ }
+ blk_rq_set_block_pc(rq);
+
+ error = blk_rq_map_kern(q, rq, buf, bufflen, GFP_KERNEL);
+ if (error)
+ goto out_put_request;
+
+ rq->cmd[0] = INQUIRY;
+ rq->cmd[1] = 1;
+ rq->cmd[2] = 0x83;
+ rq->cmd[3] = bufflen >> 8;
+ rq->cmd[4] = bufflen & 0xff;
+ rq->cmd_len = COMMAND_SIZE(INQUIRY);
+
+ error = blk_execute_rq(rq->q, NULL, rq, 1);
+ if (error) {
+ pr_err("pNFS: INQUIRY 0x83 failed with: %x\n",
+ rq->errors);
+ goto out_put_request;
+ }
+
+ len = (buf[2] << 8) + buf[3] + 4;
+ if (len > bufflen) {
+ pr_err("pNFS: INQUIRY 0x83 response invalid (len = %zd)\n",
+ len);
+ goto out_put_request;
+ }
+
+ d = buf + 4;
+ for (d = buf + 4; d < buf + len; d += id_len + 4) {
+ id_len = d[3];
+ type = d[1] & 0xf;
+ assoc = (d[1] >> 4) & 0x3;
+
+ /*
+ * We only care about a EUI-64 and NAA designator types
+ * with LU association.
+ */
+ if (assoc != 0x00)
+ continue;
+ if (type != 0x02 && type != 0x03)
+ continue;
+ if (id_len != 8 && id_len != 12 && id_len != 16)
+ continue;
+
+ b->scsi.code_set = PS_CODE_SET_BINARY;
+ b->scsi.designator_type = type == 0x02 ?
+ PS_DESIGNATOR_EUI64 : PS_DESIGNATOR_NAA;
+ b->scsi.designator_len = id_len;
+ memcpy(b->scsi.designator, d + 4, id_len);
+
+ /*
+ * If we found a 8 or 12 byte descriptor continue on to
+ * see if a 16 byte one is available. If we find a
+ * 16 byte descriptor we're done.
+ */
+ if (id_len == 16)
+ break;
+ }
+
+out_put_request:
+ blk_put_request(rq);
+out_free_buf:
+ kfree(buf);
+ return error;
+}
+
+#define NFSD_MDS_PR_KEY 0x0100000000000000
+
+/*
+ * We use the client ID as a unique key for the reservations.
+ * This allows us to easily fence a client when recalls fail.
+ */
+static u64 nfsd4_scsi_pr_key(struct nfs4_client *clp)
+{
+ return ((u64)clp->cl_clientid.cl_boot << 32) | clp->cl_clientid.cl_id;
+}
+
+static int
+nfsd4_block_get_device_info_scsi(struct super_block *sb,
+ struct nfs4_client *clp,
+ struct nfsd4_getdeviceinfo *gdp)
+{
+ struct pnfs_block_deviceaddr *dev;
+ struct pnfs_block_volume *b;
+ const struct pr_ops *ops;
+ int error;
+
+ dev = kzalloc(sizeof(struct pnfs_block_deviceaddr) +
+ sizeof(struct pnfs_block_volume), GFP_KERNEL);
+ if (!dev)
+ return -ENOMEM;
+ gdp->gd_device = dev;
+
+ dev->nr_volumes = 1;
+ b = &dev->volumes[0];
+
+ b->type = PNFS_BLOCK_VOLUME_SCSI;
+ b->scsi.pr_key = nfsd4_scsi_pr_key(clp);
+
+ error = nfsd4_scsi_identify_device(sb->s_bdev, b);
+ if (error)
+ return error;
+
+ ops = sb->s_bdev->bd_disk->fops->pr_ops;
+ if (!ops) {
+ pr_err("pNFS: device %s does not support PRs.\n",
+ sb->s_id);
+ return -EINVAL;
+ }
+
+ error = ops->pr_register(sb->s_bdev, 0, NFSD_MDS_PR_KEY, true);
+ if (error) {
+ pr_err("pNFS: failed to register key for device %s.\n",
+ sb->s_id);
+ return -EINVAL;
+ }
+
+ error = ops->pr_reserve(sb->s_bdev, NFSD_MDS_PR_KEY,
+ PR_EXCLUSIVE_ACCESS_REG_ONLY, 0);
+ if (error) {
+ pr_err("pNFS: failed to reserve device %s.\n",
+ sb->s_id);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static __be32
+nfsd4_scsi_proc_getdeviceinfo(struct super_block *sb,
+ struct nfs4_client *clp,
+ struct nfsd4_getdeviceinfo *gdp)
+{
+ if (sb->s_bdev != sb->s_bdev->bd_contains)
+ return nfserr_inval;
+ return nfserrno(nfsd4_block_get_device_info_scsi(sb, clp, gdp));
+}
+static __be32
+nfsd4_scsi_proc_layoutcommit(struct inode *inode,
+ struct nfsd4_layoutcommit *lcp)
+{
+ struct iomap *iomaps;
+ int nr_iomaps;
+
+ nr_iomaps = nfsd4_scsi_decode_layoutupdate(lcp->lc_up_layout,
+ lcp->lc_up_len, &iomaps, 1 << inode->i_blkbits);
+ if (nr_iomaps < 0)
+ return nfserrno(nr_iomaps);
+
+ return nfsd4_block_commit_blocks(inode, lcp, iomaps, nr_iomaps);
+}
+
+static void
+nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls)
+{
+ struct nfs4_client *clp = ls->ls_stid.sc_client;
+ struct block_device *bdev = ls->ls_file->f_path.mnt->mnt_sb->s_bdev;
+
+ bdev->bd_disk->fops->pr_ops->pr_preempt(bdev, NFSD_MDS_PR_KEY,
+ nfsd4_scsi_pr_key(clp), 0, true);
+}
+
+const struct nfsd4_layout_ops scsi_layout_ops = {
+ /*
+ * Pretend that we send notification to the client. This is a blatant
+ * lie to force recent Linux clients to cache our device IDs.
+ * We rarely ever change the device ID, so the harm of leaking deviceids
+ * for a while isn't too bad. Unfortunately RFC5661 is a complete mess
+ * in this regard, but I filed errata 4119 for this a while ago, and
+ * hopefully the Linux client will eventually start caching deviceids
+ * without this again.
+ */
+ .notify_types =
+ NOTIFY_DEVICEID4_DELETE | NOTIFY_DEVICEID4_CHANGE,
+ .proc_getdeviceinfo = nfsd4_scsi_proc_getdeviceinfo,
+ .encode_getdeviceinfo = nfsd4_block_encode_getdeviceinfo,
+ .proc_layoutget = nfsd4_block_proc_layoutget,
+ .encode_layoutget = nfsd4_block_encode_layoutget,
+ .proc_layoutcommit = nfsd4_scsi_proc_layoutcommit,
+ .fence_client = nfsd4_scsi_fence_client,
+};
+#endif /* CONFIG_NFSD_SCSILAYOUT */
diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c
index 6d834dc9bbc8..6c3b316f932e 100644
--- a/fs/nfsd/blocklayoutxdr.c
+++ b/fs/nfsd/blocklayoutxdr.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2014 Christoph Hellwig.
+ * Copyright (c) 2014-2016 Christoph Hellwig.
*/
#include <linux/sunrpc/svc.h>
#include <linux/exportfs.h>
@@ -53,6 +53,18 @@ nfsd4_block_encode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
p = xdr_encode_hyper(p, b->simple.offset);
p = xdr_encode_opaque(p, b->simple.sig, b->simple.sig_len);
break;
+ case PNFS_BLOCK_VOLUME_SCSI:
+ len = 4 + 4 + 4 + 4 + b->scsi.designator_len + 8;
+ p = xdr_reserve_space(xdr, len);
+ if (!p)
+ return -ETOOSMALL;
+
+ *p++ = cpu_to_be32(b->type);
+ *p++ = cpu_to_be32(b->scsi.code_set);
+ *p++ = cpu_to_be32(b->scsi.designator_type);
+ p = xdr_encode_opaque(p, b->scsi.designator, b->scsi.designator_len);
+ p = xdr_encode_hyper(p, b->scsi.pr_key);
+ break;
default:
return -ENOTSUPP;
}
@@ -93,18 +105,22 @@ nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
u32 block_size)
{
struct iomap *iomaps;
- u32 nr_iomaps, expected, i;
+ u32 nr_iomaps, i;
if (len < sizeof(u32)) {
dprintk("%s: extent array too small: %u\n", __func__, len);
return -EINVAL;
}
+ len -= sizeof(u32);
+ if (len % PNFS_BLOCK_EXTENT_SIZE) {
+ dprintk("%s: extent array invalid: %u\n", __func__, len);
+ return -EINVAL;
+ }
nr_iomaps = be32_to_cpup(p++);
- expected = sizeof(__be32) + nr_iomaps * PNFS_BLOCK_EXTENT_SIZE;
- if (len != expected) {
+ if (nr_iomaps != len / PNFS_BLOCK_EXTENT_SIZE) {
dprintk("%s: extent array size mismatch: %u/%u\n",
- __func__, len, expected);
+ __func__, len, nr_iomaps);
return -EINVAL;
}
@@ -155,3 +171,54 @@ fail:
kfree(iomaps);
return -EINVAL;
}
+
+int
+nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
+ u32 block_size)
+{
+ struct iomap *iomaps;
+ u32 nr_iomaps, expected, i;
+
+ if (len < sizeof(u32)) {
+ dprintk("%s: extent array too small: %u\n", __func__, len);
+ return -EINVAL;
+ }
+
+ nr_iomaps = be32_to_cpup(p++);
+ expected = sizeof(__be32) + nr_iomaps * PNFS_SCSI_RANGE_SIZE;
+ if (len != expected) {
+ dprintk("%s: extent array size mismatch: %u/%u\n",
+ __func__, len, expected);
+ return -EINVAL;
+ }
+
+ iomaps = kcalloc(nr_iomaps, sizeof(*iomaps), GFP_KERNEL);
+ if (!iomaps) {
+ dprintk("%s: failed to allocate extent array\n", __func__);
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < nr_iomaps; i++) {
+ u64 val;
+
+ p = xdr_decode_hyper(p, &val);
+ if (val & (block_size - 1)) {
+ dprintk("%s: unaligned offset 0x%llx\n", __func__, val);
+ goto fail;
+ }
+ iomaps[i].offset = val;
+
+ p = xdr_decode_hyper(p, &val);
+ if (val & (block_size - 1)) {
+ dprintk("%s: unaligned length 0x%llx\n", __func__, val);
+ goto fail;
+ }
+ iomaps[i].length = val;
+ }
+
+ *iomapp = iomaps;
+ return nr_iomaps;
+fail:
+ kfree(iomaps);
+ return -EINVAL;
+}
diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h
index 6de925fe8499..397bc7563a49 100644
--- a/fs/nfsd/blocklayoutxdr.h
+++ b/fs/nfsd/blocklayoutxdr.h
@@ -15,6 +15,11 @@ struct pnfs_block_extent {
enum pnfs_block_extent_state es;
};
+struct pnfs_block_range {
+ u64 foff;
+ u64 len;
+};
+
/*
* Random upper cap for the uuid length to avoid unbounded allocation.
* Not actually limited by the protocol.
@@ -29,6 +34,13 @@ struct pnfs_block_volume {
u32 sig_len;
u8 sig[PNFS_BLOCK_UUID_LEN];
} simple;
+ struct {
+ enum scsi_code_set code_set;
+ enum scsi_designator_type designator_type;
+ int designator_len;
+ u8 designator[256];
+ u64 pr_key;
+ } scsi;
};
};
@@ -43,5 +55,7 @@ __be32 nfsd4_block_encode_layoutget(struct xdr_stream *xdr,
struct nfsd4_layoutget *lgp);
int nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
u32 block_size);
+int nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
+ u32 block_size);
#endif /* _NFSD_BLOCKLAYOUTXDR_H */
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 7b755b7f785c..51c3b06e8036 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -147,6 +147,7 @@ nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp,
{
__be32 nfserr;
u32 max_blocksize = svc_max_payload(rqstp);
+ unsigned long cnt = min(argp->count, max_blocksize);
dprintk("nfsd: READ(3) %s %lu bytes at %Lu\n",
SVCFH_fmt(&argp->fh),
@@ -157,7 +158,7 @@ nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp,
* 1 (status) + 22 (post_op_attr) + 1 (count) + 1 (eof)
* + 1 (xdr opaque byte count) = 26
*/
- resp->count = min(argp->count, max_blocksize);
+ resp->count = cnt;
svc_reserve_auth(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3)<<2) + resp->count +4);
fh_copy(&resp->fh, &argp->fh);
@@ -167,8 +168,8 @@ nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp,
&resp->count);
if (nfserr == 0) {
struct inode *inode = d_inode(resp->fh.fh_dentry);
-
- resp->eof = (argp->offset + resp->count) >= inode->i_size;
+ resp->eof = nfsd_eof_on_read(cnt, resp->count, argp->offset,
+ inode->i_size);
}
RETURN_STATUS(nfserr);
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index ce2d010d3b17..825c7bc8d789 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -1,6 +1,7 @@
/*
* Copyright (c) 2014 Christoph Hellwig.
*/
+#include <linux/blkdev.h>
#include <linux/kmod.h>
#include <linux/file.h>
#include <linux/jhash.h>
@@ -26,7 +27,12 @@ static const struct nfsd4_callback_ops nfsd4_cb_layout_ops;
static const struct lock_manager_operations nfsd4_layouts_lm_ops;
const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] = {
+#ifdef CONFIG_NFSD_BLOCKLAYOUT
[LAYOUT_BLOCK_VOLUME] = &bl_layout_ops,
+#endif
+#ifdef CONFIG_NFSD_SCSILAYOUT
+ [LAYOUT_SCSI] = &scsi_layout_ops,
+#endif
};
/* pNFS device ID to export fsid mapping */
@@ -121,10 +127,24 @@ void nfsd4_setup_layout_type(struct svc_export *exp)
if (!(exp->ex_flags & NFSEXP_PNFS))
return;
+ /*
+ * Check if the file system supports exporting a block-like layout.
+ * If the block device supports reservations prefer the SCSI layout,
+ * otherwise advertise the block layout.
+ */
+#ifdef CONFIG_NFSD_BLOCKLAYOUT
if (sb->s_export_op->get_uuid &&
sb->s_export_op->map_blocks &&
sb->s_export_op->commit_blocks)
exp->ex_layout_type = LAYOUT_BLOCK_VOLUME;
+#endif
+#ifdef CONFIG_NFSD_SCSILAYOUT
+ /* overwrite block layout selection if needed */
+ if (sb->s_export_op->map_blocks &&
+ sb->s_export_op->commit_blocks &&
+ sb->s_bdev && sb->s_bdev->bd_disk->fops->pr_ops)
+ exp->ex_layout_type = LAYOUT_SCSI;
+#endif
}
static void
@@ -590,8 +610,6 @@ nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls)
rpc_ntop((struct sockaddr *)&clp->cl_addr, addr_str, sizeof(addr_str));
- trace_layout_recall_fail(&ls->ls_stid.sc_stateid);
-
printk(KERN_WARNING
"nfsd: client %s failed to respond to layout recall. "
" Fencing..\n", addr_str);
@@ -626,6 +644,7 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
container_of(cb, struct nfs4_layout_stateid, ls_recall);
struct nfsd_net *nn;
ktime_t now, cutoff;
+ const struct nfsd4_layout_ops *ops;
LIST_HEAD(reaplist);
@@ -661,7 +680,13 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
/*
* Unknown error or non-responding client, we'll need to fence.
*/
- nfsd4_cb_layout_fail(ls);
+ trace_layout_recall_fail(&ls->ls_stid.sc_stateid);
+
+ ops = nfsd4_layout_ops[ls->ls_layout_type];
+ if (ops->fence_client)
+ ops->fence_client(ls);
+ else
+ nfsd4_cb_layout_fail(ls);
return -1;
}
}
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 4cba7865f496..de1ff1d98bb1 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -864,12 +864,10 @@ static __be32
nfsd4_secinfo(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd4_secinfo *secinfo)
{
- struct svc_fh resfh;
struct svc_export *exp;
struct dentry *dentry;
__be32 err;
- fh_init(&resfh, NFS4_FHSIZE);
err = fh_verify(rqstp, &cstate->current_fh, S_IFDIR, NFSD_MAY_EXEC);
if (err)
return err;
@@ -878,6 +876,7 @@ nfsd4_secinfo(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
&exp, &dentry);
if (err)
return err;
+ fh_unlock(&cstate->current_fh);
if (d_really_is_negative(dentry)) {
exp_put(exp);
err = nfserr_noent;
@@ -1269,8 +1268,10 @@ nfsd4_getdeviceinfo(struct svc_rqst *rqstp,
goto out;
nfserr = nfs_ok;
- if (gdp->gd_maxcount != 0)
- nfserr = ops->proc_getdeviceinfo(exp->ex_path.mnt->mnt_sb, gdp);
+ if (gdp->gd_maxcount != 0) {
+ nfserr = ops->proc_getdeviceinfo(exp->ex_path.mnt->mnt_sb,
+ cstate->session->se_client, gdp);
+ }
gdp->gd_notify_types &= ops->notify_types;
out:
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index dc8ebecf5618..66eaeb1e8c2c 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -32,10 +32,10 @@
*
*/
+#include <crypto/hash.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/namei.h>
-#include <linux/crypto.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/module.h>
@@ -104,29 +104,35 @@ static int
nfs4_make_rec_clidname(char *dname, const struct xdr_netobj *clname)
{
struct xdr_netobj cksum;
- struct hash_desc desc;
- struct scatterlist sg;
+ struct crypto_shash *tfm;
int status;
dprintk("NFSD: nfs4_make_rec_clidname for %.*s\n",
clname->len, clname->data);
- desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
- desc.tfm = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC);
- if (IS_ERR(desc.tfm)) {
- status = PTR_ERR(desc.tfm);
+ tfm = crypto_alloc_shash("md5", 0, 0);
+ if (IS_ERR(tfm)) {
+ status = PTR_ERR(tfm);
goto out_no_tfm;
}
- cksum.len = crypto_hash_digestsize(desc.tfm);
+ cksum.len = crypto_shash_digestsize(tfm);
cksum.data = kmalloc(cksum.len, GFP_KERNEL);
if (cksum.data == NULL) {
status = -ENOMEM;
goto out;
}
- sg_init_one(&sg, clname->data, clname->len);
+ {
+ SHASH_DESC_ON_STACK(desc, tfm);
+
+ desc->tfm = tfm;
+ desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+
+ status = crypto_shash_digest(desc, clname->data, clname->len,
+ cksum.data);
+ shash_desc_zero(desc);
+ }
- status = crypto_hash_digest(&desc, &sg, sg.length, cksum.data);
if (status)
goto out;
@@ -135,7 +141,7 @@ nfs4_make_rec_clidname(char *dname, const struct xdr_netobj *clname)
status = 0;
out:
kfree(cksum.data);
- crypto_free_hash(desc.tfm);
+ crypto_free_shash(tfm);
out_no_tfm:
return status;
}
@@ -1260,6 +1266,7 @@ nfsd4_umh_cltrack_init(struct net *net)
/* XXX: The usermode helper s not working in container yet. */
if (net != &init_net) {
pr_warn("NFSD: attempt to initialize umh client tracking in a container ignored.\n");
+ kfree(grace_start);
return -EINVAL;
}
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index c484a2b6cd10..0462eeddfff9 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2408,7 +2408,8 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
default: /* checked by xdr code */
WARN_ON_ONCE(1);
case SP4_SSV:
- return nfserr_encr_alg_unsupp;
+ status = nfserr_encr_alg_unsupp;
+ goto out_nolock;
}
/* Cases below refer to rfc 5661 section 18.35.4: */
@@ -2586,21 +2587,26 @@ static __be32 check_forechannel_attrs(struct nfsd4_channel_attrs *ca, struct nfs
return nfs_ok;
}
+/*
+ * Server's NFSv4.1 backchannel support is AUTH_SYS-only for now.
+ * These are based on similar macros in linux/sunrpc/msg_prot.h .
+ */
+#define RPC_MAX_HEADER_WITH_AUTH_SYS \
+ (RPC_CALLHDRSIZE + 2 * (2 + UNX_CALLSLACK))
+
+#define RPC_MAX_REPHEADER_WITH_AUTH_SYS \
+ (RPC_REPHDRSIZE + (2 + NUL_REPLYSLACK))
+
#define NFSD_CB_MAX_REQ_SZ ((NFS4_enc_cb_recall_sz + \
- RPC_MAX_HEADER_WITH_AUTH) * sizeof(__be32))
+ RPC_MAX_HEADER_WITH_AUTH_SYS) * sizeof(__be32))
#define NFSD_CB_MAX_RESP_SZ ((NFS4_dec_cb_recall_sz + \
- RPC_MAX_REPHEADER_WITH_AUTH) * sizeof(__be32))
+ RPC_MAX_REPHEADER_WITH_AUTH_SYS) * \
+ sizeof(__be32))
static __be32 check_backchannel_attrs(struct nfsd4_channel_attrs *ca)
{
ca->headerpadsz = 0;
- /*
- * These RPC_MAX_HEADER macros are overkill, especially since we
- * don't even do gss on the backchannel yet. But this is still
- * less than 1k. Tighten up this estimate in the unlikely event
- * it turns out to be a problem for some client:
- */
if (ca->maxreq_sz < NFSD_CB_MAX_REQ_SZ)
return nfserr_toosmall;
if (ca->maxresp_sz < NFSD_CB_MAX_RESP_SZ)
@@ -2710,10 +2716,9 @@ nfsd4_create_session(struct svc_rqst *rqstp,
goto out_free_conn;
}
status = nfs_ok;
- /*
- * We do not support RDMA or persistent sessions
- */
+ /* Persistent sessions are not supported */
cr_ses->flags &= ~SESSION4_PERSIST;
+ /* Upshifting from TCP to RDMA is not supported */
cr_ses->flags &= ~SESSION4_RDMA;
init_session(rqstp, new, conf, cr_ses);
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index d6ef0955a979..9df898ba648f 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1072,8 +1072,9 @@ nfsd4_decode_rename(struct nfsd4_compoundargs *argp, struct nfsd4_rename *rename
READ_BUF(4);
rename->rn_snamelen = be32_to_cpup(p++);
- READ_BUF(rename->rn_snamelen + 4);
+ READ_BUF(rename->rn_snamelen);
SAVEMEM(rename->rn_sname, rename->rn_snamelen);
+ READ_BUF(4);
rename->rn_tnamelen = be32_to_cpup(p++);
READ_BUF(rename->rn_tnamelen);
SAVEMEM(rename->rn_tname, rename->rn_tnamelen);
@@ -1155,13 +1156,14 @@ nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclient
READ_BUF(8);
setclientid->se_callback_prog = be32_to_cpup(p++);
setclientid->se_callback_netid_len = be32_to_cpup(p++);
-
- READ_BUF(setclientid->se_callback_netid_len + 4);
+ READ_BUF(setclientid->se_callback_netid_len);
SAVEMEM(setclientid->se_callback_netid_val, setclientid->se_callback_netid_len);
+ READ_BUF(4);
setclientid->se_callback_addr_len = be32_to_cpup(p++);
- READ_BUF(setclientid->se_callback_addr_len + 4);
+ READ_BUF(setclientid->se_callback_addr_len);
SAVEMEM(setclientid->se_callback_addr_val, setclientid->se_callback_addr_len);
+ READ_BUF(4);
setclientid->se_callback_ident = be32_to_cpup(p++);
DECODE_TAIL;
@@ -1835,8 +1837,9 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
READ_BUF(4);
argp->taglen = be32_to_cpup(p++);
- READ_BUF(argp->taglen + 8);
+ READ_BUF(argp->taglen);
SAVEMEM(argp->tag, argp->taglen);
+ READ_BUF(8);
argp->minorversion = be32_to_cpup(p++);
argp->opcnt = be32_to_cpup(p++);
max_reply += 4 + (XDR_QUADLEN(argp->taglen) << 2);
@@ -3060,7 +3063,7 @@ static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp,
p = xdr_encode_opaque_fixed(p, bcts->sessionid.data,
NFS4_MAX_SESSIONID_LEN);
*p++ = cpu_to_be32(bcts->dir);
- /* Sorry, we do not yet support RDMA over 4.1: */
+ /* Upshifting from TCP to RDMA is not supported */
*p++ = cpu_to_be32(0);
}
return nfserr;
@@ -3362,6 +3365,7 @@ static __be32 nfsd4_encode_splice_read(
struct xdr_stream *xdr = &resp->xdr;
struct xdr_buf *buf = xdr->buf;
u32 eof;
+ long len;
int space_left;
__be32 nfserr;
__be32 *p = xdr->p - 2;
@@ -3370,6 +3374,7 @@ static __be32 nfsd4_encode_splice_read(
if (xdr->end - xdr->p < 1)
return nfserr_resource;
+ len = maxcount;
nfserr = nfsd_splice_read(read->rd_rqstp, file,
read->rd_offset, &maxcount);
if (nfserr) {
@@ -3382,8 +3387,8 @@ static __be32 nfsd4_encode_splice_read(
return nfserr;
}
- eof = (read->rd_offset + maxcount >=
- d_inode(read->rd_fhp->fh_dentry)->i_size);
+ eof = nfsd_eof_on_read(len, maxcount, read->rd_offset,
+ d_inode(read->rd_fhp->fh_dentry)->i_size);
*(p++) = htonl(eof);
*(p++) = htonl(maxcount);
@@ -3453,14 +3458,15 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp,
}
read->rd_vlen = v;
+ len = maxcount;
nfserr = nfsd_readv(file, read->rd_offset, resp->rqstp->rq_vec,
read->rd_vlen, &maxcount);
if (nfserr)
return nfserr;
xdr_truncate_encode(xdr, starting_len + 8 + ((maxcount+3)&~3));
- eof = (read->rd_offset + maxcount >=
- d_inode(read->rd_fhp->fh_dentry)->i_size);
+ eof = nfsd_eof_on_read(len, maxcount, read->rd_offset,
+ d_inode(read->rd_fhp->fh_dentry)->i_size);
tmp = htonl(eof);
write_bytes_to_xdr_buf(xdr->buf, starting_len , &tmp, 4);
diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h
index d4c4453674c6..7d073b9b1553 100644
--- a/fs/nfsd/pnfs.h
+++ b/fs/nfsd/pnfs.h
@@ -21,6 +21,7 @@ struct nfsd4_layout_ops {
u32 notify_types;
__be32 (*proc_getdeviceinfo)(struct super_block *sb,
+ struct nfs4_client *clp,
struct nfsd4_getdeviceinfo *gdevp);
__be32 (*encode_getdeviceinfo)(struct xdr_stream *xdr,
struct nfsd4_getdeviceinfo *gdevp);
@@ -32,10 +33,17 @@ struct nfsd4_layout_ops {
__be32 (*proc_layoutcommit)(struct inode *inode,
struct nfsd4_layoutcommit *lcp);
+
+ void (*fence_client)(struct nfs4_layout_stateid *ls);
};
extern const struct nfsd4_layout_ops *nfsd4_layout_ops[];
+#ifdef CONFIG_NFSD_BLOCKLAYOUT
extern const struct nfsd4_layout_ops bl_layout_ops;
+#endif
+#ifdef CONFIG_NFSD_SCSILAYOUT
+extern const struct nfsd4_layout_ops scsi_layout_ops;
+#endif
__be32 nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp,
struct nfsd4_compound_state *cstate, stateid_t *stateid,
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 5d2a57e4c03a..d40010e4f1a9 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -870,7 +870,7 @@ __be32 nfsd_readv(struct file *file, loff_t offset, struct kvec *vec, int vlen,
oldfs = get_fs();
set_fs(KERNEL_DS);
- host_err = vfs_readv(file, (struct iovec __user *)vec, vlen, &offset);
+ host_err = vfs_readv(file, (struct iovec __user *)vec, vlen, &offset, 0);
set_fs(oldfs);
return nfsd_finish_read(file, count, host_err);
}
@@ -957,7 +957,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
/* Write the data. */
oldfs = get_fs(); set_fs(KERNEL_DS);
- host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &pos);
+ host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &pos, 0);
set_fs(oldfs);
if (host_err < 0)
goto out_nfserr;
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index c11ba316f23f..2d573ec057f8 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -139,4 +139,23 @@ static inline int nfsd_create_is_exclusive(int createmode)
|| createmode == NFS4_CREATE_EXCLUSIVE4_1;
}
+static inline bool nfsd_eof_on_read(long requested, long read,
+ loff_t offset, loff_t size)
+{
+ /* We assume a short read means eof: */
+ if (requested > read)
+ return true;
+ /*
+ * A non-short read might also reach end of file. The spec
+ * still requires us to set eof in that case.
+ *
+ * Further operations may have modified the file size since
+ * the read, so the following check is not atomic with the read.
+ * We've only seen that cause a problem for a client in the case
+ * where the read returned a count of 0 without setting eof.
+ * That case was fixed by the addition of the above check.
+ */
+ return (offset + read >= size);
+}
+
#endif /* LINUX_NFSD_VFS_H */
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 45d650addd56..c20df77eff99 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -180,7 +180,7 @@ void nilfs_page_bug(struct page *page)
printk(KERN_CRIT "NILFS_PAGE_BUG(%p): cnt=%d index#=%llu flags=0x%lx "
"mapping=%p ino=%lu\n",
- page, atomic_read(&page->_count),
+ page, page_ref_count(page),
(unsigned long long)page->index, page->flags, m, ino);
if (page_has_buffers(page)) {
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index ce210d4951a1..e27e6527912b 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -41,7 +41,8 @@ ocfs2-objs := \
quota_local.o \
quota_global.o \
xattr.o \
- acl.o
+ acl.o \
+ filecheck.o
ocfs2_stackglue-objs := stackglue.o
ocfs2_stack_o2cb-objs := stack_o2cb.o
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index d002579c6f2b..70907d638b60 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -2516,21 +2516,6 @@ static int ocfs2_update_edge_lengths(handle_t *handle,
struct ocfs2_extent_block *eb;
u32 range;
- /*
- * In normal tree rotation process, we will never touch the
- * tree branch above subtree_index and ocfs2_extend_rotate_transaction
- * doesn't reserve the credits for them either.
- *
- * But we do have a special case here which will update the rightmost
- * records for all the bh in the path.
- * So we have to allocate extra credits and access them.
- */
- ret = ocfs2_extend_trans(handle, subtree_index);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
-
ret = ocfs2_journal_access_path(et->et_ci, handle, path);
if (ret) {
mlog_errno(ret);
@@ -2956,7 +2941,7 @@ static int __ocfs2_rotate_tree_left(handle_t *handle,
right_path->p_node[subtree_root].bh->b_blocknr,
right_path->p_tree_depth);
- ret = ocfs2_extend_rotate_transaction(handle, subtree_root,
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
orig_credits, left_path);
if (ret) {
mlog_errno(ret);
@@ -3029,21 +3014,9 @@ static int ocfs2_remove_rightmost_path(handle_t *handle,
struct ocfs2_extent_block *eb;
struct ocfs2_extent_list *el;
-
ret = ocfs2_et_sanity_check(et);
if (ret)
goto out;
- /*
- * There's two ways we handle this depending on
- * whether path is the only existing one.
- */
- ret = ocfs2_extend_rotate_transaction(handle, 0,
- handle->h_buffer_credits,
- path);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
ret = ocfs2_journal_access_path(et->et_ci, handle, path);
if (ret) {
@@ -3641,6 +3614,14 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
*/
if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 &&
le16_to_cpu(el->l_next_free_rec) == 1) {
+ /* extend credit for ocfs2_remove_rightmost_path */
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
+ handle->h_buffer_credits,
+ right_path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
ret = ocfs2_remove_rightmost_path(handle, et,
right_path,
@@ -3679,6 +3660,14 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
BUG_ON(ctxt->c_contig_type == CONTIG_NONE);
if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) {
+ /* extend credit for ocfs2_remove_rightmost_path */
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
+ handle->h_buffer_credits,
+ path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
/*
* The merge code will need to create an empty
* extent to take the place of the newly
@@ -3727,6 +3716,15 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
*/
BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
+ /* extend credit for ocfs2_remove_rightmost_path */
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
+ handle->h_buffer_credits,
+ path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
/* The merge left us with an empty extent, remove it. */
ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
if (ret) {
@@ -3748,6 +3746,15 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
goto out;
}
+ /* extend credit for ocfs2_remove_rightmost_path */
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
+ handle->h_buffer_credits,
+ path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
/*
* Error from this last rotate is not critical, so
@@ -3783,6 +3790,16 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
}
if (ctxt->c_split_covers_rec) {
+ /* extend credit for ocfs2_remove_rightmost_path */
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
+ handle->h_buffer_credits,
+ path);
+ if (ret) {
+ mlog_errno(ret);
+ ret = 0;
+ goto out;
+ }
+
/*
* The merge may have left an empty extent in
* our leaf. Try to rotate it away.
@@ -5342,6 +5359,15 @@ static int ocfs2_truncate_rec(handle_t *handle,
struct ocfs2_extent_block *eb;
if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
+ /* extend credit for ocfs2_remove_rightmost_path */
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
+ handle->h_buffer_credits,
+ path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
if (ret) {
mlog_errno(ret);
@@ -5928,16 +5954,6 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
ocfs2_journal_dirty(handle, tl_bh);
- /* TODO: Perhaps we can calculate the bulk of the
- * credits up front rather than extending like
- * this. */
- status = ocfs2_extend_trans(handle,
- OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
-
rec = tl->tl_recs[i];
start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
le32_to_cpu(rec.t_start));
@@ -5958,6 +5974,13 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
goto bail;
}
}
+
+ status = ocfs2_extend_trans(handle,
+ OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
i--;
}
@@ -6016,7 +6039,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
goto out_mutex;
}
- handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
+ handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
mlog_errno(status);
@@ -6079,7 +6102,7 @@ void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
if (cancel)
cancel_delayed_work(&osb->osb_truncate_log_wq);
- queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq,
+ queue_delayed_work(osb->ocfs2_wq, &osb->osb_truncate_log_wq,
OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
}
}
@@ -6253,7 +6276,7 @@ void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
if (tl_inode) {
cancel_delayed_work(&osb->osb_truncate_log_wq);
- flush_workqueue(ocfs2_wq);
+ flush_workqueue(osb->ocfs2_wq);
status = ocfs2_flush_truncate_log(osb);
if (status < 0)
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index cda0361e95a4..1581240a7ca0 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -499,153 +499,6 @@ bail:
return status;
}
-/*
- * TODO: Make this into a generic get_blocks function.
- *
- * From do_direct_io in direct-io.c:
- * "So what we do is to permit the ->get_blocks function to populate
- * bh.b_size with the size of IO which is permitted at this offset and
- * this i_blkbits."
- *
- * This function is called directly from get_more_blocks in direct-io.c.
- *
- * called like this: dio->get_blocks(dio->inode, fs_startblk,
- * fs_count, map_bh, dio->rw == WRITE);
- */
-static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create)
-{
- int ret;
- u32 cpos = 0;
- int alloc_locked = 0;
- u64 p_blkno, inode_blocks, contig_blocks;
- unsigned int ext_flags;
- unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
- unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
- unsigned long len = bh_result->b_size;
- unsigned int clusters_to_alloc = 0, contig_clusters = 0;
-
- cpos = ocfs2_blocks_to_clusters(inode->i_sb, iblock);
-
- /* This function won't even be called if the request isn't all
- * nicely aligned and of the right size, so there's no need
- * for us to check any of that. */
-
- inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
-
- down_read(&OCFS2_I(inode)->ip_alloc_sem);
-
- /* This figures out the size of the next contiguous block, and
- * our logical offset */
- ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
- &contig_blocks, &ext_flags);
- up_read(&OCFS2_I(inode)->ip_alloc_sem);
-
- if (ret) {
- mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
- (unsigned long long)iblock);
- ret = -EIO;
- goto bail;
- }
-
- /* We should already CoW the refcounted extent in case of create. */
- BUG_ON(create && (ext_flags & OCFS2_EXT_REFCOUNTED));
-
- /* allocate blocks if no p_blkno is found, and create == 1 */
- if (!p_blkno && create) {
- ret = ocfs2_inode_lock(inode, NULL, 1);
- if (ret < 0) {
- mlog_errno(ret);
- goto bail;
- }
-
- alloc_locked = 1;
-
- down_write(&OCFS2_I(inode)->ip_alloc_sem);
-
- /* fill hole, allocate blocks can't be larger than the size
- * of the hole */
- clusters_to_alloc = ocfs2_clusters_for_bytes(inode->i_sb, len);
- contig_clusters = ocfs2_clusters_for_blocks(inode->i_sb,
- contig_blocks);
- if (clusters_to_alloc > contig_clusters)
- clusters_to_alloc = contig_clusters;
-
- /* allocate extent and insert them into the extent tree */
- ret = ocfs2_extend_allocation(inode, cpos,
- clusters_to_alloc, 0);
- if (ret < 0) {
- up_write(&OCFS2_I(inode)->ip_alloc_sem);
- mlog_errno(ret);
- goto bail;
- }
-
- ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
- &contig_blocks, &ext_flags);
- if (ret < 0) {
- up_write(&OCFS2_I(inode)->ip_alloc_sem);
- mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
- (unsigned long long)iblock);
- ret = -EIO;
- goto bail;
- }
- set_buffer_new(bh_result);
- up_write(&OCFS2_I(inode)->ip_alloc_sem);
- }
-
- /*
- * get_more_blocks() expects us to describe a hole by clearing
- * the mapped bit on bh_result().
- *
- * Consider an unwritten extent as a hole.
- */
- if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
- map_bh(bh_result, inode->i_sb, p_blkno);
- else
- clear_buffer_mapped(bh_result);
-
- /* make sure we don't map more than max_blocks blocks here as
- that's all the kernel will handle at this point. */
- if (max_blocks < contig_blocks)
- contig_blocks = max_blocks;
- bh_result->b_size = contig_blocks << blocksize_bits;
-bail:
- if (alloc_locked)
- ocfs2_inode_unlock(inode, 1);
- return ret;
-}
-
-/*
- * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're
- * particularly interested in the aio/dio case. We use the rw_lock DLM lock
- * to protect io on one node from truncation on another.
- */
-static void ocfs2_dio_end_io(struct kiocb *iocb,
- loff_t offset,
- ssize_t bytes,
- void *private)
-{
- struct inode *inode = file_inode(iocb->ki_filp);
- int level;
-
- /* this io's submitter should not have unlocked this before we could */
- BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
-
- if (ocfs2_iocb_is_unaligned_aio(iocb)) {
- ocfs2_iocb_clear_unaligned_aio(iocb);
-
- mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio);
- }
-
- /* Let rw unlock to be done later to protect append direct io write */
- if (offset + bytes <= i_size_read(inode)) {
- ocfs2_iocb_clear_rw_locked(iocb);
-
- level = ocfs2_iocb_rw_locked_level(iocb);
- ocfs2_rw_unlock(inode, level);
- }
-}
-
static int ocfs2_releasepage(struct page *page, gfp_t wait)
{
if (!page_has_buffers(page))
@@ -653,363 +506,6 @@ static int ocfs2_releasepage(struct page *page, gfp_t wait)
return try_to_free_buffers(page);
}
-static int ocfs2_is_overwrite(struct ocfs2_super *osb,
- struct inode *inode, loff_t offset)
-{
- int ret = 0;
- u32 v_cpos = 0;
- u32 p_cpos = 0;
- unsigned int num_clusters = 0;
- unsigned int ext_flags = 0;
-
- v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset);
- ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos,
- &num_clusters, &ext_flags);
- if (ret < 0) {
- mlog_errno(ret);
- return ret;
- }
-
- if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN))
- return 1;
-
- return 0;
-}
-
-static int ocfs2_direct_IO_zero_extend(struct ocfs2_super *osb,
- struct inode *inode, loff_t offset,
- u64 zero_len, int cluster_align)
-{
- u32 p_cpos = 0;
- u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, i_size_read(inode));
- unsigned int num_clusters = 0;
- unsigned int ext_flags = 0;
- int ret = 0;
-
- if (offset <= i_size_read(inode) || cluster_align)
- return 0;
-
- ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, &num_clusters,
- &ext_flags);
- if (ret < 0) {
- mlog_errno(ret);
- return ret;
- }
-
- if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
- u64 s = i_size_read(inode);
- sector_t sector = ((u64)p_cpos << (osb->s_clustersize_bits - 9)) +
- (do_div(s, osb->s_clustersize) >> 9);
-
- ret = blkdev_issue_zeroout(osb->sb->s_bdev, sector,
- zero_len >> 9, GFP_NOFS, false);
- if (ret < 0)
- mlog_errno(ret);
- }
-
- return ret;
-}
-
-static int ocfs2_direct_IO_extend_no_holes(struct ocfs2_super *osb,
- struct inode *inode, loff_t offset)
-{
- u64 zero_start, zero_len, total_zero_len;
- u32 p_cpos = 0, clusters_to_add;
- u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, i_size_read(inode));
- unsigned int num_clusters = 0;
- unsigned int ext_flags = 0;
- u32 size_div, offset_div;
- int ret = 0;
-
- {
- u64 o = offset;
- u64 s = i_size_read(inode);
-
- offset_div = do_div(o, osb->s_clustersize);
- size_div = do_div(s, osb->s_clustersize);
- }
-
- if (offset <= i_size_read(inode))
- return 0;
-
- clusters_to_add = ocfs2_bytes_to_clusters(inode->i_sb, offset) -
- ocfs2_bytes_to_clusters(inode->i_sb, i_size_read(inode));
- total_zero_len = offset - i_size_read(inode);
- if (clusters_to_add)
- total_zero_len -= offset_div;
-
- /* Allocate clusters to fill out holes, and this is only needed
- * when we add more than one clusters. Otherwise the cluster will
- * be allocated during direct IO */
- if (clusters_to_add > 1) {
- ret = ocfs2_extend_allocation(inode,
- OCFS2_I(inode)->ip_clusters,
- clusters_to_add - 1, 0);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
- }
-
- while (total_zero_len) {
- ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, &num_clusters,
- &ext_flags);
- if (ret < 0) {
- mlog_errno(ret);
- goto out;
- }
-
- zero_start = ocfs2_clusters_to_bytes(osb->sb, p_cpos) +
- size_div;
- zero_len = ocfs2_clusters_to_bytes(osb->sb, num_clusters) -
- size_div;
- zero_len = min(total_zero_len, zero_len);
-
- if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
- ret = blkdev_issue_zeroout(osb->sb->s_bdev,
- zero_start >> 9, zero_len >> 9,
- GFP_NOFS, false);
- if (ret < 0) {
- mlog_errno(ret);
- goto out;
- }
- }
-
- total_zero_len -= zero_len;
- v_cpos += ocfs2_bytes_to_clusters(osb->sb, zero_len + size_div);
-
- /* Only at first iteration can be cluster not aligned.
- * So set size_div to 0 for the rest */
- size_div = 0;
- }
-
-out:
- return ret;
-}
-
-static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
- struct iov_iter *iter,
- loff_t offset)
-{
- ssize_t ret = 0;
- ssize_t written = 0;
- bool orphaned = false;
- int is_overwrite = 0;
- struct file *file = iocb->ki_filp;
- struct inode *inode = file_inode(file)->i_mapping->host;
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- struct buffer_head *di_bh = NULL;
- size_t count = iter->count;
- journal_t *journal = osb->journal->j_journal;
- u64 zero_len_head, zero_len_tail;
- int cluster_align_head, cluster_align_tail;
- loff_t final_size = offset + count;
- int append_write = offset >= i_size_read(inode) ? 1 : 0;
- unsigned int num_clusters = 0;
- unsigned int ext_flags = 0;
-
- {
- u64 o = offset;
- u64 s = i_size_read(inode);
-
- zero_len_head = do_div(o, 1 << osb->s_clustersize_bits);
- cluster_align_head = !zero_len_head;
-
- zero_len_tail = osb->s_clustersize -
- do_div(s, osb->s_clustersize);
- if ((offset - i_size_read(inode)) < zero_len_tail)
- zero_len_tail = offset - i_size_read(inode);
- cluster_align_tail = !zero_len_tail;
- }
-
- /*
- * when final_size > inode->i_size, inode->i_size will be
- * updated after direct write, so add the inode to orphan
- * dir first.
- */
- if (final_size > i_size_read(inode)) {
- ret = ocfs2_add_inode_to_orphan(osb, inode);
- if (ret < 0) {
- mlog_errno(ret);
- goto out;
- }
- orphaned = true;
- }
-
- if (append_write) {
- ret = ocfs2_inode_lock(inode, NULL, 1);
- if (ret < 0) {
- mlog_errno(ret);
- goto clean_orphan;
- }
-
- /* zeroing out the previously allocated cluster tail
- * that but not zeroed */
- if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
- down_read(&OCFS2_I(inode)->ip_alloc_sem);
- ret = ocfs2_direct_IO_zero_extend(osb, inode, offset,
- zero_len_tail, cluster_align_tail);
- up_read(&OCFS2_I(inode)->ip_alloc_sem);
- } else {
- down_write(&OCFS2_I(inode)->ip_alloc_sem);
- ret = ocfs2_direct_IO_extend_no_holes(osb, inode,
- offset);
- up_write(&OCFS2_I(inode)->ip_alloc_sem);
- }
- if (ret < 0) {
- mlog_errno(ret);
- ocfs2_inode_unlock(inode, 1);
- goto clean_orphan;
- }
-
- is_overwrite = ocfs2_is_overwrite(osb, inode, offset);
- if (is_overwrite < 0) {
- mlog_errno(is_overwrite);
- ret = is_overwrite;
- ocfs2_inode_unlock(inode, 1);
- goto clean_orphan;
- }
-
- ocfs2_inode_unlock(inode, 1);
- }
-
- written = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter,
- offset, ocfs2_direct_IO_get_blocks,
- ocfs2_dio_end_io, NULL, 0);
- /* overwrite aio may return -EIOCBQUEUED, and it is not an error */
- if ((written < 0) && (written != -EIOCBQUEUED)) {
- loff_t i_size = i_size_read(inode);
-
- if (offset + count > i_size) {
- ret = ocfs2_inode_lock(inode, &di_bh, 1);
- if (ret < 0) {
- mlog_errno(ret);
- goto clean_orphan;
- }
-
- if (i_size == i_size_read(inode)) {
- ret = ocfs2_truncate_file(inode, di_bh,
- i_size);
- if (ret < 0) {
- if (ret != -ENOSPC)
- mlog_errno(ret);
-
- ocfs2_inode_unlock(inode, 1);
- brelse(di_bh);
- di_bh = NULL;
- goto clean_orphan;
- }
- }
-
- ocfs2_inode_unlock(inode, 1);
- brelse(di_bh);
- di_bh = NULL;
-
- ret = jbd2_journal_force_commit(journal);
- if (ret < 0)
- mlog_errno(ret);
- }
- } else if (written > 0 && append_write && !is_overwrite &&
- !cluster_align_head) {
- /* zeroing out the allocated cluster head */
- u32 p_cpos = 0;
- u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset);
-
- ret = ocfs2_inode_lock(inode, NULL, 0);
- if (ret < 0) {
- mlog_errno(ret);
- goto clean_orphan;
- }
-
- ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos,
- &num_clusters, &ext_flags);
- if (ret < 0) {
- mlog_errno(ret);
- ocfs2_inode_unlock(inode, 0);
- goto clean_orphan;
- }
-
- BUG_ON(!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN));
-
- ret = blkdev_issue_zeroout(osb->sb->s_bdev,
- (u64)p_cpos << (osb->s_clustersize_bits - 9),
- zero_len_head >> 9, GFP_NOFS, false);
- if (ret < 0)
- mlog_errno(ret);
-
- ocfs2_inode_unlock(inode, 0);
- }
-
-clean_orphan:
- if (orphaned) {
- int tmp_ret;
- int update_isize = written > 0 ? 1 : 0;
- loff_t end = update_isize ? offset + written : 0;
-
- tmp_ret = ocfs2_inode_lock(inode, &di_bh, 1);
- if (tmp_ret < 0) {
- ret = tmp_ret;
- mlog_errno(ret);
- goto out;
- }
-
- tmp_ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh,
- update_isize, end);
- if (tmp_ret < 0) {
- ocfs2_inode_unlock(inode, 1);
- ret = tmp_ret;
- mlog_errno(ret);
- brelse(di_bh);
- goto out;
- }
-
- ocfs2_inode_unlock(inode, 1);
- brelse(di_bh);
-
- tmp_ret = jbd2_journal_force_commit(journal);
- if (tmp_ret < 0) {
- ret = tmp_ret;
- mlog_errno(tmp_ret);
- }
- }
-
-out:
- if (ret >= 0)
- ret = written;
- return ret;
-}
-
-static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
- loff_t offset)
-{
- struct file *file = iocb->ki_filp;
- struct inode *inode = file_inode(file)->i_mapping->host;
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- int full_coherency = !(osb->s_mount_opt &
- OCFS2_MOUNT_COHERENCY_BUFFERED);
-
- /*
- * Fallback to buffered I/O if we see an inode without
- * extents.
- */
- if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
- return 0;
-
- /* Fallback to buffered I/O if we are appending and
- * concurrent O_DIRECT writes are allowed.
- */
- if (i_size_read(inode) <= offset && !full_coherency)
- return 0;
-
- if (iov_iter_rw(iter) == READ)
- return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
- iter, offset,
- ocfs2_direct_IO_get_blocks,
- ocfs2_dio_end_io, NULL, 0);
- else
- return ocfs2_direct_IO_write(iocb, iter, offset);
-}
-
static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb,
u32 cpos,
unsigned int *start,
@@ -1196,6 +692,13 @@ next_bh:
#define OCFS2_MAX_CLUSTERS_PER_PAGE (PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE)
+struct ocfs2_unwritten_extent {
+ struct list_head ue_node;
+ struct list_head ue_ip_node;
+ u32 ue_cpos;
+ u32 ue_phys;
+};
+
/*
* Describe the state of a single cluster to be written to.
*/
@@ -1207,7 +710,7 @@ struct ocfs2_write_cluster_desc {
* filled.
*/
unsigned c_new;
- unsigned c_unwritten;
+ unsigned c_clear_unwritten;
unsigned c_needs_zero;
};
@@ -1219,6 +722,9 @@ struct ocfs2_write_ctxt {
/* First cluster allocated in a nonsparse extend */
u32 w_first_new_cpos;
+ /* Type of caller. Must be one of buffer, mmap, direct. */
+ ocfs2_write_type_t w_type;
+
struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE];
/*
@@ -1267,6 +773,8 @@ struct ocfs2_write_ctxt {
struct buffer_head *w_di_bh;
struct ocfs2_cached_dealloc_ctxt w_dealloc;
+
+ struct list_head w_unwritten_list;
};
void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages)
@@ -1305,8 +813,25 @@ static void ocfs2_unlock_pages(struct ocfs2_write_ctxt *wc)
ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages);
}
-static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
+static void ocfs2_free_unwritten_list(struct inode *inode,
+ struct list_head *head)
{
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_unwritten_extent *ue = NULL, *tmp = NULL;
+
+ list_for_each_entry_safe(ue, tmp, head, ue_node) {
+ list_del(&ue->ue_node);
+ spin_lock(&oi->ip_lock);
+ list_del(&ue->ue_ip_node);
+ spin_unlock(&oi->ip_lock);
+ kfree(ue);
+ }
+}
+
+static void ocfs2_free_write_ctxt(struct inode *inode,
+ struct ocfs2_write_ctxt *wc)
+{
+ ocfs2_free_unwritten_list(inode, &wc->w_unwritten_list);
ocfs2_unlock_pages(wc);
brelse(wc->w_di_bh);
kfree(wc);
@@ -1314,7 +839,8 @@ static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
struct ocfs2_super *osb, loff_t pos,
- unsigned len, struct buffer_head *di_bh)
+ unsigned len, ocfs2_write_type_t type,
+ struct buffer_head *di_bh)
{
u32 cend;
struct ocfs2_write_ctxt *wc;
@@ -1329,6 +855,7 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
wc->w_clen = cend - wc->w_cpos + 1;
get_bh(di_bh);
wc->w_di_bh = di_bh;
+ wc->w_type = type;
if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits))
wc->w_large_pages = 1;
@@ -1336,6 +863,7 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
wc->w_large_pages = 0;
ocfs2_init_dealloc_ctxt(&wc->w_dealloc);
+ INIT_LIST_HEAD(&wc->w_unwritten_list);
*wcp = wc;
@@ -1396,12 +924,13 @@ static void ocfs2_write_failure(struct inode *inode,
to = user_pos + user_len;
struct page *tmppage;
- ocfs2_zero_new_buffers(wc->w_target_page, from, to);
+ if (wc->w_target_page)
+ ocfs2_zero_new_buffers(wc->w_target_page, from, to);
for(i = 0; i < wc->w_num_pages; i++) {
tmppage = wc->w_pages[i];
- if (page_has_buffers(tmppage)) {
+ if (tmppage && page_has_buffers(tmppage)) {
if (ocfs2_should_order_data(inode))
ocfs2_jbd2_file_inode(wc->w_handle, inode);
@@ -1531,11 +1060,13 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
wc->w_num_pages = 1;
start = target_index;
}
+ end_index = (user_pos + user_len - 1) >> PAGE_CACHE_SHIFT;
for(i = 0; i < wc->w_num_pages; i++) {
index = start + i;
- if (index == target_index && mmap_page) {
+ if (index >= target_index && index <= end_index &&
+ wc->w_type == OCFS2_WRITE_MMAP) {
/*
* ocfs2_pagemkwrite() is a little different
* and wants us to directly use the page
@@ -1554,6 +1085,11 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
page_cache_get(mmap_page);
wc->w_pages[i] = mmap_page;
wc->w_target_locked = true;
+ } else if (index >= target_index && index <= end_index &&
+ wc->w_type == OCFS2_WRITE_DIRECT) {
+ /* Direct write has no mapping page. */
+ wc->w_pages[i] = NULL;
+ continue;
} else {
wc->w_pages[i] = find_or_create_page(mapping, index,
GFP_NOFS);
@@ -1578,19 +1114,20 @@ out:
* Prepare a single cluster for write one cluster into the file.
*/
static int ocfs2_write_cluster(struct address_space *mapping,
- u32 phys, unsigned int unwritten,
+ u32 *phys, unsigned int new,
+ unsigned int clear_unwritten,
unsigned int should_zero,
struct ocfs2_alloc_context *data_ac,
struct ocfs2_alloc_context *meta_ac,
struct ocfs2_write_ctxt *wc, u32 cpos,
loff_t user_pos, unsigned user_len)
{
- int ret, i, new;
- u64 v_blkno, p_blkno;
+ int ret, i;
+ u64 p_blkno;
struct inode *inode = mapping->host;
struct ocfs2_extent_tree et;
+ int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
- new = phys == 0 ? 1 : 0;
if (new) {
u32 tmp_pos;
@@ -1600,9 +1137,9 @@ static int ocfs2_write_cluster(struct address_space *mapping,
*/
tmp_pos = cpos;
ret = ocfs2_add_inode_data(OCFS2_SB(inode->i_sb), inode,
- &tmp_pos, 1, 0, wc->w_di_bh,
- wc->w_handle, data_ac,
- meta_ac, NULL);
+ &tmp_pos, 1, !clear_unwritten,
+ wc->w_di_bh, wc->w_handle,
+ data_ac, meta_ac, NULL);
/*
* This shouldn't happen because we must have already
* calculated the correct meta data allocation required. The
@@ -1619,11 +1156,11 @@ static int ocfs2_write_cluster(struct address_space *mapping,
mlog_errno(ret);
goto out;
}
- } else if (unwritten) {
+ } else if (clear_unwritten) {
ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode),
wc->w_di_bh);
ret = ocfs2_mark_extent_written(inode, &et,
- wc->w_handle, cpos, 1, phys,
+ wc->w_handle, cpos, 1, *phys,
meta_ac, &wc->w_dealloc);
if (ret < 0) {
mlog_errno(ret);
@@ -1631,30 +1168,33 @@ static int ocfs2_write_cluster(struct address_space *mapping,
}
}
- if (should_zero)
- v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, cpos);
- else
- v_blkno = user_pos >> inode->i_sb->s_blocksize_bits;
-
/*
* The only reason this should fail is due to an inability to
* find the extent added.
*/
- ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL,
- NULL);
+ ret = ocfs2_get_clusters(inode, cpos, phys, NULL, NULL);
if (ret < 0) {
mlog(ML_ERROR, "Get physical blkno failed for inode %llu, "
- "at logical block %llu",
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- (unsigned long long)v_blkno);
+ "at logical cluster %u",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
goto out;
}
- BUG_ON(p_blkno == 0);
+ BUG_ON(*phys == 0);
+
+ p_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *phys);
+ if (!should_zero)
+ p_blkno += (user_pos >> inode->i_sb->s_blocksize_bits) & (u64)(bpc - 1);
for(i = 0; i < wc->w_num_pages; i++) {
int tmpret;
+ /* This is the direct io target page. */
+ if (wc->w_pages[i] == NULL) {
+ p_blkno++;
+ continue;
+ }
+
tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc,
wc->w_pages[i], cpos,
user_pos, user_len,
@@ -1701,8 +1241,9 @@ static int ocfs2_write_cluster_by_desc(struct address_space *mapping,
if ((cluster_off + local_len) > osb->s_clustersize)
local_len = osb->s_clustersize - cluster_off;
- ret = ocfs2_write_cluster(mapping, desc->c_phys,
- desc->c_unwritten,
+ ret = ocfs2_write_cluster(mapping, &desc->c_phys,
+ desc->c_new,
+ desc->c_clear_unwritten,
desc->c_needs_zero,
data_ac, meta_ac,
wc, desc->c_cpos, pos, local_len);
@@ -1773,6 +1314,66 @@ static void ocfs2_set_target_boundaries(struct ocfs2_super *osb,
}
/*
+ * Check if this extent is marked UNWRITTEN by direct io. If so, we need not to
+ * do the zero work. And should not to clear UNWRITTEN since it will be cleared
+ * by the direct io procedure.
+ * If this is a new extent that allocated by direct io, we should mark it in
+ * the ip_unwritten_list.
+ */
+static int ocfs2_unwritten_check(struct inode *inode,
+ struct ocfs2_write_ctxt *wc,
+ struct ocfs2_write_cluster_desc *desc)
+{
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_unwritten_extent *ue = NULL, *new = NULL;
+ int ret = 0;
+
+ if (!desc->c_needs_zero)
+ return 0;
+
+retry:
+ spin_lock(&oi->ip_lock);
+ /* Needs not to zero no metter buffer or direct. The one who is zero
+ * the cluster is doing zero. And he will clear unwritten after all
+ * cluster io finished. */
+ list_for_each_entry(ue, &oi->ip_unwritten_list, ue_ip_node) {
+ if (desc->c_cpos == ue->ue_cpos) {
+ BUG_ON(desc->c_new);
+ desc->c_needs_zero = 0;
+ desc->c_clear_unwritten = 0;
+ goto unlock;
+ }
+ }
+
+ if (wc->w_type != OCFS2_WRITE_DIRECT)
+ goto unlock;
+
+ if (new == NULL) {
+ spin_unlock(&oi->ip_lock);
+ new = kmalloc(sizeof(struct ocfs2_unwritten_extent),
+ GFP_NOFS);
+ if (new == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ goto retry;
+ }
+ /* This direct write will doing zero. */
+ new->ue_cpos = desc->c_cpos;
+ new->ue_phys = desc->c_phys;
+ desc->c_clear_unwritten = 0;
+ list_add_tail(&new->ue_ip_node, &oi->ip_unwritten_list);
+ list_add_tail(&new->ue_node, &wc->w_unwritten_list);
+ new = NULL;
+unlock:
+ spin_unlock(&oi->ip_lock);
+out:
+ if (new)
+ kfree(new);
+ return ret;
+}
+
+/*
* Populate each single-cluster write descriptor in the write context
* with information about the i/o to be done.
*
@@ -1847,14 +1448,21 @@ static int ocfs2_populate_write_desc(struct inode *inode,
if (phys == 0) {
desc->c_new = 1;
desc->c_needs_zero = 1;
+ desc->c_clear_unwritten = 1;
*clusters_to_alloc = *clusters_to_alloc + 1;
}
if (ext_flags & OCFS2_EXT_UNWRITTEN) {
- desc->c_unwritten = 1;
+ desc->c_clear_unwritten = 1;
desc->c_needs_zero = 1;
}
+ ret = ocfs2_unwritten_check(inode, wc, desc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
num_clusters--;
}
@@ -2017,8 +1625,10 @@ static int ocfs2_expand_nonsparse_inode(struct inode *inode,
if (ret)
mlog_errno(ret);
- wc->w_first_new_cpos =
- ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode));
+ /* There is no wc if this is call from direct. */
+ if (wc)
+ wc->w_first_new_cpos =
+ ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode));
return ret;
}
@@ -2072,9 +1682,8 @@ out:
return ret;
}
-int ocfs2_write_begin_nolock(struct file *filp,
- struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+int ocfs2_write_begin_nolock(struct address_space *mapping,
+ loff_t pos, unsigned len, ocfs2_write_type_t type,
struct page **pagep, void **fsdata,
struct buffer_head *di_bh, struct page *mmap_page)
{
@@ -2091,7 +1700,7 @@ int ocfs2_write_begin_nolock(struct file *filp,
int try_free = 1, ret1;
try_again:
- ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh);
+ ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, type, di_bh);
if (ret) {
mlog_errno(ret);
return ret;
@@ -2110,14 +1719,17 @@ try_again:
}
}
- if (ocfs2_sparse_alloc(osb))
- ret = ocfs2_zero_tail(inode, di_bh, pos);
- else
- ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, len,
- wc);
- if (ret) {
- mlog_errno(ret);
- goto out;
+ /* Direct io change i_size late, should not zero tail here. */
+ if (type != OCFS2_WRITE_DIRECT) {
+ if (ocfs2_sparse_alloc(osb))
+ ret = ocfs2_zero_tail(inode, di_bh, pos);
+ else
+ ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos,
+ len, wc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
}
ret = ocfs2_check_range_for_refcount(inode, pos, len);
@@ -2148,7 +1760,7 @@ try_again:
(unsigned long long)OCFS2_I(inode)->ip_blkno,
(long long)i_size_read(inode),
le32_to_cpu(di->i_clusters),
- pos, len, flags, mmap_page,
+ pos, len, type, mmap_page,
clusters_to_alloc, extents_to_split);
/*
@@ -2178,17 +1790,17 @@ try_again:
credits = ocfs2_calc_extend_credits(inode->i_sb,
&di->id2.i_list);
-
- }
+ } else if (type == OCFS2_WRITE_DIRECT)
+ /* direct write needs not to start trans if no extents alloc. */
+ goto success;
/*
* We have to zero sparse allocated clusters, unwritten extent clusters,
* and non-sparse clusters we just extended. For non-sparse writes,
* we know zeros will only be needed in the first and/or last cluster.
*/
- if (clusters_to_alloc || extents_to_split ||
- (wc->w_clen && (wc->w_desc[0].c_needs_zero ||
- wc->w_desc[wc->w_clen - 1].c_needs_zero)))
+ if (wc->w_clen && (wc->w_desc[0].c_needs_zero ||
+ wc->w_desc[wc->w_clen - 1].c_needs_zero))
cluster_of_pages = 1;
else
cluster_of_pages = 0;
@@ -2255,7 +1867,8 @@ try_again:
ocfs2_free_alloc_context(meta_ac);
success:
- *pagep = wc->w_target_page;
+ if (pagep)
+ *pagep = wc->w_target_page;
*fsdata = wc;
return 0;
out_quota:
@@ -2266,7 +1879,7 @@ out_commit:
ocfs2_commit_trans(osb, handle);
out:
- ocfs2_free_write_ctxt(wc);
+ ocfs2_free_write_ctxt(inode, wc);
if (data_ac) {
ocfs2_free_alloc_context(data_ac);
@@ -2318,8 +1931,8 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping,
*/
down_write(&OCFS2_I(inode)->ip_alloc_sem);
- ret = ocfs2_write_begin_nolock(file, mapping, pos, len, flags, pagep,
- fsdata, di_bh, NULL);
+ ret = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_BUFFER,
+ pagep, fsdata, di_bh, NULL);
if (ret) {
mlog_errno(ret);
goto out_fail;
@@ -2376,12 +1989,16 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
handle_t *handle = wc->w_handle;
struct page *tmppage;
- ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (ret) {
- copied = ret;
- mlog_errno(ret);
- goto out;
+ BUG_ON(!list_empty(&wc->w_unwritten_list));
+
+ if (handle) {
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
+ wc->w_di_bh, OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ copied = ret;
+ mlog_errno(ret);
+ goto out;
+ }
}
if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
@@ -2389,18 +2006,23 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
goto out_write_size;
}
- if (unlikely(copied < len)) {
+ if (unlikely(copied < len) && wc->w_target_page) {
if (!PageUptodate(wc->w_target_page))
copied = 0;
ocfs2_zero_new_buffers(wc->w_target_page, start+copied,
start+len);
}
- flush_dcache_page(wc->w_target_page);
+ if (wc->w_target_page)
+ flush_dcache_page(wc->w_target_page);
for(i = 0; i < wc->w_num_pages; i++) {
tmppage = wc->w_pages[i];
+ /* This is the direct io target page. */
+ if (tmppage == NULL)
+ continue;
+
if (tmppage == wc->w_target_page) {
from = wc->w_target_from;
to = wc->w_target_to;
@@ -2419,25 +2041,29 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
}
if (page_has_buffers(tmppage)) {
- if (ocfs2_should_order_data(inode))
- ocfs2_jbd2_file_inode(wc->w_handle, inode);
+ if (handle && ocfs2_should_order_data(inode))
+ ocfs2_jbd2_file_inode(handle, inode);
block_commit_write(tmppage, from, to);
}
}
out_write_size:
- pos += copied;
- if (pos > i_size_read(inode)) {
- i_size_write(inode, pos);
- mark_inode_dirty(inode);
- }
- inode->i_blocks = ocfs2_inode_sector_count(inode);
- di->i_size = cpu_to_le64((u64)i_size_read(inode));
- inode->i_mtime = inode->i_ctime = CURRENT_TIME;
- di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
- di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
- ocfs2_update_inode_fsync_trans(handle, inode, 1);
- ocfs2_journal_dirty(handle, wc->w_di_bh);
+ /* Direct io do not update i_size here. */
+ if (wc->w_type != OCFS2_WRITE_DIRECT) {
+ pos += copied;
+ if (pos > i_size_read(inode)) {
+ i_size_write(inode, pos);
+ mark_inode_dirty(inode);
+ }
+ inode->i_blocks = ocfs2_inode_sector_count(inode);
+ di->i_size = cpu_to_le64((u64)i_size_read(inode));
+ inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
+ di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+ ocfs2_update_inode_fsync_trans(handle, inode, 1);
+ }
+ if (handle)
+ ocfs2_journal_dirty(handle, wc->w_di_bh);
out:
/* unlock pages before dealloc since it needs acquiring j_trans_barrier
@@ -2447,7 +2073,8 @@ out:
*/
ocfs2_unlock_pages(wc);
- ocfs2_commit_trans(osb, handle);
+ if (handle)
+ ocfs2_commit_trans(osb, handle);
ocfs2_run_deallocs(osb, &wc->w_dealloc);
@@ -2472,6 +2099,360 @@ static int ocfs2_write_end(struct file *file, struct address_space *mapping,
return ret;
}
+struct ocfs2_dio_write_ctxt {
+ struct list_head dw_zero_list;
+ unsigned dw_zero_count;
+ int dw_orphaned;
+ pid_t dw_writer_pid;
+};
+
+static struct ocfs2_dio_write_ctxt *
+ocfs2_dio_alloc_write_ctx(struct buffer_head *bh, int *alloc)
+{
+ struct ocfs2_dio_write_ctxt *dwc = NULL;
+
+ if (bh->b_private)
+ return bh->b_private;
+
+ dwc = kmalloc(sizeof(struct ocfs2_dio_write_ctxt), GFP_NOFS);
+ if (dwc == NULL)
+ return NULL;
+ INIT_LIST_HEAD(&dwc->dw_zero_list);
+ dwc->dw_zero_count = 0;
+ dwc->dw_orphaned = 0;
+ dwc->dw_writer_pid = task_pid_nr(current);
+ bh->b_private = dwc;
+ *alloc = 1;
+
+ return dwc;
+}
+
+static void ocfs2_dio_free_write_ctx(struct inode *inode,
+ struct ocfs2_dio_write_ctxt *dwc)
+{
+ ocfs2_free_unwritten_list(inode, &dwc->dw_zero_list);
+ kfree(dwc);
+}
+
+/*
+ * TODO: Make this into a generic get_blocks function.
+ *
+ * From do_direct_io in direct-io.c:
+ * "So what we do is to permit the ->get_blocks function to populate
+ * bh.b_size with the size of IO which is permitted at this offset and
+ * this i_blkbits."
+ *
+ * This function is called directly from get_more_blocks in direct-io.c.
+ *
+ * called like this: dio->get_blocks(dio->inode, fs_startblk,
+ * fs_count, map_bh, dio->rw == WRITE);
+ */
+static int ocfs2_dio_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_write_ctxt *wc;
+ struct ocfs2_write_cluster_desc *desc = NULL;
+ struct ocfs2_dio_write_ctxt *dwc = NULL;
+ struct buffer_head *di_bh = NULL;
+ u64 p_blkno;
+ loff_t pos = iblock << inode->i_sb->s_blocksize_bits;
+ unsigned len, total_len = bh_result->b_size;
+ int ret = 0, first_get_block = 0;
+
+ len = osb->s_clustersize - (pos & (osb->s_clustersize - 1));
+ len = min(total_len, len);
+
+ mlog(0, "get block of %lu at %llu:%u req %u\n",
+ inode->i_ino, pos, len, total_len);
+
+ /*
+ * Because we need to change file size in ocfs2_dio_end_io_write(), or
+ * we may need to add it to orphan dir. So can not fall to fast path
+ * while file size will be changed.
+ */
+ if (pos + total_len <= i_size_read(inode)) {
+ down_read(&oi->ip_alloc_sem);
+ /* This is the fast path for re-write. */
+ ret = ocfs2_get_block(inode, iblock, bh_result, create);
+
+ up_read(&oi->ip_alloc_sem);
+
+ if (buffer_mapped(bh_result) &&
+ !buffer_new(bh_result) &&
+ ret == 0)
+ goto out;
+
+ /* Clear state set by ocfs2_get_block. */
+ bh_result->b_state = 0;
+ }
+
+ dwc = ocfs2_dio_alloc_write_ctx(bh_result, &first_get_block);
+ if (unlikely(dwc == NULL)) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (ocfs2_clusters_for_bytes(inode->i_sb, pos + total_len) >
+ ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode)) &&
+ !dwc->dw_orphaned) {
+ /*
+ * when we are going to alloc extents beyond file size, add the
+ * inode to orphan dir, so we can recall those spaces when
+ * system crashed during write.
+ */
+ ret = ocfs2_add_inode_to_orphan(osb, inode);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+ dwc->dw_orphaned = 1;
+ }
+
+ ret = ocfs2_inode_lock(inode, &di_bh, 1);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ down_write(&oi->ip_alloc_sem);
+
+ if (first_get_block) {
+ if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
+ ret = ocfs2_zero_tail(inode, di_bh, pos);
+ else
+ ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos,
+ total_len, NULL);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto unlock;
+ }
+ }
+
+ ret = ocfs2_write_begin_nolock(inode->i_mapping, pos, len,
+ OCFS2_WRITE_DIRECT, NULL,
+ (void **)&wc, di_bh, NULL);
+ if (ret) {
+ mlog_errno(ret);
+ goto unlock;
+ }
+
+ desc = &wc->w_desc[0];
+
+ p_blkno = ocfs2_clusters_to_blocks(inode->i_sb, desc->c_phys);
+ BUG_ON(p_blkno == 0);
+ p_blkno += iblock & (u64)(ocfs2_clusters_to_blocks(inode->i_sb, 1) - 1);
+
+ map_bh(bh_result, inode->i_sb, p_blkno);
+ bh_result->b_size = len;
+ if (desc->c_needs_zero)
+ set_buffer_new(bh_result);
+
+ /* May sleep in end_io. It should not happen in a irq context. So defer
+ * it to dio work queue. */
+ set_buffer_defer_completion(bh_result);
+
+ if (!list_empty(&wc->w_unwritten_list)) {
+ struct ocfs2_unwritten_extent *ue = NULL;
+
+ ue = list_first_entry(&wc->w_unwritten_list,
+ struct ocfs2_unwritten_extent,
+ ue_node);
+ BUG_ON(ue->ue_cpos != desc->c_cpos);
+ /* The physical address may be 0, fill it. */
+ ue->ue_phys = desc->c_phys;
+
+ list_splice_tail_init(&wc->w_unwritten_list, &dwc->dw_zero_list);
+ dwc->dw_zero_count++;
+ }
+
+ ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len, NULL, wc);
+ BUG_ON(ret != len);
+ ret = 0;
+unlock:
+ up_write(&oi->ip_alloc_sem);
+ ocfs2_inode_unlock(inode, 1);
+ brelse(di_bh);
+out:
+ if (ret < 0)
+ ret = -EIO;
+ return ret;
+}
+
+static void ocfs2_dio_end_io_write(struct inode *inode,
+ struct ocfs2_dio_write_ctxt *dwc,
+ loff_t offset,
+ ssize_t bytes)
+{
+ struct ocfs2_cached_dealloc_ctxt dealloc;
+ struct ocfs2_extent_tree et;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_unwritten_extent *ue = NULL;
+ struct buffer_head *di_bh = NULL;
+ struct ocfs2_dinode *di;
+ struct ocfs2_alloc_context *data_ac = NULL;
+ struct ocfs2_alloc_context *meta_ac = NULL;
+ handle_t *handle = NULL;
+ loff_t end = offset + bytes;
+ int ret = 0, credits = 0, locked = 0;
+
+ ocfs2_init_dealloc_ctxt(&dealloc);
+
+ /* We do clear unwritten, delete orphan, change i_size here. If neither
+ * of these happen, we can skip all this. */
+ if (list_empty(&dwc->dw_zero_list) &&
+ end <= i_size_read(inode) &&
+ !dwc->dw_orphaned)
+ goto out;
+
+ /* ocfs2_file_write_iter will get i_mutex, so we need not lock if we
+ * are in that context. */
+ if (dwc->dw_writer_pid != task_pid_nr(current)) {
+ mutex_lock(&inode->i_mutex);
+ locked = 1;
+ }
+
+ ret = ocfs2_inode_lock(inode, &di_bh, 1);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ down_write(&oi->ip_alloc_sem);
+
+ /* Delete orphan before acquire i_mutex. */
+ if (dwc->dw_orphaned) {
+ BUG_ON(dwc->dw_writer_pid != task_pid_nr(current));
+
+ end = end > i_size_read(inode) ? end : 0;
+
+ ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh,
+ !!end, end);
+ if (ret < 0)
+ mlog_errno(ret);
+ }
+
+ di = (struct ocfs2_dinode *)di_bh;
+
+ ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
+
+ ret = ocfs2_lock_allocators(inode, &et, 0, dwc->dw_zero_count*2,
+ &data_ac, &meta_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto unlock;
+ }
+
+ credits = ocfs2_calc_extend_credits(inode->i_sb, &di->id2.i_list);
+
+ handle = ocfs2_start_trans(osb, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto unlock;
+ }
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto commit;
+ }
+
+ list_for_each_entry(ue, &dwc->dw_zero_list, ue_node) {
+ ret = ocfs2_mark_extent_written(inode, &et, handle,
+ ue->ue_cpos, 1,
+ ue->ue_phys,
+ meta_ac, &dealloc);
+ if (ret < 0) {
+ mlog_errno(ret);
+ break;
+ }
+ }
+
+ if (end > i_size_read(inode)) {
+ ret = ocfs2_set_inode_size(handle, inode, di_bh, end);
+ if (ret < 0)
+ mlog_errno(ret);
+ }
+commit:
+ ocfs2_commit_trans(osb, handle);
+unlock:
+ up_write(&oi->ip_alloc_sem);
+ ocfs2_inode_unlock(inode, 1);
+ brelse(di_bh);
+out:
+ if (data_ac)
+ ocfs2_free_alloc_context(data_ac);
+ if (meta_ac)
+ ocfs2_free_alloc_context(meta_ac);
+ ocfs2_run_deallocs(osb, &dealloc);
+ if (locked)
+ mutex_unlock(&inode->i_mutex);
+ ocfs2_dio_free_write_ctx(inode, dwc);
+}
+
+/*
+ * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're
+ * particularly interested in the aio/dio case. We use the rw_lock DLM lock
+ * to protect io on one node from truncation on another.
+ */
+static int ocfs2_dio_end_io(struct kiocb *iocb,
+ loff_t offset,
+ ssize_t bytes,
+ void *private)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ int level;
+
+ if (bytes <= 0)
+ return 0;
+
+ /* this io's submitter should not have unlocked this before we could */
+ BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
+
+ if (private)
+ ocfs2_dio_end_io_write(inode, private, offset, bytes);
+
+ ocfs2_iocb_clear_rw_locked(iocb);
+
+ level = ocfs2_iocb_rw_locked_level(iocb);
+ ocfs2_rw_unlock(inode, level);
+ return 0;
+}
+
+static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
+ loff_t offset)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file)->i_mapping->host;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ loff_t end = offset + iter->count;
+ get_block_t *get_block;
+
+ /*
+ * Fallback to buffered I/O if we see an inode without
+ * extents.
+ */
+ if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+ return 0;
+
+ /* Fallback to buffered I/O if we do not support append dio. */
+ if (end > i_size_read(inode) && !ocfs2_supports_append_dio(osb))
+ return 0;
+
+ if (iov_iter_rw(iter) == READ)
+ get_block = ocfs2_get_block;
+ else
+ get_block = ocfs2_dio_get_block;
+
+ return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
+ iter, offset, get_block,
+ ocfs2_dio_end_io, NULL, 0);
+}
+
const struct address_space_operations ocfs2_aops = {
.readpage = ocfs2_readpage,
.readpages = ocfs2_readpages,
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index 24e496d6bdcd..b1c9f28a57b1 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -47,9 +47,14 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata);
-int ocfs2_write_begin_nolock(struct file *filp,
- struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+typedef enum {
+ OCFS2_WRITE_BUFFER = 0,
+ OCFS2_WRITE_DIRECT,
+ OCFS2_WRITE_MMAP,
+} ocfs2_write_type_t;
+
+int ocfs2_write_begin_nolock(struct address_space *mapping,
+ loff_t pos, unsigned len, ocfs2_write_type_t type,
struct page **pagep, void **fsdata,
struct buffer_head *di_bh, struct page *mmap_page);
@@ -79,7 +84,6 @@ static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level)
enum ocfs2_iocb_lock_bits {
OCFS2_IOCB_RW_LOCK = 0,
OCFS2_IOCB_RW_LOCK_LEVEL,
- OCFS2_IOCB_UNALIGNED_IO,
OCFS2_IOCB_NUM_LOCKS
};
@@ -88,11 +92,4 @@ enum ocfs2_iocb_lock_bits {
#define ocfs2_iocb_rw_locked_level(iocb) \
test_bit(OCFS2_IOCB_RW_LOCK_LEVEL, (unsigned long *)&iocb->private)
-#define ocfs2_iocb_set_unaligned_aio(iocb) \
- set_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
-#define ocfs2_iocb_clear_unaligned_aio(iocb) \
- clear_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
-#define ocfs2_iocb_is_unaligned_aio(iocb) \
- test_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
-
#endif /* OCFS2_FILE_H */
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index a76b9ea7722e..bd15929b5f92 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -287,7 +287,6 @@ struct o2hb_bio_wait_ctxt {
static void o2hb_write_timeout(struct work_struct *work)
{
int failed, quorum;
- unsigned long flags;
struct o2hb_region *reg =
container_of(work, struct o2hb_region,
hr_write_timeout_work.work);
@@ -297,14 +296,14 @@ static void o2hb_write_timeout(struct work_struct *work)
jiffies_to_msecs(jiffies - reg->hr_last_timeout_start));
if (o2hb_global_heartbeat_active()) {
- spin_lock_irqsave(&o2hb_live_lock, flags);
+ spin_lock(&o2hb_live_lock);
if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
set_bit(reg->hr_region_num, o2hb_failed_region_bitmap);
failed = bitmap_weight(o2hb_failed_region_bitmap,
O2NM_MAX_REGIONS);
quorum = bitmap_weight(o2hb_quorum_region_bitmap,
O2NM_MAX_REGIONS);
- spin_unlock_irqrestore(&o2hb_live_lock, flags);
+ spin_unlock(&o2hb_live_lock);
mlog(ML_HEARTBEAT, "Number of regions %d, failed regions %d\n",
quorum, failed);
@@ -1445,8 +1444,8 @@ static void o2hb_region_release(struct config_item *item)
debugfs_remove(reg->hr_debug_dir);
kfree(reg->hr_db_livenodes);
kfree(reg->hr_db_regnum);
- kfree(reg->hr_debug_elapsed_time);
- kfree(reg->hr_debug_pinned);
+ kfree(reg->hr_db_elapsed_time);
+ kfree(reg->hr_db_pinned);
spin_lock(&o2hb_live_lock);
list_del(&reg->hr_all_item);
@@ -2425,11 +2424,10 @@ EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating);
int o2hb_check_node_heartbeating_no_sem(u8 node_num)
{
unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
- unsigned long flags;
- spin_lock_irqsave(&o2hb_live_lock, flags);
+ spin_lock(&o2hb_live_lock);
o2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map));
- spin_unlock_irqrestore(&o2hb_live_lock, flags);
+ spin_unlock(&o2hb_live_lock);
if (!test_bit(node_num, testing_map)) {
mlog(ML_HEARTBEAT,
"node (%u) does not have heartbeating enabled.\n",
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index ebe543894db0..b17d180bdc16 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -630,7 +630,6 @@ static void o2nm_cluster_release(struct config_item *item)
{
struct o2nm_cluster *cluster = to_o2nm_cluster(item);
- kfree(cluster->cl_group.default_groups);
kfree(cluster);
}
@@ -666,7 +665,6 @@ static struct config_group *o2nm_cluster_group_make_group(struct config_group *g
struct o2nm_cluster *cluster = NULL;
struct o2nm_node_group *ns = NULL;
struct config_group *o2hb_group = NULL, *ret = NULL;
- void *defs = NULL;
/* this runs under the parent dir's i_mutex; there can be only
* one caller in here at a time */
@@ -675,20 +673,18 @@ static struct config_group *o2nm_cluster_group_make_group(struct config_group *g
cluster = kzalloc(sizeof(struct o2nm_cluster), GFP_KERNEL);
ns = kzalloc(sizeof(struct o2nm_node_group), GFP_KERNEL);
- defs = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL);
o2hb_group = o2hb_alloc_hb_set();
- if (cluster == NULL || ns == NULL || o2hb_group == NULL || defs == NULL)
+ if (cluster == NULL || ns == NULL || o2hb_group == NULL)
goto out;
config_group_init_type_name(&cluster->cl_group, name,
&o2nm_cluster_type);
+ configfs_add_default_group(&ns->ns_group, &cluster->cl_group);
+
config_group_init_type_name(&ns->ns_group, "node",
&o2nm_node_group_type);
+ configfs_add_default_group(o2hb_group, &cluster->cl_group);
- cluster->cl_group.default_groups = defs;
- cluster->cl_group.default_groups[0] = &ns->ns_group;
- cluster->cl_group.default_groups[1] = o2hb_group;
- cluster->cl_group.default_groups[2] = NULL;
rwlock_init(&cluster->cl_nodes_lock);
cluster->cl_node_ip_tree = RB_ROOT;
cluster->cl_reconnect_delay_ms = O2NET_RECONNECT_DELAY_MS_DEFAULT;
@@ -704,7 +700,6 @@ out:
kfree(cluster);
kfree(ns);
o2hb_free_hb_set(o2hb_group);
- kfree(defs);
ret = ERR_PTR(-ENOMEM);
}
@@ -714,18 +709,11 @@ out:
static void o2nm_cluster_group_drop_item(struct config_group *group, struct config_item *item)
{
struct o2nm_cluster *cluster = to_o2nm_cluster(item);
- int i;
- struct config_item *killme;
BUG_ON(o2nm_single_cluster != cluster);
o2nm_single_cluster = NULL;
- for (i = 0; cluster->cl_group.default_groups[i]; i++) {
- killme = &cluster->cl_group.default_groups[i]->cg_item;
- cluster->cl_group.default_groups[i] = NULL;
- config_item_put(killme);
- }
-
+ configfs_remove_default_groups(&cluster->cl_group);
config_item_put(item);
}
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 68c607e63ff6..004f2cbe8f71 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -282,6 +282,7 @@ static inline void __dlm_set_joining_node(struct dlm_ctxt *dlm,
#define DLM_LOCK_RES_DROPPING_REF 0x00000040
#define DLM_LOCK_RES_BLOCK_DIRTY 0x00001000
#define DLM_LOCK_RES_SETREF_INPROG 0x00002000
+#define DLM_LOCK_RES_RECOVERY_WAITING 0x00004000
/* max milliseconds to wait to sync up a network failure with a node death */
#define DLM_NODE_DEATH_WAIT_MAX (5 * 1000)
@@ -451,6 +452,7 @@ enum {
DLM_QUERY_REGION = 519,
DLM_QUERY_NODEINFO = 520,
DLM_BEGIN_EXIT_DOMAIN_MSG = 521,
+ DLM_DEREF_LOCKRES_DONE = 522,
};
struct dlm_reco_node_data
@@ -545,7 +547,7 @@ struct dlm_master_requery
* };
*
* from ../cluster/tcp.h
- * NET_MAX_PAYLOAD_BYTES (4096 - sizeof(net_msg))
+ * O2NET_MAX_PAYLOAD_BYTES (4096 - sizeof(net_msg))
* (roughly 4080 bytes)
* and sizeof(dlm_migratable_lockres) = 112 bytes
* and sizeof(dlm_migratable_lock) = 16 bytes
@@ -586,7 +588,7 @@ struct dlm_migratable_lockres
/* from above, 128 bytes
* for some undetermined future use */
-#define DLM_MIG_LOCKRES_RESERVED (NET_MAX_PAYLOAD_BYTES - \
+#define DLM_MIG_LOCKRES_RESERVED (O2NET_MAX_PAYLOAD_BYTES - \
DLM_MIG_LOCKRES_MAX_LEN)
struct dlm_create_lock
@@ -782,6 +784,20 @@ struct dlm_deref_lockres
u8 name[O2NM_MAX_NAME_LEN];
};
+enum {
+ DLM_DEREF_RESPONSE_DONE = 0,
+ DLM_DEREF_RESPONSE_INPROG = 1,
+};
+
+struct dlm_deref_lockres_done {
+ u32 pad1;
+ u16 pad2;
+ u8 node_idx;
+ u8 namelen;
+
+ u8 name[O2NM_MAX_NAME_LEN];
+};
+
static inline enum dlm_status
__dlm_lockres_state_to_status(struct dlm_lock_resource *res)
{
@@ -789,7 +805,8 @@ __dlm_lockres_state_to_status(struct dlm_lock_resource *res)
assert_spin_locked(&res->spinlock);
- if (res->state & DLM_LOCK_RES_RECOVERING)
+ if (res->state & (DLM_LOCK_RES_RECOVERING|
+ DLM_LOCK_RES_RECOVERY_WAITING))
status = DLM_RECOVERING;
else if (res->state & DLM_LOCK_RES_MIGRATING)
status = DLM_MIGRATING;
@@ -968,6 +985,8 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data,
void dlm_assert_master_post_handler(int status, void *data, void *ret_data);
int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
void **ret_data);
+int dlm_deref_lockres_done_handler(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data);
int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
void **ret_data);
int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
@@ -1009,6 +1028,7 @@ static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res)
{
__dlm_wait_on_lockres_flags(res, (DLM_LOCK_RES_IN_PROGRESS|
DLM_LOCK_RES_RECOVERING|
+ DLM_LOCK_RES_RECOVERY_WAITING|
DLM_LOCK_RES_MIGRATING));
}
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index e36d63ff1783..cdeafb4e7ed6 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -212,6 +212,12 @@ grant:
if (lock->lksb->flags & DLM_LKSB_PUT_LVB)
memcpy(res->lvb, lock->lksb->lvb, DLM_LVB_LEN);
+ /*
+ * Move the lock to the tail because it may be the only lock which has
+ * an invalid lvb.
+ */
+ list_move_tail(&lock->list, &res->granted);
+
status = DLM_NORMAL;
*call_ast = 1;
goto unlock_exit;
@@ -262,6 +268,7 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
struct dlm_lock *lock, int flags, int type)
{
enum dlm_status status;
+ u8 old_owner = res->owner;
mlog(0, "type=%d, convert_type=%d, busy=%d\n", lock->ml.type,
lock->ml.convert_type, res->state & DLM_LOCK_RES_IN_PROGRESS);
@@ -287,6 +294,19 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
status = DLM_DENIED;
goto bail;
}
+
+ if (lock->ml.type == type && lock->ml.convert_type == LKM_IVMODE) {
+ mlog(0, "last convert request returned DLM_RECOVERING, but "
+ "owner has already queued and sent ast to me. res %.*s, "
+ "(cookie=%u:%llu, type=%d, conv=%d)\n",
+ res->lockname.len, res->lockname.name,
+ dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
+ lock->ml.type, lock->ml.convert_type);
+ status = DLM_NORMAL;
+ goto bail;
+ }
+
res->state |= DLM_LOCK_RES_IN_PROGRESS;
/* move lock to local convert queue */
/* do not alter lock refcount. switching lists. */
@@ -316,11 +336,19 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
spin_lock(&res->spinlock);
res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
lock->convert_pending = 0;
- /* if it failed, move it back to granted queue */
+ /* if it failed, move it back to granted queue.
+ * if master returns DLM_NORMAL and then down before sending ast,
+ * it may have already been moved to granted queue, reset to
+ * DLM_RECOVERING and retry convert */
if (status != DLM_NORMAL) {
if (status != DLM_NOTQUEUED)
dlm_error(status);
dlm_revert_pending_convert(res, lock);
+ } else if ((res->state & DLM_LOCK_RES_RECOVERING) ||
+ (old_owner != res->owner)) {
+ mlog(0, "res %.*s is in recovering or has been recovered.\n",
+ res->lockname.len, res->lockname.name);
+ status = DLM_RECOVERING;
}
bail:
spin_unlock(&res->spinlock);
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 2ee7fe747cea..12e064b8be9a 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -132,10 +132,13 @@ static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events);
* - Message DLM_QUERY_NODEINFO added to allow online node removes
* New in version 1.2:
* - Message DLM_BEGIN_EXIT_DOMAIN_MSG added to mark start of exit domain
+ * New in version 1.3:
+ * - Message DLM_DEREF_LOCKRES_DONE added to inform non-master that the
+ * refmap is cleared
*/
static const struct dlm_protocol_version dlm_protocol = {
.pv_major = 1,
- .pv_minor = 2,
+ .pv_minor = 3,
};
#define DLM_DOMAIN_BACKOFF_MS 200
@@ -1396,7 +1399,7 @@ static int dlm_send_join_cancels(struct dlm_ctxt *dlm,
unsigned int map_size)
{
int status, tmpstat;
- unsigned int node;
+ int node;
if (map_size != (BITS_TO_LONGS(O2NM_MAX_NODES) *
sizeof(unsigned long))) {
@@ -1853,7 +1856,13 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm)
sizeof(struct dlm_exit_domain),
dlm_begin_exit_domain_handler,
dlm, NULL, &dlm->dlm_domain_handlers);
+ if (status)
+ goto bail;
+ status = o2net_register_handler(DLM_DEREF_LOCKRES_DONE, dlm->key,
+ sizeof(struct dlm_deref_lockres_done),
+ dlm_deref_lockres_done_handler,
+ dlm, NULL, &dlm->dlm_domain_handlers);
bail:
if (status)
dlm_unregister_domain_handlers(dlm);
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 9477d6e1de37..9aed6e202201 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2278,7 +2278,7 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
dlm_print_one_lock_resource(res);
BUG();
}
- return ret;
+ return ret ? ret : r;
}
int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
@@ -2345,7 +2345,7 @@ int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
res->lockname.len, res->lockname.name, node);
dlm_print_one_lock_resource(res);
}
- ret = 0;
+ ret = DLM_DEREF_RESPONSE_DONE;
goto done;
}
@@ -2365,7 +2365,7 @@ int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
spin_unlock(&dlm->work_lock);
queue_work(dlm->dlm_worker, &dlm->dispatched_work);
- return 0;
+ return DLM_DEREF_RESPONSE_INPROG;
done:
if (res)
@@ -2375,6 +2375,122 @@ done:
return ret;
}
+int dlm_deref_lockres_done_handler(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data)
+{
+ struct dlm_ctxt *dlm = data;
+ struct dlm_deref_lockres_done *deref
+ = (struct dlm_deref_lockres_done *)msg->buf;
+ struct dlm_lock_resource *res = NULL;
+ char *name;
+ unsigned int namelen;
+ int ret = -EINVAL;
+ u8 node;
+ unsigned int hash;
+
+ if (!dlm_grab(dlm))
+ return 0;
+
+ name = deref->name;
+ namelen = deref->namelen;
+ node = deref->node_idx;
+
+ if (namelen > DLM_LOCKID_NAME_MAX) {
+ mlog(ML_ERROR, "Invalid name length!");
+ goto done;
+ }
+ if (deref->node_idx >= O2NM_MAX_NODES) {
+ mlog(ML_ERROR, "Invalid node number: %u\n", node);
+ goto done;
+ }
+
+ hash = dlm_lockid_hash(name, namelen);
+
+ spin_lock(&dlm->spinlock);
+ res = __dlm_lookup_lockres_full(dlm, name, namelen, hash);
+ if (!res) {
+ spin_unlock(&dlm->spinlock);
+ mlog(ML_ERROR, "%s:%.*s: bad lockres name\n",
+ dlm->name, namelen, name);
+ goto done;
+ }
+
+ spin_lock(&res->spinlock);
+ BUG_ON(!(res->state & DLM_LOCK_RES_DROPPING_REF));
+ if (!list_empty(&res->purge)) {
+ mlog(0, "%s: Removing res %.*s from purgelist\n",
+ dlm->name, res->lockname.len, res->lockname.name);
+ list_del_init(&res->purge);
+ dlm_lockres_put(res);
+ dlm->purge_count--;
+ }
+
+ if (!__dlm_lockres_unused(res)) {
+ mlog(ML_ERROR, "%s: res %.*s in use after deref\n",
+ dlm->name, res->lockname.len, res->lockname.name);
+ __dlm_print_one_lock_resource(res);
+ BUG();
+ }
+
+ __dlm_unhash_lockres(dlm, res);
+
+ spin_lock(&dlm->track_lock);
+ if (!list_empty(&res->tracking))
+ list_del_init(&res->tracking);
+ else {
+ mlog(ML_ERROR, "%s: Resource %.*s not on the Tracking list\n",
+ dlm->name, res->lockname.len, res->lockname.name);
+ __dlm_print_one_lock_resource(res);
+ }
+ spin_unlock(&dlm->track_lock);
+
+ /* lockres is not in the hash now. drop the flag and wake up
+ * any processes waiting in dlm_get_lock_resource.
+ */
+ res->state &= ~DLM_LOCK_RES_DROPPING_REF;
+ spin_unlock(&res->spinlock);
+ wake_up(&res->wq);
+
+ dlm_lockres_put(res);
+
+ spin_unlock(&dlm->spinlock);
+
+done:
+ dlm_put(dlm);
+ return ret;
+}
+
+static void dlm_drop_lockres_ref_done(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res, u8 node)
+{
+ struct dlm_deref_lockres_done deref;
+ int ret = 0, r;
+ const char *lockname;
+ unsigned int namelen;
+
+ lockname = res->lockname.name;
+ namelen = res->lockname.len;
+ BUG_ON(namelen > O2NM_MAX_NAME_LEN);
+
+ memset(&deref, 0, sizeof(deref));
+ deref.node_idx = dlm->node_num;
+ deref.namelen = namelen;
+ memcpy(deref.name, lockname, namelen);
+
+ ret = o2net_send_message(DLM_DEREF_LOCKRES_DONE, dlm->key,
+ &deref, sizeof(deref), node, &r);
+ if (ret < 0) {
+ mlog(ML_ERROR, "%s: res %.*s, error %d send DEREF DONE "
+ " to node %u\n", dlm->name, namelen,
+ lockname, ret, node);
+ } else if (r < 0) {
+ /* ignore the error */
+ mlog(ML_ERROR, "%s: res %.*s, DEREF to node %u got %d\n",
+ dlm->name, namelen, lockname, node, r);
+ dlm_print_one_lock_resource(res);
+ }
+}
+
static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
{
struct dlm_ctxt *dlm;
@@ -2395,6 +2511,8 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
}
spin_unlock(&res->spinlock);
+ dlm_drop_lockres_ref_done(dlm, res, node);
+
if (cleared) {
mlog(0, "%s:%.*s node %u ref dropped in dispatch\n",
dlm->name, res->lockname.len, res->lockname.name, node);
@@ -2432,7 +2550,8 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
return 0;
/* delay migration when the lockres is in RECOCERING state */
- if (res->state & DLM_LOCK_RES_RECOVERING)
+ if (res->state & (DLM_LOCK_RES_RECOVERING|
+ DLM_LOCK_RES_RECOVERY_WAITING))
return 0;
if (res->owner != dlm->node_num)
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index b94a425f0175..f6b313898763 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1403,12 +1403,24 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
* and RECOVERY flag changed when it completes. */
hash = dlm_lockid_hash(mres->lockname, mres->lockname_len);
spin_lock(&dlm->spinlock);
- res = __dlm_lookup_lockres(dlm, mres->lockname, mres->lockname_len,
+ res = __dlm_lookup_lockres_full(dlm, mres->lockname, mres->lockname_len,
hash);
if (res) {
/* this will get a ref on res */
/* mark it as recovering/migrating and hash it */
spin_lock(&res->spinlock);
+ if (res->state & DLM_LOCK_RES_DROPPING_REF) {
+ mlog(0, "%s: node is attempting to migrate "
+ "lockres %.*s, but marked as dropping "
+ " ref!\n", dlm->name,
+ mres->lockname_len, mres->lockname);
+ ret = -EINVAL;
+ spin_unlock(&res->spinlock);
+ spin_unlock(&dlm->spinlock);
+ dlm_lockres_put(res);
+ goto leave;
+ }
+
if (mres->flags & DLM_MRES_RECOVERY) {
res->state |= DLM_LOCK_RES_RECOVERING;
} else {
@@ -2071,7 +2083,6 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
dlm_lock_get(lock);
if (lock->convert_pending) {
/* move converting lock back to granted */
- BUG_ON(i != DLM_CONVERTING_LIST);
mlog(0, "node died with convert pending "
"on %.*s. move back to granted list.\n",
res->lockname.len, res->lockname.name);
@@ -2163,6 +2174,13 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
for (i = 0; i < DLM_HASH_BUCKETS; i++) {
bucket = dlm_lockres_hash(dlm, i);
hlist_for_each_entry(res, bucket, hash_node) {
+ if (res->state & DLM_LOCK_RES_RECOVERY_WAITING) {
+ spin_lock(&res->spinlock);
+ res->state &= ~DLM_LOCK_RES_RECOVERY_WAITING;
+ spin_unlock(&res->spinlock);
+ wake_up(&res->wq);
+ }
+
if (!(res->state & DLM_LOCK_RES_RECOVERING))
continue;
@@ -2300,6 +2318,7 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
res->lockname.len, res->lockname.name, freed, dead_node);
__dlm_print_one_lock_resource(res);
}
+ res->state |= DLM_LOCK_RES_RECOVERY_WAITING;
dlm_lockres_clear_refmap_bit(dlm, res, dead_node);
} else if (test_bit(dead_node, res->refmap)) {
mlog(0, "%s:%.*s: dead node %u had a ref, but had "
@@ -2377,14 +2396,16 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
dlm_revalidate_lvb(dlm, res, dead_node);
if (res->owner == dead_node) {
if (res->state & DLM_LOCK_RES_DROPPING_REF) {
- mlog(ML_NOTICE, "%s: res %.*s, Skip "
- "recovery as it is being freed\n",
- dlm->name, res->lockname.len,
- res->lockname.name);
- } else
- dlm_move_lockres_to_recovery_list(dlm,
- res);
-
+ mlog(0, "%s:%.*s: owned by "
+ "dead node %u, this node was "
+ "dropping its ref when it died. "
+ "continue, dropping the flag.\n",
+ dlm->name, res->lockname.len,
+ res->lockname.name, dead_node);
+ }
+ res->state &= ~DLM_LOCK_RES_DROPPING_REF;
+ dlm_move_lockres_to_recovery_list(dlm,
+ res);
} else if (res->owner == dlm->node_num) {
dlm_free_dead_locks(dlm, res, dead_node);
__dlm_lockres_calc_usage(dlm, res);
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index c5f6c241ecd7..68d239ba0c63 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -106,7 +106,8 @@ int __dlm_lockres_unused(struct dlm_lock_resource *res)
if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY)
return 0;
- if (res->state & DLM_LOCK_RES_RECOVERING)
+ if (res->state & (DLM_LOCK_RES_RECOVERING|
+ DLM_LOCK_RES_RECOVERY_WAITING))
return 0;
/* Another node has this resource with this node as the master */
@@ -202,6 +203,13 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm,
dlm->purge_count--;
}
+ if (!master && ret != 0) {
+ mlog(0, "%s: deref %.*s in progress or master goes down\n",
+ dlm->name, res->lockname.len, res->lockname.name);
+ spin_unlock(&res->spinlock);
+ return;
+ }
+
if (!__dlm_lockres_unused(res)) {
mlog(ML_ERROR, "%s: res %.*s in use after deref\n",
dlm->name, res->lockname.len, res->lockname.name);
@@ -700,7 +708,8 @@ static int dlm_thread(void *data)
* dirty for a short while. */
BUG_ON(res->state & DLM_LOCK_RES_MIGRATING);
if (res->state & (DLM_LOCK_RES_IN_PROGRESS |
- DLM_LOCK_RES_RECOVERING)) {
+ DLM_LOCK_RES_RECOVERING |
+ DLM_LOCK_RES_RECOVERY_WAITING)) {
/* move it to the tail and keep going */
res->state &= ~DLM_LOCK_RES_DIRTY;
spin_unlock(&res->spinlock);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 7cb38fdca229..c18ab45f8d21 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1381,44 +1381,6 @@ out:
return ret;
}
-/*
- * Will look for holes and unwritten extents in the range starting at
- * pos for count bytes (inclusive).
- */
-static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos,
- size_t count)
-{
- int ret = 0;
- unsigned int extent_flags;
- u32 cpos, clusters, extent_len, phys_cpos;
- struct super_block *sb = inode->i_sb;
-
- cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
- clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
-
- while (clusters) {
- ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
- &extent_flags);
- if (ret < 0) {
- mlog_errno(ret);
- goto out;
- }
-
- if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) {
- ret = 1;
- break;
- }
-
- if (extent_len > clusters)
- extent_len = clusters;
-
- clusters -= extent_len;
- cpos += extent_len;
- }
-out:
- return ret;
-}
-
static int ocfs2_write_remove_suid(struct inode *inode)
{
int ret;
@@ -2129,18 +2091,12 @@ out:
static int ocfs2_prepare_inode_for_write(struct file *file,
loff_t pos,
- size_t count,
- int appending,
- int *direct_io,
- int *has_refcount)
+ size_t count)
{
int ret = 0, meta_level = 0;
struct dentry *dentry = file->f_path.dentry;
struct inode *inode = d_inode(dentry);
loff_t end;
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- int full_coherency = !(osb->s_mount_opt &
- OCFS2_MOUNT_COHERENCY_BUFFERED);
/*
* We start with a read level meta lock and only jump to an ex
@@ -2189,10 +2145,6 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
pos,
count,
&meta_level);
- if (has_refcount)
- *has_refcount = 1;
- if (direct_io)
- *direct_io = 0;
}
if (ret < 0) {
@@ -2200,67 +2152,12 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
goto out_unlock;
}
- /*
- * Skip the O_DIRECT checks if we don't need
- * them.
- */
- if (!direct_io || !(*direct_io))
- break;
-
- /*
- * There's no sane way to do direct writes to an inode
- * with inline data.
- */
- if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
- *direct_io = 0;
- break;
- }
-
- /*
- * Allowing concurrent direct writes means
- * i_size changes wouldn't be synchronized, so
- * one node could wind up truncating another
- * nodes writes.
- */
- if (end > i_size_read(inode) && !full_coherency) {
- *direct_io = 0;
- break;
- }
-
- /*
- * Fallback to old way if the feature bit is not set.
- */
- if (end > i_size_read(inode) &&
- !ocfs2_supports_append_dio(osb)) {
- *direct_io = 0;
- break;
- }
-
- /*
- * We don't fill holes during direct io, so
- * check for them here. If any are found, the
- * caller will have to retake some cluster
- * locks and initiate the io as buffered.
- */
- ret = ocfs2_check_range_for_holes(inode, pos, count);
- if (ret == 1) {
- /*
- * Fallback to old way if the feature bit is not set.
- * Otherwise try dio first and then complete the rest
- * request through buffer io.
- */
- if (!ocfs2_supports_append_dio(osb))
- *direct_io = 0;
- ret = 0;
- } else if (ret < 0)
- mlog_errno(ret);
break;
}
out_unlock:
trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno,
- pos, appending, count,
- direct_io, has_refcount);
+ pos, count);
if (meta_level >= 0)
ocfs2_inode_unlock(inode, meta_level);
@@ -2272,18 +2169,16 @@ out:
static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
struct iov_iter *from)
{
- int direct_io, appending, rw_level;
- int can_do_direct, has_refcount = 0;
+ int direct_io, rw_level;
ssize_t written = 0;
ssize_t ret;
- size_t count = iov_iter_count(from), orig_count;
+ size_t count = iov_iter_count(from);
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
int full_coherency = !(osb->s_mount_opt &
OCFS2_MOUNT_COHERENCY_BUFFERED);
- int unaligned_dio = 0;
- int dropped_dio = 0;
+ void *saved_ki_complete = NULL;
int append_write = ((iocb->ki_pos + count) >=
i_size_read(inode) ? 1 : 0);
@@ -2296,12 +2191,10 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
if (count == 0)
return 0;
- appending = iocb->ki_flags & IOCB_APPEND ? 1 : 0;
direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0;
inode_lock(inode);
-relock:
/*
* Concurrent O_DIRECT writes are allowed with
* mount_option "coherency=buffered".
@@ -2334,7 +2227,6 @@ relock:
ocfs2_inode_unlock(inode, 1);
}
- orig_count = iov_iter_count(from);
ret = generic_write_checks(iocb, from);
if (ret <= 0) {
if (ret)
@@ -2343,41 +2235,18 @@ relock:
}
count = ret;
- can_do_direct = direct_io;
- ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count, appending,
- &can_do_direct, &has_refcount);
+ ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count);
if (ret < 0) {
mlog_errno(ret);
goto out;
}
- if (direct_io && !is_sync_kiocb(iocb))
- unaligned_dio = ocfs2_is_io_unaligned(inode, count, iocb->ki_pos);
-
- /*
- * We can't complete the direct I/O as requested, fall back to
- * buffered I/O.
- */
- if (direct_io && !can_do_direct) {
- ocfs2_rw_unlock(inode, rw_level);
-
- rw_level = -1;
-
- direct_io = 0;
- iocb->ki_flags &= ~IOCB_DIRECT;
- iov_iter_reexpand(from, orig_count);
- dropped_dio = 1;
- goto relock;
- }
-
- if (unaligned_dio) {
+ if (direct_io && !is_sync_kiocb(iocb) &&
+ ocfs2_is_io_unaligned(inode, count, iocb->ki_pos)) {
/*
- * Wait on previous unaligned aio to complete before
- * proceeding.
+ * Make it a sync io if it's an unaligned aio.
*/
- mutex_lock(&OCFS2_I(inode)->ip_unaligned_aio);
- /* Mark the iocb as needing an unlock in ocfs2_dio_end_io */
- ocfs2_iocb_set_unaligned_aio(iocb);
+ saved_ki_complete = xchg(&iocb->ki_complete, NULL);
}
/* communicate with ocfs2_dio_end_io */
@@ -2398,14 +2267,13 @@ relock:
*/
if ((written == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
rw_level = -1;
- unaligned_dio = 0;
}
if (unlikely(written <= 0))
- goto no_sync;
+ goto out;
if (((file->f_flags & O_DSYNC) && !direct_io) ||
- IS_SYNC(inode) || dropped_dio) {
+ IS_SYNC(inode)) {
ret = filemap_fdatawrite_range(file->f_mapping,
iocb->ki_pos - written,
iocb->ki_pos - 1);
@@ -2424,13 +2292,10 @@ relock:
iocb->ki_pos - 1);
}
-no_sync:
- if (unaligned_dio && ocfs2_iocb_is_unaligned_aio(iocb)) {
- ocfs2_iocb_clear_unaligned_aio(iocb);
- mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio);
- }
-
out:
+ if (saved_ki_complete)
+ xchg(&iocb->ki_complete, saved_ki_complete);
+
if (rw_level != -1)
ocfs2_rw_unlock(inode, rw_level);
diff --git a/fs/ocfs2/filecheck.c b/fs/ocfs2/filecheck.c
new file mode 100644
index 000000000000..2cabbcf2f28e
--- /dev/null
+++ b/fs/ocfs2/filecheck.c
@@ -0,0 +1,606 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * filecheck.c
+ *
+ * Code which implements online file check.
+ *
+ * Copyright (C) 2016 SuSE. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/kmod.h>
+#include <linux/fs.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+#include <linux/sysctl.h>
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+#include "ocfs2_fs.h"
+#include "stackglue.h"
+#include "inode.h"
+
+#include "filecheck.h"
+
+
+/* File check error strings,
+ * must correspond with error number in header file.
+ */
+static const char * const ocfs2_filecheck_errs[] = {
+ "SUCCESS",
+ "FAILED",
+ "INPROGRESS",
+ "READONLY",
+ "INJBD",
+ "INVALIDINO",
+ "BLOCKECC",
+ "BLOCKNO",
+ "VALIDFLAG",
+ "GENERATION",
+ "UNSUPPORTED"
+};
+
+static DEFINE_SPINLOCK(ocfs2_filecheck_sysfs_lock);
+static LIST_HEAD(ocfs2_filecheck_sysfs_list);
+
+struct ocfs2_filecheck {
+ struct list_head fc_head; /* File check entry list head */
+ spinlock_t fc_lock;
+ unsigned int fc_max; /* Maximum number of entry in list */
+ unsigned int fc_size; /* Current entry count in list */
+ unsigned int fc_done; /* Finished entry count in list */
+};
+
+struct ocfs2_filecheck_sysfs_entry { /* sysfs entry per mounting */
+ struct list_head fs_list;
+ atomic_t fs_count;
+ struct super_block *fs_sb;
+ struct kset *fs_devicekset;
+ struct kset *fs_fcheckkset;
+ struct ocfs2_filecheck *fs_fcheck;
+};
+
+#define OCFS2_FILECHECK_MAXSIZE 100
+#define OCFS2_FILECHECK_MINSIZE 10
+
+/* File check operation type */
+enum {
+ OCFS2_FILECHECK_TYPE_CHK = 0, /* Check a file(inode) */
+ OCFS2_FILECHECK_TYPE_FIX, /* Fix a file(inode) */
+ OCFS2_FILECHECK_TYPE_SET = 100 /* Set entry list maximum size */
+};
+
+struct ocfs2_filecheck_entry {
+ struct list_head fe_list;
+ unsigned long fe_ino;
+ unsigned int fe_type;
+ unsigned int fe_done:1;
+ unsigned int fe_status:31;
+};
+
+struct ocfs2_filecheck_args {
+ unsigned int fa_type;
+ union {
+ unsigned long fa_ino;
+ unsigned int fa_len;
+ };
+};
+
+static const char *
+ocfs2_filecheck_error(int errno)
+{
+ if (!errno)
+ return ocfs2_filecheck_errs[errno];
+
+ BUG_ON(errno < OCFS2_FILECHECK_ERR_START ||
+ errno > OCFS2_FILECHECK_ERR_END);
+ return ocfs2_filecheck_errs[errno - OCFS2_FILECHECK_ERR_START + 1];
+}
+
+static ssize_t ocfs2_filecheck_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf);
+static ssize_t ocfs2_filecheck_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count);
+static struct kobj_attribute ocfs2_attr_filecheck_chk =
+ __ATTR(check, S_IRUSR | S_IWUSR,
+ ocfs2_filecheck_show,
+ ocfs2_filecheck_store);
+static struct kobj_attribute ocfs2_attr_filecheck_fix =
+ __ATTR(fix, S_IRUSR | S_IWUSR,
+ ocfs2_filecheck_show,
+ ocfs2_filecheck_store);
+static struct kobj_attribute ocfs2_attr_filecheck_set =
+ __ATTR(set, S_IRUSR | S_IWUSR,
+ ocfs2_filecheck_show,
+ ocfs2_filecheck_store);
+
+static int ocfs2_filecheck_sysfs_wait(atomic_t *p)
+{
+ schedule();
+ return 0;
+}
+
+static void
+ocfs2_filecheck_sysfs_free(struct ocfs2_filecheck_sysfs_entry *entry)
+{
+ struct ocfs2_filecheck_entry *p;
+
+ if (!atomic_dec_and_test(&entry->fs_count))
+ wait_on_atomic_t(&entry->fs_count, ocfs2_filecheck_sysfs_wait,
+ TASK_UNINTERRUPTIBLE);
+
+ spin_lock(&entry->fs_fcheck->fc_lock);
+ while (!list_empty(&entry->fs_fcheck->fc_head)) {
+ p = list_first_entry(&entry->fs_fcheck->fc_head,
+ struct ocfs2_filecheck_entry, fe_list);
+ list_del(&p->fe_list);
+ BUG_ON(!p->fe_done); /* To free a undone file check entry */
+ kfree(p);
+ }
+ spin_unlock(&entry->fs_fcheck->fc_lock);
+
+ kset_unregister(entry->fs_fcheckkset);
+ kset_unregister(entry->fs_devicekset);
+ kfree(entry->fs_fcheck);
+ kfree(entry);
+}
+
+static void
+ocfs2_filecheck_sysfs_add(struct ocfs2_filecheck_sysfs_entry *entry)
+{
+ spin_lock(&ocfs2_filecheck_sysfs_lock);
+ list_add_tail(&entry->fs_list, &ocfs2_filecheck_sysfs_list);
+ spin_unlock(&ocfs2_filecheck_sysfs_lock);
+}
+
+static int ocfs2_filecheck_sysfs_del(const char *devname)
+{
+ struct ocfs2_filecheck_sysfs_entry *p;
+
+ spin_lock(&ocfs2_filecheck_sysfs_lock);
+ list_for_each_entry(p, &ocfs2_filecheck_sysfs_list, fs_list) {
+ if (!strcmp(p->fs_sb->s_id, devname)) {
+ list_del(&p->fs_list);
+ spin_unlock(&ocfs2_filecheck_sysfs_lock);
+ ocfs2_filecheck_sysfs_free(p);
+ return 0;
+ }
+ }
+ spin_unlock(&ocfs2_filecheck_sysfs_lock);
+ return 1;
+}
+
+static void
+ocfs2_filecheck_sysfs_put(struct ocfs2_filecheck_sysfs_entry *entry)
+{
+ if (atomic_dec_and_test(&entry->fs_count))
+ wake_up_atomic_t(&entry->fs_count);
+}
+
+static struct ocfs2_filecheck_sysfs_entry *
+ocfs2_filecheck_sysfs_get(const char *devname)
+{
+ struct ocfs2_filecheck_sysfs_entry *p = NULL;
+
+ spin_lock(&ocfs2_filecheck_sysfs_lock);
+ list_for_each_entry(p, &ocfs2_filecheck_sysfs_list, fs_list) {
+ if (!strcmp(p->fs_sb->s_id, devname)) {
+ atomic_inc(&p->fs_count);
+ spin_unlock(&ocfs2_filecheck_sysfs_lock);
+ return p;
+ }
+ }
+ spin_unlock(&ocfs2_filecheck_sysfs_lock);
+ return NULL;
+}
+
+int ocfs2_filecheck_create_sysfs(struct super_block *sb)
+{
+ int ret = 0;
+ struct kset *device_kset = NULL;
+ struct kset *fcheck_kset = NULL;
+ struct ocfs2_filecheck *fcheck = NULL;
+ struct ocfs2_filecheck_sysfs_entry *entry = NULL;
+ struct attribute **attrs = NULL;
+ struct attribute_group attrgp;
+
+ if (!ocfs2_kset)
+ return -ENOMEM;
+
+ attrs = kmalloc(sizeof(struct attribute *) * 4, GFP_NOFS);
+ if (!attrs) {
+ ret = -ENOMEM;
+ goto error;
+ } else {
+ attrs[0] = &ocfs2_attr_filecheck_chk.attr;
+ attrs[1] = &ocfs2_attr_filecheck_fix.attr;
+ attrs[2] = &ocfs2_attr_filecheck_set.attr;
+ attrs[3] = NULL;
+ memset(&attrgp, 0, sizeof(attrgp));
+ attrgp.attrs = attrs;
+ }
+
+ fcheck = kmalloc(sizeof(struct ocfs2_filecheck), GFP_NOFS);
+ if (!fcheck) {
+ ret = -ENOMEM;
+ goto error;
+ } else {
+ INIT_LIST_HEAD(&fcheck->fc_head);
+ spin_lock_init(&fcheck->fc_lock);
+ fcheck->fc_max = OCFS2_FILECHECK_MINSIZE;
+ fcheck->fc_size = 0;
+ fcheck->fc_done = 0;
+ }
+
+ if (strlen(sb->s_id) <= 0) {
+ mlog(ML_ERROR,
+ "Cannot get device basename when create filecheck sysfs\n");
+ ret = -ENODEV;
+ goto error;
+ }
+
+ device_kset = kset_create_and_add(sb->s_id, NULL, &ocfs2_kset->kobj);
+ if (!device_kset) {
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ fcheck_kset = kset_create_and_add("filecheck", NULL,
+ &device_kset->kobj);
+ if (!fcheck_kset) {
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ ret = sysfs_create_group(&fcheck_kset->kobj, &attrgp);
+ if (ret)
+ goto error;
+
+ entry = kmalloc(sizeof(struct ocfs2_filecheck_sysfs_entry), GFP_NOFS);
+ if (!entry) {
+ ret = -ENOMEM;
+ goto error;
+ } else {
+ atomic_set(&entry->fs_count, 1);
+ entry->fs_sb = sb;
+ entry->fs_devicekset = device_kset;
+ entry->fs_fcheckkset = fcheck_kset;
+ entry->fs_fcheck = fcheck;
+ ocfs2_filecheck_sysfs_add(entry);
+ }
+
+ kfree(attrs);
+ return 0;
+
+error:
+ kfree(attrs);
+ kfree(entry);
+ kfree(fcheck);
+ kset_unregister(fcheck_kset);
+ kset_unregister(device_kset);
+ return ret;
+}
+
+int ocfs2_filecheck_remove_sysfs(struct super_block *sb)
+{
+ return ocfs2_filecheck_sysfs_del(sb->s_id);
+}
+
+static int
+ocfs2_filecheck_erase_entries(struct ocfs2_filecheck_sysfs_entry *ent,
+ unsigned int count);
+static int
+ocfs2_filecheck_adjust_max(struct ocfs2_filecheck_sysfs_entry *ent,
+ unsigned int len)
+{
+ int ret;
+
+ if ((len < OCFS2_FILECHECK_MINSIZE) || (len > OCFS2_FILECHECK_MAXSIZE))
+ return -EINVAL;
+
+ spin_lock(&ent->fs_fcheck->fc_lock);
+ if (len < (ent->fs_fcheck->fc_size - ent->fs_fcheck->fc_done)) {
+ mlog(ML_ERROR,
+ "Cannot set online file check maximum entry number "
+ "to %u due to too many pending entries(%u)\n",
+ len, ent->fs_fcheck->fc_size - ent->fs_fcheck->fc_done);
+ ret = -EBUSY;
+ } else {
+ if (len < ent->fs_fcheck->fc_size)
+ BUG_ON(!ocfs2_filecheck_erase_entries(ent,
+ ent->fs_fcheck->fc_size - len));
+
+ ent->fs_fcheck->fc_max = len;
+ ret = 0;
+ }
+ spin_unlock(&ent->fs_fcheck->fc_lock);
+
+ return ret;
+}
+
+#define OCFS2_FILECHECK_ARGS_LEN 24
+static int
+ocfs2_filecheck_args_get_long(const char *buf, size_t count,
+ unsigned long *val)
+{
+ char buffer[OCFS2_FILECHECK_ARGS_LEN];
+
+ memcpy(buffer, buf, count);
+ buffer[count] = '\0';
+
+ if (kstrtoul(buffer, 0, val))
+ return 1;
+
+ return 0;
+}
+
+static int
+ocfs2_filecheck_type_parse(const char *name, unsigned int *type)
+{
+ if (!strncmp(name, "fix", 4))
+ *type = OCFS2_FILECHECK_TYPE_FIX;
+ else if (!strncmp(name, "check", 6))
+ *type = OCFS2_FILECHECK_TYPE_CHK;
+ else if (!strncmp(name, "set", 4))
+ *type = OCFS2_FILECHECK_TYPE_SET;
+ else
+ return 1;
+
+ return 0;
+}
+
+static int
+ocfs2_filecheck_args_parse(const char *name, const char *buf, size_t count,
+ struct ocfs2_filecheck_args *args)
+{
+ unsigned long val = 0;
+ unsigned int type;
+
+ /* too short/long args length */
+ if ((count < 1) || (count >= OCFS2_FILECHECK_ARGS_LEN))
+ return 1;
+
+ if (ocfs2_filecheck_type_parse(name, &type))
+ return 1;
+ if (ocfs2_filecheck_args_get_long(buf, count, &val))
+ return 1;
+
+ if (val <= 0)
+ return 1;
+
+ args->fa_type = type;
+ if (type == OCFS2_FILECHECK_TYPE_SET)
+ args->fa_len = (unsigned int)val;
+ else
+ args->fa_ino = val;
+
+ return 0;
+}
+
+static ssize_t ocfs2_filecheck_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+
+ ssize_t ret = 0, total = 0, remain = PAGE_SIZE;
+ unsigned int type;
+ struct ocfs2_filecheck_entry *p;
+ struct ocfs2_filecheck_sysfs_entry *ent;
+
+ if (ocfs2_filecheck_type_parse(attr->attr.name, &type))
+ return -EINVAL;
+
+ ent = ocfs2_filecheck_sysfs_get(kobj->parent->name);
+ if (!ent) {
+ mlog(ML_ERROR,
+ "Cannot get the corresponding entry via device basename %s\n",
+ kobj->name);
+ return -ENODEV;
+ }
+
+ if (type == OCFS2_FILECHECK_TYPE_SET) {
+ spin_lock(&ent->fs_fcheck->fc_lock);
+ total = snprintf(buf, remain, "%u\n", ent->fs_fcheck->fc_max);
+ spin_unlock(&ent->fs_fcheck->fc_lock);
+ goto exit;
+ }
+
+ ret = snprintf(buf, remain, "INO\t\tDONE\tERROR\n");
+ total += ret;
+ remain -= ret;
+ spin_lock(&ent->fs_fcheck->fc_lock);
+ list_for_each_entry(p, &ent->fs_fcheck->fc_head, fe_list) {
+ if (p->fe_type != type)
+ continue;
+
+ ret = snprintf(buf + total, remain, "%lu\t\t%u\t%s\n",
+ p->fe_ino, p->fe_done,
+ ocfs2_filecheck_error(p->fe_status));
+ if (ret < 0) {
+ total = ret;
+ break;
+ }
+ if (ret == remain) {
+ /* snprintf() didn't fit */
+ total = -E2BIG;
+ break;
+ }
+ total += ret;
+ remain -= ret;
+ }
+ spin_unlock(&ent->fs_fcheck->fc_lock);
+
+exit:
+ ocfs2_filecheck_sysfs_put(ent);
+ return total;
+}
+
+static int
+ocfs2_filecheck_erase_entry(struct ocfs2_filecheck_sysfs_entry *ent)
+{
+ struct ocfs2_filecheck_entry *p;
+
+ list_for_each_entry(p, &ent->fs_fcheck->fc_head, fe_list) {
+ if (p->fe_done) {
+ list_del(&p->fe_list);
+ kfree(p);
+ ent->fs_fcheck->fc_size--;
+ ent->fs_fcheck->fc_done--;
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+static int
+ocfs2_filecheck_erase_entries(struct ocfs2_filecheck_sysfs_entry *ent,
+ unsigned int count)
+{
+ unsigned int i = 0;
+ unsigned int ret = 0;
+
+ while (i++ < count) {
+ if (ocfs2_filecheck_erase_entry(ent))
+ ret++;
+ else
+ break;
+ }
+
+ return (ret == count ? 1 : 0);
+}
+
+static void
+ocfs2_filecheck_done_entry(struct ocfs2_filecheck_sysfs_entry *ent,
+ struct ocfs2_filecheck_entry *entry)
+{
+ entry->fe_done = 1;
+ spin_lock(&ent->fs_fcheck->fc_lock);
+ ent->fs_fcheck->fc_done++;
+ spin_unlock(&ent->fs_fcheck->fc_lock);
+}
+
+static unsigned int
+ocfs2_filecheck_handle(struct super_block *sb,
+ unsigned long ino, unsigned int flags)
+{
+ unsigned int ret = OCFS2_FILECHECK_ERR_SUCCESS;
+ struct inode *inode = NULL;
+ int rc;
+
+ inode = ocfs2_iget(OCFS2_SB(sb), ino, flags, 0);
+ if (IS_ERR(inode)) {
+ rc = (int)(-(long)inode);
+ if (rc >= OCFS2_FILECHECK_ERR_START &&
+ rc < OCFS2_FILECHECK_ERR_END)
+ ret = rc;
+ else
+ ret = OCFS2_FILECHECK_ERR_FAILED;
+ } else
+ iput(inode);
+
+ return ret;
+}
+
+static void
+ocfs2_filecheck_handle_entry(struct ocfs2_filecheck_sysfs_entry *ent,
+ struct ocfs2_filecheck_entry *entry)
+{
+ if (entry->fe_type == OCFS2_FILECHECK_TYPE_CHK)
+ entry->fe_status = ocfs2_filecheck_handle(ent->fs_sb,
+ entry->fe_ino, OCFS2_FI_FLAG_FILECHECK_CHK);
+ else if (entry->fe_type == OCFS2_FILECHECK_TYPE_FIX)
+ entry->fe_status = ocfs2_filecheck_handle(ent->fs_sb,
+ entry->fe_ino, OCFS2_FI_FLAG_FILECHECK_FIX);
+ else
+ entry->fe_status = OCFS2_FILECHECK_ERR_UNSUPPORTED;
+
+ ocfs2_filecheck_done_entry(ent, entry);
+}
+
+static ssize_t ocfs2_filecheck_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct ocfs2_filecheck_args args;
+ struct ocfs2_filecheck_entry *entry;
+ struct ocfs2_filecheck_sysfs_entry *ent;
+ ssize_t ret = 0;
+
+ if (count == 0)
+ return count;
+
+ if (ocfs2_filecheck_args_parse(attr->attr.name, buf, count, &args)) {
+ mlog(ML_ERROR, "Invalid arguments for online file check\n");
+ return -EINVAL;
+ }
+
+ ent = ocfs2_filecheck_sysfs_get(kobj->parent->name);
+ if (!ent) {
+ mlog(ML_ERROR,
+ "Cannot get the corresponding entry via device basename %s\n",
+ kobj->parent->name);
+ return -ENODEV;
+ }
+
+ if (args.fa_type == OCFS2_FILECHECK_TYPE_SET) {
+ ret = ocfs2_filecheck_adjust_max(ent, args.fa_len);
+ goto exit;
+ }
+
+ entry = kmalloc(sizeof(struct ocfs2_filecheck_entry), GFP_NOFS);
+ if (!entry) {
+ ret = -ENOMEM;
+ goto exit;
+ }
+
+ spin_lock(&ent->fs_fcheck->fc_lock);
+ if ((ent->fs_fcheck->fc_size >= ent->fs_fcheck->fc_max) &&
+ (ent->fs_fcheck->fc_done == 0)) {
+ mlog(ML_ERROR,
+ "Cannot do more file check "
+ "since file check queue(%u) is full now\n",
+ ent->fs_fcheck->fc_max);
+ ret = -EBUSY;
+ kfree(entry);
+ } else {
+ if ((ent->fs_fcheck->fc_size >= ent->fs_fcheck->fc_max) &&
+ (ent->fs_fcheck->fc_done > 0)) {
+ /* Delete the oldest entry which was done,
+ * make sure the entry size in list does
+ * not exceed maximum value
+ */
+ BUG_ON(!ocfs2_filecheck_erase_entry(ent));
+ }
+
+ entry->fe_ino = args.fa_ino;
+ entry->fe_type = args.fa_type;
+ entry->fe_done = 0;
+ entry->fe_status = OCFS2_FILECHECK_ERR_INPROGRESS;
+ list_add_tail(&entry->fe_list, &ent->fs_fcheck->fc_head);
+ ent->fs_fcheck->fc_size++;
+ }
+ spin_unlock(&ent->fs_fcheck->fc_lock);
+
+ if (!ret)
+ ocfs2_filecheck_handle_entry(ent, entry);
+
+exit:
+ ocfs2_filecheck_sysfs_put(ent);
+ return (!ret ? count : ret);
+}
diff --git a/fs/ocfs2/filecheck.h b/fs/ocfs2/filecheck.h
new file mode 100644
index 000000000000..e5cd002a2c09
--- /dev/null
+++ b/fs/ocfs2/filecheck.h
@@ -0,0 +1,49 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * filecheck.h
+ *
+ * Online file check.
+ *
+ * Copyright (C) 2016 SuSE. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+
+#ifndef FILECHECK_H
+#define FILECHECK_H
+
+#include <linux/types.h>
+#include <linux/list.h>
+
+
+/* File check errno */
+enum {
+ OCFS2_FILECHECK_ERR_SUCCESS = 0, /* Success */
+ OCFS2_FILECHECK_ERR_FAILED = 1000, /* Other failure */
+ OCFS2_FILECHECK_ERR_INPROGRESS, /* In progress */
+ OCFS2_FILECHECK_ERR_READONLY, /* Read only */
+ OCFS2_FILECHECK_ERR_INJBD, /* Buffer in jbd */
+ OCFS2_FILECHECK_ERR_INVALIDINO, /* Invalid ino */
+ OCFS2_FILECHECK_ERR_BLOCKECC, /* Block ecc */
+ OCFS2_FILECHECK_ERR_BLOCKNO, /* Block number */
+ OCFS2_FILECHECK_ERR_VALIDFLAG, /* Inode valid flag */
+ OCFS2_FILECHECK_ERR_GENERATION, /* Inode generation */
+ OCFS2_FILECHECK_ERR_UNSUPPORTED /* Unsupported */
+};
+
+#define OCFS2_FILECHECK_ERR_START OCFS2_FILECHECK_ERR_FAILED
+#define OCFS2_FILECHECK_ERR_END OCFS2_FILECHECK_ERR_UNSUPPORTED
+
+int ocfs2_filecheck_create_sysfs(struct super_block *sb);
+int ocfs2_filecheck_remove_sysfs(struct super_block *sb);
+
+#endif /* FILECHECK_H */
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 36294446d960..12f4a9e9800f 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -53,6 +53,7 @@
#include "xattr.h"
#include "refcounttree.h"
#include "ocfs2_trace.h"
+#include "filecheck.h"
#include "buffer_head_io.h"
@@ -74,6 +75,14 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
struct inode *inode,
struct buffer_head *fe_bh);
+static int ocfs2_filecheck_read_inode_block_full(struct inode *inode,
+ struct buffer_head **bh,
+ int flags, int type);
+static int ocfs2_filecheck_validate_inode_block(struct super_block *sb,
+ struct buffer_head *bh);
+static int ocfs2_filecheck_repair_inode_block(struct super_block *sb,
+ struct buffer_head *bh);
+
void ocfs2_set_inode_flags(struct inode *inode)
{
unsigned int flags = OCFS2_I(inode)->ip_attr;
@@ -127,6 +136,7 @@ struct inode *ocfs2_ilookup(struct super_block *sb, u64 blkno)
struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
int sysfile_type)
{
+ int rc = 0;
struct inode *inode = NULL;
struct super_block *sb = osb->sb;
struct ocfs2_find_inode_args args;
@@ -161,12 +171,17 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
}
trace_ocfs2_iget5_locked(inode->i_state);
if (inode->i_state & I_NEW) {
- ocfs2_read_locked_inode(inode, &args);
+ rc = ocfs2_read_locked_inode(inode, &args);
unlock_new_inode(inode);
}
if (is_bad_inode(inode)) {
iput(inode);
- inode = ERR_PTR(-ESTALE);
+ if ((flags & OCFS2_FI_FLAG_FILECHECK_CHK) ||
+ (flags & OCFS2_FI_FLAG_FILECHECK_FIX))
+ /* Return OCFS2_FILECHECK_ERR_XXX related errno */
+ inode = ERR_PTR(rc);
+ else
+ inode = ERR_PTR(-ESTALE);
goto bail;
}
@@ -410,7 +425,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
struct ocfs2_super *osb;
struct ocfs2_dinode *fe;
struct buffer_head *bh = NULL;
- int status, can_lock;
+ int status, can_lock, lock_level = 0;
u32 generation = 0;
status = -EINVAL;
@@ -478,7 +493,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
mlog_errno(status);
return status;
}
- status = ocfs2_inode_lock(inode, NULL, 0);
+ status = ocfs2_inode_lock(inode, NULL, lock_level);
if (status) {
make_bad_inode(inode);
mlog_errno(status);
@@ -495,16 +510,32 @@ static int ocfs2_read_locked_inode(struct inode *inode,
}
if (can_lock) {
- status = ocfs2_read_inode_block_full(inode, &bh,
- OCFS2_BH_IGNORE_CACHE);
+ if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_CHK)
+ status = ocfs2_filecheck_read_inode_block_full(inode,
+ &bh, OCFS2_BH_IGNORE_CACHE, 0);
+ else if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_FIX)
+ status = ocfs2_filecheck_read_inode_block_full(inode,
+ &bh, OCFS2_BH_IGNORE_CACHE, 1);
+ else
+ status = ocfs2_read_inode_block_full(inode,
+ &bh, OCFS2_BH_IGNORE_CACHE);
} else {
status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh);
/*
* If buffer is in jbd, then its checksum may not have been
* computed as yet.
*/
- if (!status && !buffer_jbd(bh))
- status = ocfs2_validate_inode_block(osb->sb, bh);
+ if (!status && !buffer_jbd(bh)) {
+ if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_CHK)
+ status = ocfs2_filecheck_validate_inode_block(
+ osb->sb, bh);
+ else if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_FIX)
+ status = ocfs2_filecheck_repair_inode_block(
+ osb->sb, bh);
+ else
+ status = ocfs2_validate_inode_block(
+ osb->sb, bh);
+ }
}
if (status < 0) {
mlog_errno(status);
@@ -532,11 +563,24 @@ static int ocfs2_read_locked_inode(struct inode *inode,
BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno));
+ if (buffer_dirty(bh) && !buffer_jbd(bh)) {
+ if (can_lock) {
+ ocfs2_inode_unlock(inode, lock_level);
+ lock_level = 1;
+ ocfs2_inode_lock(inode, NULL, lock_level);
+ }
+ status = ocfs2_write_block(osb, bh, INODE_CACHE(inode));
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+ }
+
status = 0;
bail:
if (can_lock)
- ocfs2_inode_unlock(inode, 0);
+ ocfs2_inode_unlock(inode, lock_level);
if (status < 0)
make_bad_inode(inode);
@@ -1126,6 +1170,9 @@ static void ocfs2_clear_inode(struct inode *inode)
mlog_bug_on_msg(!list_empty(&oi->ip_io_markers),
"Clear inode of %llu, inode has io markers\n",
(unsigned long long)oi->ip_blkno);
+ mlog_bug_on_msg(!list_empty(&oi->ip_unwritten_list),
+ "Clear inode of %llu, inode has unwritten extents\n",
+ (unsigned long long)oi->ip_blkno);
ocfs2_extent_map_trunc(inode, 0);
@@ -1397,6 +1444,169 @@ bail:
return rc;
}
+static int ocfs2_filecheck_validate_inode_block(struct super_block *sb,
+ struct buffer_head *bh)
+{
+ int rc = 0;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
+
+ trace_ocfs2_filecheck_validate_inode_block(
+ (unsigned long long)bh->b_blocknr);
+
+ BUG_ON(!buffer_uptodate(bh));
+
+ /*
+ * Call ocfs2_validate_meta_ecc() first since it has ecc repair
+ * function, but we should not return error immediately when ecc
+ * validation fails, because the reason is quite likely the invalid
+ * inode number inputed.
+ */
+ rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &di->i_check);
+ if (rc) {
+ mlog(ML_ERROR,
+ "Filecheck: checksum failed for dinode %llu\n",
+ (unsigned long long)bh->b_blocknr);
+ rc = -OCFS2_FILECHECK_ERR_BLOCKECC;
+ }
+
+ if (!OCFS2_IS_VALID_DINODE(di)) {
+ mlog(ML_ERROR,
+ "Filecheck: invalid dinode #%llu: signature = %.*s\n",
+ (unsigned long long)bh->b_blocknr, 7, di->i_signature);
+ rc = -OCFS2_FILECHECK_ERR_INVALIDINO;
+ goto bail;
+ } else if (rc)
+ goto bail;
+
+ if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) {
+ mlog(ML_ERROR,
+ "Filecheck: invalid dinode #%llu: i_blkno is %llu\n",
+ (unsigned long long)bh->b_blocknr,
+ (unsigned long long)le64_to_cpu(di->i_blkno));
+ rc = -OCFS2_FILECHECK_ERR_BLOCKNO;
+ goto bail;
+ }
+
+ if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) {
+ mlog(ML_ERROR,
+ "Filecheck: invalid dinode #%llu: OCFS2_VALID_FL "
+ "not set\n",
+ (unsigned long long)bh->b_blocknr);
+ rc = -OCFS2_FILECHECK_ERR_VALIDFLAG;
+ goto bail;
+ }
+
+ if (le32_to_cpu(di->i_fs_generation) !=
+ OCFS2_SB(sb)->fs_generation) {
+ mlog(ML_ERROR,
+ "Filecheck: invalid dinode #%llu: fs_generation is %u\n",
+ (unsigned long long)bh->b_blocknr,
+ le32_to_cpu(di->i_fs_generation));
+ rc = -OCFS2_FILECHECK_ERR_GENERATION;
+ goto bail;
+ }
+
+bail:
+ return rc;
+}
+
+static int ocfs2_filecheck_repair_inode_block(struct super_block *sb,
+ struct buffer_head *bh)
+{
+ int changed = 0;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
+
+ if (!ocfs2_filecheck_validate_inode_block(sb, bh))
+ return 0;
+
+ trace_ocfs2_filecheck_repair_inode_block(
+ (unsigned long long)bh->b_blocknr);
+
+ if (ocfs2_is_hard_readonly(OCFS2_SB(sb)) ||
+ ocfs2_is_soft_readonly(OCFS2_SB(sb))) {
+ mlog(ML_ERROR,
+ "Filecheck: cannot repair dinode #%llu "
+ "on readonly filesystem\n",
+ (unsigned long long)bh->b_blocknr);
+ return -OCFS2_FILECHECK_ERR_READONLY;
+ }
+
+ if (buffer_jbd(bh)) {
+ mlog(ML_ERROR,
+ "Filecheck: cannot repair dinode #%llu, "
+ "its buffer is in jbd\n",
+ (unsigned long long)bh->b_blocknr);
+ return -OCFS2_FILECHECK_ERR_INJBD;
+ }
+
+ if (!OCFS2_IS_VALID_DINODE(di)) {
+ /* Cannot fix invalid inode block */
+ return -OCFS2_FILECHECK_ERR_INVALIDINO;
+ }
+
+ if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) {
+ /* Cannot just add VALID_FL flag back as a fix,
+ * need more things to check here.
+ */
+ return -OCFS2_FILECHECK_ERR_VALIDFLAG;
+ }
+
+ if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) {
+ di->i_blkno = cpu_to_le64(bh->b_blocknr);
+ changed = 1;
+ mlog(ML_ERROR,
+ "Filecheck: reset dinode #%llu: i_blkno to %llu\n",
+ (unsigned long long)bh->b_blocknr,
+ (unsigned long long)le64_to_cpu(di->i_blkno));
+ }
+
+ if (le32_to_cpu(di->i_fs_generation) !=
+ OCFS2_SB(sb)->fs_generation) {
+ di->i_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
+ changed = 1;
+ mlog(ML_ERROR,
+ "Filecheck: reset dinode #%llu: fs_generation to %u\n",
+ (unsigned long long)bh->b_blocknr,
+ le32_to_cpu(di->i_fs_generation));
+ }
+
+ if (changed || ocfs2_validate_meta_ecc(sb, bh->b_data, &di->i_check)) {
+ ocfs2_compute_meta_ecc(sb, bh->b_data, &di->i_check);
+ mark_buffer_dirty(bh);
+ mlog(ML_ERROR,
+ "Filecheck: reset dinode #%llu: compute meta ecc\n",
+ (unsigned long long)bh->b_blocknr);
+ }
+
+ return 0;
+}
+
+static int
+ocfs2_filecheck_read_inode_block_full(struct inode *inode,
+ struct buffer_head **bh,
+ int flags, int type)
+{
+ int rc;
+ struct buffer_head *tmp = *bh;
+
+ if (!type) /* Check inode block */
+ rc = ocfs2_read_blocks(INODE_CACHE(inode),
+ OCFS2_I(inode)->ip_blkno,
+ 1, &tmp, flags,
+ ocfs2_filecheck_validate_inode_block);
+ else /* Repair inode block */
+ rc = ocfs2_read_blocks(INODE_CACHE(inode),
+ OCFS2_I(inode)->ip_blkno,
+ 1, &tmp, flags,
+ ocfs2_filecheck_repair_inode_block);
+
+ /* If ocfs2_read_blocks() got us a new bh, pass it up. */
+ if (!rc && !*bh)
+ *bh = tmp;
+
+ return rc;
+}
+
int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh,
int flags)
{
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index aac8b86f312e..d8f3fc8d2551 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -43,9 +43,6 @@ struct ocfs2_inode_info
/* protects extended attribute changes on this inode */
struct rw_semaphore ip_xattr_sem;
- /* Number of outstanding AIO's which are not page aligned */
- struct mutex ip_unaligned_aio;
-
/* These fields are protected by ip_lock */
spinlock_t ip_lock;
u32 ip_open_count;
@@ -57,6 +54,9 @@ struct ocfs2_inode_info
u32 ip_flags; /* see below */
u32 ip_attr; /* inode attributes */
+ /* Record unwritten extents during direct io. */
+ struct list_head ip_unwritten_list;
+
/* protected by recovery_lock. */
struct inode *ip_next_orphan;
@@ -139,6 +139,9 @@ int ocfs2_drop_inode(struct inode *inode);
/* Flags for ocfs2_iget() */
#define OCFS2_FI_FLAG_SYSFILE 0x1
#define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x2
+#define OCFS2_FI_FLAG_FILECHECK_CHK 0x4
+#define OCFS2_FI_FLAG_FILECHECK_FIX 0x8
+
struct inode *ocfs2_ilookup(struct super_block *sb, u64 feoff);
struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags,
int sysfile_type);
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 61b833b721d8..e607419cdfa4 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -231,7 +231,7 @@ void ocfs2_recovery_exit(struct ocfs2_super *osb)
/* At this point, we know that no more recovery threads can be
* launched, so wait for any recovery completion work to
* complete. */
- flush_workqueue(ocfs2_wq);
+ flush_workqueue(osb->ocfs2_wq);
/*
* Now that recovery is shut down, and the osb is about to be
@@ -1326,7 +1326,7 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
spin_lock(&journal->j_lock);
list_add_tail(&item->lri_list, &journal->j_la_cleanups);
- queue_work(ocfs2_wq, &journal->j_recovery_work);
+ queue_work(journal->j_osb->ocfs2_wq, &journal->j_recovery_work);
spin_unlock(&journal->j_lock);
}
@@ -1968,7 +1968,7 @@ static void ocfs2_orphan_scan_work(struct work_struct *work)
mutex_lock(&os->os_lock);
ocfs2_queue_orphan_scan(osb);
if (atomic_read(&os->os_state) == ORPHAN_SCAN_ACTIVE)
- queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work,
+ queue_delayed_work(osb->ocfs2_wq, &os->os_orphan_scan_work,
ocfs2_orphan_scan_timeout());
mutex_unlock(&os->os_lock);
}
@@ -2008,7 +2008,7 @@ void ocfs2_orphan_scan_start(struct ocfs2_super *osb)
atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE);
else {
atomic_set(&os->os_state, ORPHAN_SCAN_ACTIVE);
- queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work,
+ queue_delayed_work(osb->ocfs2_wq, &os->os_orphan_scan_work,
ocfs2_orphan_scan_timeout());
}
}
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 7d62c43a2c3e..fe0d1f9571bb 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -386,7 +386,7 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
struct ocfs2_dinode *alloc = NULL;
cancel_delayed_work(&osb->la_enable_wq);
- flush_workqueue(ocfs2_wq);
+ flush_workqueue(osb->ocfs2_wq);
if (osb->local_alloc_state == OCFS2_LA_UNUSED)
goto out;
@@ -1085,7 +1085,7 @@ static int ocfs2_recalc_la_window(struct ocfs2_super *osb,
} else {
osb->local_alloc_state = OCFS2_LA_DISABLED;
}
- queue_delayed_work(ocfs2_wq, &osb->la_enable_wq,
+ queue_delayed_work(osb->ocfs2_wq, &osb->la_enable_wq,
OCFS2_LA_ENABLE_INTERVAL);
goto out_unlock;
}
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 9581d190f6e1..9ea081f4e6e4 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -104,8 +104,8 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
if (page->index == last_index)
len = ((size - 1) & ~PAGE_CACHE_MASK) + 1;
- ret = ocfs2_write_begin_nolock(file, mapping, pos, len, 0, &locked_page,
- &fsdata, di_bh, page);
+ ret = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_MMAP,
+ &locked_page, &fsdata, di_bh, page);
if (ret) {
if (ret != -ENOSPC)
mlog_errno(ret);
@@ -147,6 +147,10 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
ret = ocfs2_inode_lock(inode, &di_bh, 1);
if (ret < 0) {
mlog_errno(ret);
+ if (ret == -ENOMEM)
+ ret = VM_FAULT_OOM;
+ else
+ ret = VM_FAULT_SIGBUS;
goto out;
}
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 7a0126267847..6cf6538a0651 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -464,6 +464,14 @@ struct ocfs2_super
struct ocfs2_refcount_tree *osb_ref_tree_lru;
struct mutex system_file_mutex;
+
+ /*
+ * OCFS2 needs to schedule several different types of work which
+ * require cluster locking, disk I/O, recovery waits, etc. Since these
+ * types of work tend to be heavy we avoid using the kernel events
+ * workqueue and schedule on our own.
+ */
+ struct workqueue_struct *ocfs2_wq;
};
#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info)
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index 6cb019b7c6a8..f8f5fc5e6c05 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -1450,28 +1450,20 @@ DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_remove_inode_range);
TRACE_EVENT(ocfs2_prepare_inode_for_write,
TP_PROTO(unsigned long long ino, unsigned long long saved_pos,
- int appending, unsigned long count,
- int *direct_io, int *has_refcount),
- TP_ARGS(ino, saved_pos, appending, count, direct_io, has_refcount),
+ unsigned long count),
+ TP_ARGS(ino, saved_pos, count),
TP_STRUCT__entry(
__field(unsigned long long, ino)
__field(unsigned long long, saved_pos)
- __field(int, appending)
__field(unsigned long, count)
- __field(int, direct_io)
- __field(int, has_refcount)
),
TP_fast_assign(
__entry->ino = ino;
__entry->saved_pos = saved_pos;
- __entry->appending = appending;
__entry->count = count;
- __entry->direct_io = direct_io ? *direct_io : -1;
- __entry->has_refcount = has_refcount ? *has_refcount : -1;
),
- TP_printk("%llu %llu %d %lu %d %d", __entry->ino,
- __entry->saved_pos, __entry->appending, __entry->count,
- __entry->direct_io, __entry->has_refcount)
+ TP_printk("%llu %llu %lu", __entry->ino,
+ __entry->saved_pos, __entry->count)
);
DEFINE_OCFS2_INT_EVENT(generic_file_aio_read_ret);
@@ -1540,6 +1532,8 @@ DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_read_locked_inode);
DEFINE_OCFS2_INT_INT_EVENT(ocfs2_check_orphan_recovery_state);
DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_inode_block);
+DEFINE_OCFS2_ULL_EVENT(ocfs2_filecheck_validate_inode_block);
+DEFINE_OCFS2_ULL_EVENT(ocfs2_filecheck_repair_inode_block);
TRACE_EVENT(ocfs2_inode_is_valid_to_delete,
TP_PROTO(void *task, void *dc_task, unsigned long long ino,
@@ -2035,6 +2029,8 @@ DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_release_dquot);
DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_acquire_dquot);
+DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_get_next_id);
+
DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_mark_dquot_dirty);
/* End of trace events for fs/ocfs2/quota_global.c. */
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 9c9dd30bc945..3892f3c079ca 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -726,7 +726,7 @@ static int ocfs2_release_dquot(struct dquot *dquot)
dqgrab(dquot);
/* First entry on list -> queue work */
if (llist_add(&OCFS2_DQUOT(dquot)->list, &osb->dquot_drop_list))
- queue_work(ocfs2_wq, &osb->dquot_drop_work);
+ queue_work(osb->ocfs2_wq, &osb->dquot_drop_work);
goto out;
}
status = ocfs2_lock_global_qf(oinfo, 1);
@@ -860,6 +860,30 @@ out:
return status;
}
+static int ocfs2_get_next_id(struct super_block *sb, struct kqid *qid)
+{
+ int type = qid->type;
+ struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
+ int status = 0;
+
+ trace_ocfs2_get_next_id(from_kqid(&init_user_ns, *qid), type);
+ status = ocfs2_lock_global_qf(info, 0);
+ if (status < 0)
+ goto out;
+ status = ocfs2_qinfo_lock(info, 0);
+ if (status < 0)
+ goto out_global;
+ status = qtree_get_next_id(&info->dqi_gi, qid);
+ ocfs2_qinfo_unlock(info, 0);
+out_global:
+ ocfs2_unlock_global_qf(info, 0);
+out:
+ /* Avoid logging ENOENT since it just means there isn't next ID */
+ if (status && status != -ENOENT)
+ mlog_errno(status);
+ return status;
+}
+
static int ocfs2_mark_dquot_dirty(struct dquot *dquot)
{
unsigned long mask = (1 << (DQ_LASTSET_B + QIF_ILIMITS_B)) |
@@ -968,4 +992,5 @@ const struct dquot_operations ocfs2_quota_operations = {
.write_info = ocfs2_write_info,
.alloc_dquot = ocfs2_alloc_dquot,
.destroy_dquot = ocfs2_destroy_dquot,
+ .get_next_id = ocfs2_get_next_id,
};
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index 576b9a04873f..18451e0fab81 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -196,7 +196,7 @@ static int update_backups(struct inode * inode, u32 clusters, char *data)
for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) {
blkno = ocfs2_backup_super_blkno(inode->i_sb, i);
cluster = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
- if (cluster > clusters)
+ if (cluster >= clusters)
break;
ret = ocfs2_read_blocks_sync(osb, blkno, 1, &backup);
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 5d965e83bd43..13219ed73e1d 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -629,7 +629,8 @@ static struct attribute_group ocfs2_attr_group = {
.attrs = ocfs2_attrs,
};
-static struct kset *ocfs2_kset;
+struct kset *ocfs2_kset;
+EXPORT_SYMBOL_GPL(ocfs2_kset);
static void ocfs2_sysfs_exit(void)
{
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index 66334a30cea8..f2dce10fae54 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -298,4 +298,6 @@ void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_p
int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin);
void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin);
+extern struct kset *ocfs2_kset;
+
#endif /* STACKGLUE_H */
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index faa1365097bc..7db631e1c8b0 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -74,17 +74,12 @@
#include "suballoc.h"
#include "buffer_head_io.h"
+#include "filecheck.h"
static struct kmem_cache *ocfs2_inode_cachep;
struct kmem_cache *ocfs2_dquot_cachep;
struct kmem_cache *ocfs2_qf_chunk_cachep;
-/* OCFS2 needs to schedule several different types of work which
- * require cluster locking, disk I/O, recovery waits, etc. Since these
- * types of work tend to be heavy we avoid using the kernel events
- * workqueue and schedule on our own. */
-struct workqueue_struct *ocfs2_wq = NULL;
-
static struct dentry *ocfs2_debugfs_root;
MODULE_AUTHOR("Oracle");
@@ -236,6 +231,7 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
struct ocfs2_recovery_map *rm = osb->recovery_map;
struct ocfs2_orphan_scan *os = &osb->osb_orphan_scan;
int i, out = 0;
+ unsigned long flags;
out += snprintf(buf + out, len - out,
"%10s => Id: %-s Uuid: %-s Gen: 0x%X Label: %-s\n",
@@ -271,14 +267,14 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
cconn->cc_version.pv_minor);
}
- spin_lock(&osb->dc_task_lock);
+ spin_lock_irqsave(&osb->dc_task_lock, flags);
out += snprintf(buf + out, len - out,
"%10s => Pid: %d Count: %lu WakeSeq: %lu "
"WorkSeq: %lu\n", "DownCnvt",
(osb->dc_task ? task_pid_nr(osb->dc_task) : -1),
osb->blocked_lock_count, osb->dc_wake_sequence,
osb->dc_work_sequence);
- spin_unlock(&osb->dc_task_lock);
+ spin_unlock_irqrestore(&osb->dc_task_lock, flags);
spin_lock(&osb->osb_lock);
out += snprintf(buf + out, len - out, "%10s => Pid: %d Nodes:",
@@ -1204,6 +1200,9 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
/* Start this when the mount is almost sure of being successful */
ocfs2_orphan_scan_start(osb);
+ /* Create filecheck sysfile /sys/fs/ocfs2/<devname>/filecheck */
+ ocfs2_filecheck_create_sysfs(sb);
+
return status;
read_super_error:
@@ -1608,33 +1607,25 @@ static int __init ocfs2_init(void)
if (status < 0)
goto out2;
- ocfs2_wq = create_singlethread_workqueue("ocfs2_wq");
- if (!ocfs2_wq) {
- status = -ENOMEM;
- goto out3;
- }
-
ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL);
if (!ocfs2_debugfs_root) {
status = -ENOMEM;
mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n");
- goto out4;
+ goto out3;
}
ocfs2_set_locking_protocol();
status = register_quota_format(&ocfs2_quota_format);
if (status < 0)
- goto out4;
+ goto out3;
status = register_filesystem(&ocfs2_fs_type);
if (!status)
return 0;
unregister_quota_format(&ocfs2_quota_format);
-out4:
- destroy_workqueue(ocfs2_wq);
- debugfs_remove(ocfs2_debugfs_root);
out3:
+ debugfs_remove(ocfs2_debugfs_root);
ocfs2_free_mem_caches();
out2:
exit_ocfs2_uptodate_cache();
@@ -1645,11 +1636,6 @@ out1:
static void __exit ocfs2_exit(void)
{
- if (ocfs2_wq) {
- flush_workqueue(ocfs2_wq);
- destroy_workqueue(ocfs2_wq);
- }
-
unregister_quota_format(&ocfs2_quota_format);
debugfs_remove(ocfs2_debugfs_root);
@@ -1667,6 +1653,7 @@ static void ocfs2_put_super(struct super_block *sb)
ocfs2_sync_blockdev(sb);
ocfs2_dismount_volume(sb, 0);
+ ocfs2_filecheck_remove_sysfs(sb);
}
static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -1739,8 +1726,8 @@ static void ocfs2_inode_init_once(void *data)
spin_lock_init(&oi->ip_lock);
ocfs2_extent_map_init(&oi->vfs_inode);
INIT_LIST_HEAD(&oi->ip_io_markers);
+ INIT_LIST_HEAD(&oi->ip_unwritten_list);
oi->ip_dir_start_lookup = 0;
- mutex_init(&oi->ip_unaligned_aio);
init_rwsem(&oi->ip_alloc_sem);
init_rwsem(&oi->ip_xattr_sem);
mutex_init(&oi->ip_io_mutex);
@@ -2343,6 +2330,12 @@ static int ocfs2_initialize_super(struct super_block *sb,
}
cleancache_init_shared_fs(sb);
+ osb->ocfs2_wq = create_singlethread_workqueue("ocfs2_wq");
+ if (!osb->ocfs2_wq) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ }
+
bail:
return status;
}
@@ -2530,6 +2523,12 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
{
/* This function assumes that the caller has the main osb resource */
+ /* ocfs2_initializer_super have already created this workqueue */
+ if (osb->ocfs2_wq) {
+ flush_workqueue(osb->ocfs2_wq);
+ destroy_workqueue(osb->ocfs2_wq);
+ }
+
ocfs2_free_slot_info(osb);
kfree(osb->osb_orphan_wipes);
diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h
index b477d0b1c7b6..b023e4f3d740 100644
--- a/fs/ocfs2/super.h
+++ b/fs/ocfs2/super.h
@@ -26,8 +26,6 @@
#ifndef OCFS2_SUPER_H
#define OCFS2_SUPER_H
-extern struct workqueue_struct *ocfs2_wq;
-
int ocfs2_publish_get_mount_state(struct ocfs2_super *osb,
int node_num);
diff --git a/fs/open.c b/fs/open.c
index 55bdc75e2172..17cb6b1dab75 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -992,14 +992,12 @@ struct file *filp_open(const char *filename, int flags, umode_t mode)
EXPORT_SYMBOL(filp_open);
struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt,
- const char *filename, int flags)
+ const char *filename, int flags, umode_t mode)
{
struct open_flags op;
- int err = build_open_flags(flags, 0, &op);
+ int err = build_open_flags(flags, mode, &op);
if (err)
return ERR_PTR(err);
- if (flags & O_CREAT)
- return ERR_PTR(-EINVAL);
return do_file_open_root(dentry, mnt, filename, &op);
}
EXPORT_SYMBOL(file_open_root);
diff --git a/fs/orangefs/Kconfig b/fs/orangefs/Kconfig
new file mode 100644
index 000000000000..1554c02489de
--- /dev/null
+++ b/fs/orangefs/Kconfig
@@ -0,0 +1,6 @@
+config ORANGEFS_FS
+ tristate "ORANGEFS (Powered by PVFS) support"
+ select FS_POSIX_ACL
+ help
+ Orange is a parallel file system designed for use on high end
+ computing (HEC) systems.
diff --git a/fs/orangefs/Makefile b/fs/orangefs/Makefile
new file mode 100644
index 000000000000..a9d6a968fe6d
--- /dev/null
+++ b/fs/orangefs/Makefile
@@ -0,0 +1,10 @@
+#
+# Makefile for the ORANGEFS filesystem.
+#
+
+obj-$(CONFIG_ORANGEFS_FS) += orangefs.o
+
+orangefs-objs := acl.o file.o orangefs-cache.o orangefs-utils.o xattr.o \
+ dcache.o inode.o orangefs-sysfs.o orangefs-mod.o super.o \
+ devorangefs-req.o namei.o symlink.o dir.o orangefs-bufmap.o \
+ orangefs-debugfs.o waitqueue.o
diff --git a/fs/orangefs/acl.c b/fs/orangefs/acl.c
new file mode 100644
index 000000000000..03f89dbb2512
--- /dev/null
+++ b/fs/orangefs/acl.c
@@ -0,0 +1,175 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+
+#include "protocol.h"
+#include "orangefs-kernel.h"
+#include "orangefs-bufmap.h"
+#include <linux/posix_acl_xattr.h>
+#include <linux/fs_struct.h>
+
+struct posix_acl *orangefs_get_acl(struct inode *inode, int type)
+{
+ struct posix_acl *acl;
+ int ret;
+ char *key = NULL, *value = NULL;
+
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ key = ORANGEFS_XATTR_NAME_ACL_ACCESS;
+ break;
+ case ACL_TYPE_DEFAULT:
+ key = ORANGEFS_XATTR_NAME_ACL_DEFAULT;
+ break;
+ default:
+ gossip_err("orangefs_get_acl: bogus value of type %d\n", type);
+ return ERR_PTR(-EINVAL);
+ }
+ /*
+ * Rather than incurring a network call just to determine the exact
+ * length of the attribute, I just allocate a max length to save on
+ * the network call. Conceivably, we could pass NULL to
+ * orangefs_inode_getxattr() to probe the length of the value, but
+ * I don't do that for now.
+ */
+ value = kmalloc(ORANGEFS_MAX_XATTR_VALUELEN, GFP_KERNEL);
+ if (value == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ gossip_debug(GOSSIP_ACL_DEBUG,
+ "inode %pU, key %s, type %d\n",
+ get_khandle_from_ino(inode),
+ key,
+ type);
+ ret = orangefs_inode_getxattr(inode,
+ "",
+ key,
+ value,
+ ORANGEFS_MAX_XATTR_VALUELEN);
+ /* if the key exists, convert it to an in-memory rep */
+ if (ret > 0) {
+ acl = posix_acl_from_xattr(&init_user_ns, value, ret);
+ } else if (ret == -ENODATA || ret == -ENOSYS) {
+ acl = NULL;
+ } else {
+ gossip_err("inode %pU retrieving acl's failed with error %d\n",
+ get_khandle_from_ino(inode),
+ ret);
+ acl = ERR_PTR(ret);
+ }
+ /* kfree(NULL) is safe, so don't worry if value ever got used */
+ kfree(value);
+ return acl;
+}
+
+int orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+{
+ struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
+ int error = 0;
+ void *value = NULL;
+ size_t size = 0;
+ const char *name = NULL;
+
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ name = ORANGEFS_XATTR_NAME_ACL_ACCESS;
+ if (acl) {
+ umode_t mode = inode->i_mode;
+ /*
+ * can we represent this with the traditional file
+ * mode permission bits?
+ */
+ error = posix_acl_equiv_mode(acl, &mode);
+ if (error < 0) {
+ gossip_err("%s: posix_acl_equiv_mode err: %d\n",
+ __func__,
+ error);
+ return error;
+ }
+
+ if (inode->i_mode != mode)
+ SetModeFlag(orangefs_inode);
+ inode->i_mode = mode;
+ mark_inode_dirty_sync(inode);
+ if (error == 0)
+ acl = NULL;
+ }
+ break;
+ case ACL_TYPE_DEFAULT:
+ name = ORANGEFS_XATTR_NAME_ACL_DEFAULT;
+ break;
+ default:
+ gossip_err("%s: invalid type %d!\n", __func__, type);
+ return -EINVAL;
+ }
+
+ gossip_debug(GOSSIP_ACL_DEBUG,
+ "%s: inode %pU, key %s type %d\n",
+ __func__, get_khandle_from_ino(inode),
+ name,
+ type);
+
+ if (acl) {
+ size = posix_acl_xattr_size(acl->a_count);
+ value = kmalloc(size, GFP_KERNEL);
+ if (!value)
+ return -ENOMEM;
+
+ error = posix_acl_to_xattr(&init_user_ns, acl, value, size);
+ if (error < 0)
+ goto out;
+ }
+
+ gossip_debug(GOSSIP_ACL_DEBUG,
+ "%s: name %s, value %p, size %zd, acl %p\n",
+ __func__, name, value, size, acl);
+ /*
+ * Go ahead and set the extended attribute now. NOTE: Suppose acl
+ * was NULL, then value will be NULL and size will be 0 and that
+ * will xlate to a removexattr. However, we don't want removexattr
+ * complain if attributes does not exist.
+ */
+ error = orangefs_inode_setxattr(inode, "", name, value, size, 0);
+
+out:
+ kfree(value);
+ if (!error)
+ set_cached_acl(inode, type, acl);
+ return error;
+}
+
+int orangefs_init_acl(struct inode *inode, struct inode *dir)
+{
+ struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
+ struct posix_acl *default_acl, *acl;
+ umode_t mode = inode->i_mode;
+ int error = 0;
+
+ ClearModeFlag(orangefs_inode);
+
+ error = posix_acl_create(dir, &mode, &default_acl, &acl);
+ if (error)
+ return error;
+
+ if (default_acl) {
+ error = orangefs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
+ posix_acl_release(default_acl);
+ }
+
+ if (acl) {
+ if (!error)
+ error = orangefs_set_acl(inode, acl, ACL_TYPE_ACCESS);
+ posix_acl_release(acl);
+ }
+
+ /* If mode of the inode was changed, then do a forcible ->setattr */
+ if (mode != inode->i_mode) {
+ SetModeFlag(orangefs_inode);
+ inode->i_mode = mode;
+ orangefs_flush_inode(inode);
+ }
+
+ return error;
+}
diff --git a/fs/orangefs/dcache.c b/fs/orangefs/dcache.c
new file mode 100644
index 000000000000..5dfc4f3cfe68
--- /dev/null
+++ b/fs/orangefs/dcache.c
@@ -0,0 +1,138 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+
+/*
+ * Implementation of dentry (directory cache) functions.
+ */
+
+#include "protocol.h"
+#include "orangefs-kernel.h"
+
+/* Returns 1 if dentry can still be trusted, else 0. */
+static int orangefs_revalidate_lookup(struct dentry *dentry)
+{
+ struct dentry *parent_dentry = dget_parent(dentry);
+ struct inode *parent_inode = parent_dentry->d_inode;
+ struct orangefs_inode_s *parent = ORANGEFS_I(parent_inode);
+ struct inode *inode = dentry->d_inode;
+ struct orangefs_kernel_op_s *new_op;
+ int ret = 0;
+ int err = 0;
+
+ gossip_debug(GOSSIP_DCACHE_DEBUG, "%s: attempting lookup.\n", __func__);
+
+ new_op = op_alloc(ORANGEFS_VFS_OP_LOOKUP);
+ if (!new_op)
+ goto out_put_parent;
+
+ new_op->upcall.req.lookup.sym_follow = ORANGEFS_LOOKUP_LINK_NO_FOLLOW;
+ new_op->upcall.req.lookup.parent_refn = parent->refn;
+ strncpy(new_op->upcall.req.lookup.d_name,
+ dentry->d_name.name,
+ ORANGEFS_NAME_MAX);
+
+ gossip_debug(GOSSIP_DCACHE_DEBUG,
+ "%s:%s:%d interrupt flag [%d]\n",
+ __FILE__,
+ __func__,
+ __LINE__,
+ get_interruptible_flag(parent_inode));
+
+ err = service_operation(new_op, "orangefs_lookup",
+ get_interruptible_flag(parent_inode));
+
+ /* Positive dentry: reject if error or not the same inode. */
+ if (inode) {
+ if (err) {
+ gossip_debug(GOSSIP_DCACHE_DEBUG,
+ "%s:%s:%d lookup failure.\n",
+ __FILE__, __func__, __LINE__);
+ goto out_drop;
+ }
+ if (!match_handle(new_op->downcall.resp.lookup.refn.khandle,
+ inode)) {
+ gossip_debug(GOSSIP_DCACHE_DEBUG,
+ "%s:%s:%d no match.\n",
+ __FILE__, __func__, __LINE__);
+ goto out_drop;
+ }
+
+ /* Negative dentry: reject if success or error other than ENOENT. */
+ } else {
+ gossip_debug(GOSSIP_DCACHE_DEBUG, "%s: negative dentry.\n",
+ __func__);
+ if (!err || err != -ENOENT) {
+ if (new_op->downcall.status != 0)
+ gossip_debug(GOSSIP_DCACHE_DEBUG,
+ "%s:%s:%d lookup failure.\n",
+ __FILE__, __func__, __LINE__);
+ goto out_drop;
+ }
+ }
+
+ ret = 1;
+out_release_op:
+ op_release(new_op);
+out_put_parent:
+ dput(parent_dentry);
+ return ret;
+out_drop:
+ gossip_debug(GOSSIP_DCACHE_DEBUG, "%s:%s:%d revalidate failed\n",
+ __FILE__, __func__, __LINE__);
+ goto out_release_op;
+}
+
+/*
+ * Verify that dentry is valid.
+ *
+ * Should return 1 if dentry can still be trusted, else 0.
+ */
+static int orangefs_d_revalidate(struct dentry *dentry, unsigned int flags)
+{
+ int ret;
+
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
+
+ gossip_debug(GOSSIP_DCACHE_DEBUG, "%s: called on dentry %p.\n",
+ __func__, dentry);
+
+ /* skip root handle lookups. */
+ if (dentry->d_inode && is_root_handle(dentry->d_inode))
+ return 1;
+
+ /*
+ * If this passes, the positive dentry still exists or the negative
+ * dentry still does not exist.
+ */
+ if (!orangefs_revalidate_lookup(dentry))
+ return 0;
+
+ /* We do not need to continue with negative dentries. */
+ if (!dentry->d_inode)
+ goto out;
+
+ /* Now we must perform a getattr to validate the inode contents. */
+
+ ret = orangefs_inode_check_changed(dentry->d_inode);
+ if (ret < 0) {
+ gossip_debug(GOSSIP_DCACHE_DEBUG, "%s:%s:%d getattr failure.\n",
+ __FILE__, __func__, __LINE__);
+ return 0;
+ }
+ if (ret == 0)
+ return 0;
+
+out:
+ gossip_debug(GOSSIP_DCACHE_DEBUG,
+ "%s: negative dentry or positive dentry and inode valid.\n",
+ __func__);
+ return 1;
+}
+
+const struct dentry_operations orangefs_dentry_operations = {
+ .d_revalidate = orangefs_d_revalidate,
+};
diff --git a/fs/orangefs/devorangefs-req.c b/fs/orangefs/devorangefs-req.c
new file mode 100644
index 000000000000..db170beba797
--- /dev/null
+++ b/fs/orangefs/devorangefs-req.c
@@ -0,0 +1,943 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * Changes by Acxiom Corporation to add protocol version to kernel
+ * communication, Copyright Acxiom Corporation, 2005.
+ *
+ * See COPYING in top-level directory.
+ */
+
+#include "protocol.h"
+#include "orangefs-kernel.h"
+#include "orangefs-dev-proto.h"
+#include "orangefs-bufmap.h"
+
+#include <linux/debugfs.h>
+#include <linux/slab.h>
+
+/* this file implements the /dev/pvfs2-req device node */
+
+static int open_access_count;
+
+#define DUMP_DEVICE_ERROR() \
+do { \
+ gossip_err("*****************************************************\n");\
+ gossip_err("ORANGEFS Device Error: You cannot open the device file "); \
+ gossip_err("\n/dev/%s more than once. Please make sure that\nthere " \
+ "are no ", ORANGEFS_REQDEVICE_NAME); \
+ gossip_err("instances of a program using this device\ncurrently " \
+ "running. (You must verify this!)\n"); \
+ gossip_err("For example, you can use the lsof program as follows:\n");\
+ gossip_err("'lsof | grep %s' (run this as root)\n", \
+ ORANGEFS_REQDEVICE_NAME); \
+ gossip_err(" open_access_count = %d\n", open_access_count); \
+ gossip_err("*****************************************************\n");\
+} while (0)
+
+static int hash_func(__u64 tag, int table_size)
+{
+ return do_div(tag, (unsigned int)table_size);
+}
+
+static void orangefs_devreq_add_op(struct orangefs_kernel_op_s *op)
+{
+ int index = hash_func(op->tag, hash_table_size);
+
+ list_add_tail(&op->list, &htable_ops_in_progress[index]);
+}
+
+/*
+ * find the op with this tag and remove it from the in progress
+ * hash table.
+ */
+static struct orangefs_kernel_op_s *orangefs_devreq_remove_op(__u64 tag)
+{
+ struct orangefs_kernel_op_s *op, *next;
+ int index;
+
+ index = hash_func(tag, hash_table_size);
+
+ spin_lock(&htable_ops_in_progress_lock);
+ list_for_each_entry_safe(op,
+ next,
+ &htable_ops_in_progress[index],
+ list) {
+ if (op->tag == tag && !op_state_purged(op) &&
+ !op_state_given_up(op)) {
+ list_del_init(&op->list);
+ spin_unlock(&htable_ops_in_progress_lock);
+ return op;
+ }
+ }
+
+ spin_unlock(&htable_ops_in_progress_lock);
+ return NULL;
+}
+
+/* Returns whether any FS are still pending remounted */
+static int mark_all_pending_mounts(void)
+{
+ int unmounted = 1;
+ struct orangefs_sb_info_s *orangefs_sb = NULL;
+
+ spin_lock(&orangefs_superblocks_lock);
+ list_for_each_entry(orangefs_sb, &orangefs_superblocks, list) {
+ /* All of these file system require a remount */
+ orangefs_sb->mount_pending = 1;
+ unmounted = 0;
+ }
+ spin_unlock(&orangefs_superblocks_lock);
+ return unmounted;
+}
+
+/*
+ * Determine if a given file system needs to be remounted or not
+ * Returns -1 on error
+ * 0 if already mounted
+ * 1 if needs remount
+ */
+static int fs_mount_pending(__s32 fsid)
+{
+ int mount_pending = -1;
+ struct orangefs_sb_info_s *orangefs_sb = NULL;
+
+ spin_lock(&orangefs_superblocks_lock);
+ list_for_each_entry(orangefs_sb, &orangefs_superblocks, list) {
+ if (orangefs_sb->fs_id == fsid) {
+ mount_pending = orangefs_sb->mount_pending;
+ break;
+ }
+ }
+ spin_unlock(&orangefs_superblocks_lock);
+ return mount_pending;
+}
+
+static int orangefs_devreq_open(struct inode *inode, struct file *file)
+{
+ int ret = -EINVAL;
+
+ if (!(file->f_flags & O_NONBLOCK)) {
+ gossip_err("%s: device cannot be opened in blocking mode\n",
+ __func__);
+ goto out;
+ }
+ ret = -EACCES;
+ gossip_debug(GOSSIP_DEV_DEBUG, "client-core: opening device\n");
+ mutex_lock(&devreq_mutex);
+
+ if (open_access_count == 0) {
+ open_access_count = 1;
+ ret = 0;
+ } else {
+ DUMP_DEVICE_ERROR();
+ }
+ mutex_unlock(&devreq_mutex);
+
+out:
+
+ gossip_debug(GOSSIP_DEV_DEBUG,
+ "pvfs2-client-core: open device complete (ret = %d)\n",
+ ret);
+ return ret;
+}
+
+/* Function for read() callers into the device */
+static ssize_t orangefs_devreq_read(struct file *file,
+ char __user *buf,
+ size_t count, loff_t *offset)
+{
+ struct orangefs_kernel_op_s *op, *temp;
+ __s32 proto_ver = ORANGEFS_KERNEL_PROTO_VERSION;
+ static __s32 magic = ORANGEFS_DEVREQ_MAGIC;
+ struct orangefs_kernel_op_s *cur_op = NULL;
+ unsigned long ret;
+
+ /* We do not support blocking IO. */
+ if (!(file->f_flags & O_NONBLOCK)) {
+ gossip_err("%s: blocking read from client-core.\n",
+ __func__);
+ return -EINVAL;
+ }
+
+ /*
+ * The client will do an ioctl to find MAX_DEV_REQ_UPSIZE, then
+ * always read with that size buffer.
+ */
+ if (count != MAX_DEV_REQ_UPSIZE) {
+ gossip_err("orangefs: client-core tried to read wrong size\n");
+ return -EINVAL;
+ }
+
+restart:
+ /* Get next op (if any) from top of list. */
+ spin_lock(&orangefs_request_list_lock);
+ list_for_each_entry_safe(op, temp, &orangefs_request_list, list) {
+ __s32 fsid;
+ /* This lock is held past the end of the loop when we break. */
+ spin_lock(&op->lock);
+ if (unlikely(op_state_purged(op) || op_state_given_up(op))) {
+ spin_unlock(&op->lock);
+ continue;
+ }
+
+ fsid = fsid_of_op(op);
+ if (fsid != ORANGEFS_FS_ID_NULL) {
+ int ret;
+ /* Skip ops whose filesystem needs to be mounted. */
+ ret = fs_mount_pending(fsid);
+ if (ret == 1) {
+ gossip_debug(GOSSIP_DEV_DEBUG,
+ "%s: mount pending, skipping op tag "
+ "%llu %s\n",
+ __func__,
+ llu(op->tag),
+ get_opname_string(op));
+ spin_unlock(&op->lock);
+ continue;
+ /*
+ * Skip ops whose filesystem we don't know about unless
+ * it is being mounted.
+ */
+ /* XXX: is there a better way to detect this? */
+ } else if (ret == -1 &&
+ !(op->upcall.type ==
+ ORANGEFS_VFS_OP_FS_MOUNT ||
+ op->upcall.type ==
+ ORANGEFS_VFS_OP_GETATTR)) {
+ gossip_debug(GOSSIP_DEV_DEBUG,
+ "orangefs: skipping op tag %llu %s\n",
+ llu(op->tag), get_opname_string(op));
+ gossip_err(
+ "orangefs: ERROR: fs_mount_pending %d\n",
+ fsid);
+ spin_unlock(&op->lock);
+ continue;
+ }
+ }
+ /*
+ * Either this op does not pertain to a filesystem, is mounting
+ * a filesystem, or pertains to a mounted filesystem. Let it
+ * through.
+ */
+ cur_op = op;
+ break;
+ }
+
+ /*
+ * At this point we either have a valid op and can continue or have not
+ * found an op and must ask the client to try again later.
+ */
+ if (!cur_op) {
+ spin_unlock(&orangefs_request_list_lock);
+ return -EAGAIN;
+ }
+
+ gossip_debug(GOSSIP_DEV_DEBUG, "%s: reading op tag %llu %s\n",
+ __func__,
+ llu(cur_op->tag),
+ get_opname_string(cur_op));
+
+ /*
+ * Such an op should never be on the list in the first place. If so, we
+ * will abort.
+ */
+ if (op_state_in_progress(cur_op) || op_state_serviced(cur_op)) {
+ gossip_err("orangefs: ERROR: Current op already queued.\n");
+ list_del_init(&cur_op->list);
+ spin_unlock(&cur_op->lock);
+ spin_unlock(&orangefs_request_list_lock);
+ return -EAGAIN;
+ }
+
+ list_del_init(&cur_op->list);
+ spin_unlock(&orangefs_request_list_lock);
+
+ spin_unlock(&cur_op->lock);
+
+ /* Push the upcall out. */
+ ret = copy_to_user(buf, &proto_ver, sizeof(__s32));
+ if (ret != 0)
+ goto error;
+ ret = copy_to_user(buf+sizeof(__s32), &magic, sizeof(__s32));
+ if (ret != 0)
+ goto error;
+ ret = copy_to_user(buf+2 * sizeof(__s32), &cur_op->tag, sizeof(__u64));
+ if (ret != 0)
+ goto error;
+ ret = copy_to_user(buf+2*sizeof(__s32)+sizeof(__u64), &cur_op->upcall,
+ sizeof(struct orangefs_upcall_s));
+ if (ret != 0)
+ goto error;
+
+ spin_lock(&htable_ops_in_progress_lock);
+ spin_lock(&cur_op->lock);
+ if (unlikely(op_state_given_up(cur_op))) {
+ spin_unlock(&cur_op->lock);
+ spin_unlock(&htable_ops_in_progress_lock);
+ complete(&cur_op->waitq);
+ goto restart;
+ }
+
+ /*
+ * Set the operation to be in progress and move it between lists since
+ * it has been sent to the client.
+ */
+ set_op_state_inprogress(cur_op);
+ gossip_debug(GOSSIP_DEV_DEBUG,
+ "%s: 1 op:%s: op_state:%d: process:%s:\n",
+ __func__,
+ get_opname_string(cur_op),
+ cur_op->op_state,
+ current->comm);
+ orangefs_devreq_add_op(cur_op);
+ spin_unlock(&cur_op->lock);
+ spin_unlock(&htable_ops_in_progress_lock);
+
+ /* The client only asks to read one size buffer. */
+ return MAX_DEV_REQ_UPSIZE;
+error:
+ /*
+ * We were unable to copy the op data to the client. Put the op back in
+ * list. If client has crashed, the op will be purged later when the
+ * device is released.
+ */
+ gossip_err("orangefs: Failed to copy data to user space\n");
+ spin_lock(&orangefs_request_list_lock);
+ spin_lock(&cur_op->lock);
+ if (likely(!op_state_given_up(cur_op))) {
+ set_op_state_waiting(cur_op);
+ gossip_debug(GOSSIP_DEV_DEBUG,
+ "%s: 2 op:%s: op_state:%d: process:%s:\n",
+ __func__,
+ get_opname_string(cur_op),
+ cur_op->op_state,
+ current->comm);
+ list_add(&cur_op->list, &orangefs_request_list);
+ spin_unlock(&cur_op->lock);
+ } else {
+ spin_unlock(&cur_op->lock);
+ complete(&cur_op->waitq);
+ }
+ spin_unlock(&orangefs_request_list_lock);
+ return -EFAULT;
+}
+
+/*
+ * Function for writev() callers into the device.
+ *
+ * Userspace should have written:
+ * - __u32 version
+ * - __u32 magic
+ * - __u64 tag
+ * - struct orangefs_downcall_s
+ * - trailer buffer (in the case of READDIR operations)
+ */
+static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb,
+ struct iov_iter *iter)
+{
+ ssize_t ret;
+ struct orangefs_kernel_op_s *op = NULL;
+ struct {
+ __u32 version;
+ __u32 magic;
+ __u64 tag;
+ } head;
+ int total = ret = iov_iter_count(iter);
+ int n;
+ int downcall_size = sizeof(struct orangefs_downcall_s);
+ int head_size = sizeof(head);
+
+ gossip_debug(GOSSIP_DEV_DEBUG, "%s: total:%d: ret:%zd:\n",
+ __func__,
+ total,
+ ret);
+
+ if (total < MAX_DEV_REQ_DOWNSIZE) {
+ gossip_err("%s: total:%d: must be at least:%u:\n",
+ __func__,
+ total,
+ (unsigned int) MAX_DEV_REQ_DOWNSIZE);
+ return -EFAULT;
+ }
+
+ n = copy_from_iter(&head, head_size, iter);
+ if (n < head_size) {
+ gossip_err("%s: failed to copy head.\n", __func__);
+ return -EFAULT;
+ }
+
+ if (head.version < ORANGEFS_MINIMUM_USERSPACE_VERSION) {
+ gossip_err("%s: userspace claims version"
+ "%d, minimum version required: %d.\n",
+ __func__,
+ head.version,
+ ORANGEFS_MINIMUM_USERSPACE_VERSION);
+ return -EPROTO;
+ }
+
+ if (head.magic != ORANGEFS_DEVREQ_MAGIC) {
+ gossip_err("Error: Device magic number does not match.\n");
+ return -EPROTO;
+ }
+
+ /* remove the op from the in progress hash table */
+ op = orangefs_devreq_remove_op(head.tag);
+ if (!op) {
+ gossip_err("WARNING: No one's waiting for tag %llu\n",
+ llu(head.tag));
+ return ret;
+ }
+
+ n = copy_from_iter(&op->downcall, downcall_size, iter);
+ if (n != downcall_size) {
+ gossip_err("%s: failed to copy downcall.\n", __func__);
+ goto Efault;
+ }
+
+ if (op->downcall.status)
+ goto wakeup;
+
+ /*
+ * We've successfully peeled off the head and the downcall.
+ * Something has gone awry if total doesn't equal the
+ * sum of head_size, downcall_size and trailer_size.
+ */
+ if ((head_size + downcall_size + op->downcall.trailer_size) != total) {
+ gossip_err("%s: funky write, head_size:%d"
+ ": downcall_size:%d: trailer_size:%lld"
+ ": total size:%d:\n",
+ __func__,
+ head_size,
+ downcall_size,
+ op->downcall.trailer_size,
+ total);
+ goto Efault;
+ }
+
+ /* Only READDIR operations should have trailers. */
+ if ((op->downcall.type != ORANGEFS_VFS_OP_READDIR) &&
+ (op->downcall.trailer_size != 0)) {
+ gossip_err("%s: %x operation with trailer.",
+ __func__,
+ op->downcall.type);
+ goto Efault;
+ }
+
+ /* READDIR operations should always have trailers. */
+ if ((op->downcall.type == ORANGEFS_VFS_OP_READDIR) &&
+ (op->downcall.trailer_size == 0)) {
+ gossip_err("%s: %x operation with no trailer.",
+ __func__,
+ op->downcall.type);
+ goto Efault;
+ }
+
+ if (op->downcall.type != ORANGEFS_VFS_OP_READDIR)
+ goto wakeup;
+
+ op->downcall.trailer_buf =
+ vmalloc(op->downcall.trailer_size);
+ if (op->downcall.trailer_buf == NULL) {
+ gossip_err("%s: failed trailer vmalloc.\n",
+ __func__);
+ goto Enomem;
+ }
+ memset(op->downcall.trailer_buf, 0, op->downcall.trailer_size);
+ n = copy_from_iter(op->downcall.trailer_buf,
+ op->downcall.trailer_size,
+ iter);
+ if (n != op->downcall.trailer_size) {
+ gossip_err("%s: failed to copy trailer.\n", __func__);
+ vfree(op->downcall.trailer_buf);
+ goto Efault;
+ }
+
+wakeup:
+ /*
+ * Return to vfs waitqueue, and back to service_operation
+ * through wait_for_matching_downcall.
+ */
+ spin_lock(&op->lock);
+ if (unlikely(op_is_cancel(op))) {
+ spin_unlock(&op->lock);
+ put_cancel(op);
+ } else if (unlikely(op_state_given_up(op))) {
+ spin_unlock(&op->lock);
+ complete(&op->waitq);
+ } else {
+ set_op_state_serviced(op);
+ gossip_debug(GOSSIP_DEV_DEBUG,
+ "%s: op:%s: op_state:%d: process:%s:\n",
+ __func__,
+ get_opname_string(op),
+ op->op_state,
+ current->comm);
+ spin_unlock(&op->lock);
+ }
+ return ret;
+
+Efault:
+ op->downcall.status = -(ORANGEFS_ERROR_BIT | 9);
+ ret = -EFAULT;
+ goto wakeup;
+
+Enomem:
+ op->downcall.status = -(ORANGEFS_ERROR_BIT | 8);
+ ret = -ENOMEM;
+ goto wakeup;
+}
+
+/*
+ * NOTE: gets called when the last reference to this device is dropped.
+ * Using the open_access_count variable, we enforce a reference count
+ * on this file so that it can be opened by only one process at a time.
+ * the devreq_mutex is used to make sure all i/o has completed
+ * before we call orangefs_bufmap_finalize, and similar such tricky
+ * situations
+ */
+static int orangefs_devreq_release(struct inode *inode, struct file *file)
+{
+ int unmounted = 0;
+
+ gossip_debug(GOSSIP_DEV_DEBUG,
+ "%s:pvfs2-client-core: exiting, closing device\n",
+ __func__);
+
+ mutex_lock(&devreq_mutex);
+ orangefs_bufmap_finalize();
+
+ open_access_count = -1;
+
+ unmounted = mark_all_pending_mounts();
+ gossip_debug(GOSSIP_DEV_DEBUG, "ORANGEFS Device Close: Filesystem(s) %s\n",
+ (unmounted ? "UNMOUNTED" : "MOUNTED"));
+
+ purge_waiting_ops();
+ purge_inprogress_ops();
+
+ orangefs_bufmap_run_down();
+
+ gossip_debug(GOSSIP_DEV_DEBUG,
+ "pvfs2-client-core: device close complete\n");
+ open_access_count = 0;
+ mutex_unlock(&devreq_mutex);
+ return 0;
+}
+
+int is_daemon_in_service(void)
+{
+ int in_service;
+
+ /*
+ * What this function does is checks if client-core is alive
+ * based on the access count we maintain on the device.
+ */
+ mutex_lock(&devreq_mutex);
+ in_service = open_access_count == 1 ? 0 : -EIO;
+ mutex_unlock(&devreq_mutex);
+ return in_service;
+}
+
+bool __is_daemon_in_service(void)
+{
+ return open_access_count == 1;
+}
+
+static inline long check_ioctl_command(unsigned int command)
+{
+ /* Check for valid ioctl codes */
+ if (_IOC_TYPE(command) != ORANGEFS_DEV_MAGIC) {
+ gossip_err("device ioctl magic numbers don't match! Did you rebuild pvfs2-client-core/libpvfs2? [cmd %x, magic %x != %x]\n",
+ command,
+ _IOC_TYPE(command),
+ ORANGEFS_DEV_MAGIC);
+ return -EINVAL;
+ }
+ /* and valid ioctl commands */
+ if (_IOC_NR(command) >= ORANGEFS_DEV_MAXNR || _IOC_NR(command) <= 0) {
+ gossip_err("Invalid ioctl command number [%d >= %d]\n",
+ _IOC_NR(command), ORANGEFS_DEV_MAXNR);
+ return -ENOIOCTLCMD;
+ }
+ return 0;
+}
+
+static long dispatch_ioctl_command(unsigned int command, unsigned long arg)
+{
+ static __s32 magic = ORANGEFS_DEVREQ_MAGIC;
+ static __s32 max_up_size = MAX_DEV_REQ_UPSIZE;
+ static __s32 max_down_size = MAX_DEV_REQ_DOWNSIZE;
+ struct ORANGEFS_dev_map_desc user_desc;
+ int ret = 0;
+ struct dev_mask_info_s mask_info = { 0 };
+ struct dev_mask2_info_s mask2_info = { 0, 0 };
+ int upstream_kmod = 1;
+ struct orangefs_sb_info_s *orangefs_sb;
+
+ /* mtmoore: add locking here */
+
+ switch (command) {
+ case ORANGEFS_DEV_GET_MAGIC:
+ return ((put_user(magic, (__s32 __user *) arg) == -EFAULT) ?
+ -EIO :
+ 0);
+ case ORANGEFS_DEV_GET_MAX_UPSIZE:
+ return ((put_user(max_up_size,
+ (__s32 __user *) arg) == -EFAULT) ?
+ -EIO :
+ 0);
+ case ORANGEFS_DEV_GET_MAX_DOWNSIZE:
+ return ((put_user(max_down_size,
+ (__s32 __user *) arg) == -EFAULT) ?
+ -EIO :
+ 0);
+ case ORANGEFS_DEV_MAP:
+ ret = copy_from_user(&user_desc,
+ (struct ORANGEFS_dev_map_desc __user *)
+ arg,
+ sizeof(struct ORANGEFS_dev_map_desc));
+ /* WTF -EIO and not -EFAULT? */
+ return ret ? -EIO : orangefs_bufmap_initialize(&user_desc);
+ case ORANGEFS_DEV_REMOUNT_ALL:
+ gossip_debug(GOSSIP_DEV_DEBUG,
+ "%s: got ORANGEFS_DEV_REMOUNT_ALL\n",
+ __func__);
+
+ /*
+ * remount all mounted orangefs volumes to regain the lost
+ * dynamic mount tables (if any) -- NOTE: this is done
+ * without keeping the superblock list locked due to the
+ * upcall/downcall waiting. also, the request mutex is
+ * used to ensure that no operations will be serviced until
+ * all of the remounts are serviced (to avoid ops between
+ * mounts to fail)
+ */
+ ret = mutex_lock_interruptible(&request_mutex);
+ if (ret < 0)
+ return ret;
+ gossip_debug(GOSSIP_DEV_DEBUG,
+ "%s: priority remount in progress\n",
+ __func__);
+ spin_lock(&orangefs_superblocks_lock);
+ list_for_each_entry(orangefs_sb, &orangefs_superblocks, list) {
+ /*
+ * We have to drop the spinlock, so entries can be
+ * removed. They can't be freed, though, so we just
+ * keep the forward pointers and zero the back ones -
+ * that way we can get to the rest of the list.
+ */
+ if (!orangefs_sb->list.prev)
+ continue;
+ gossip_debug(GOSSIP_DEV_DEBUG,
+ "%s: Remounting SB %p\n",
+ __func__,
+ orangefs_sb);
+
+ spin_unlock(&orangefs_superblocks_lock);
+ ret = orangefs_remount(orangefs_sb);
+ spin_lock(&orangefs_superblocks_lock);
+ if (ret) {
+ gossip_debug(GOSSIP_DEV_DEBUG,
+ "SB %p remount failed\n",
+ orangefs_sb);
+ break;
+ }
+ }
+ spin_unlock(&orangefs_superblocks_lock);
+ gossip_debug(GOSSIP_DEV_DEBUG,
+ "%s: priority remount complete\n",
+ __func__);
+ mutex_unlock(&request_mutex);
+ return ret;
+
+ case ORANGEFS_DEV_UPSTREAM:
+ ret = copy_to_user((void __user *)arg,
+ &upstream_kmod,
+ sizeof(upstream_kmod));
+
+ if (ret != 0)
+ return -EIO;
+ else
+ return ret;
+
+ case ORANGEFS_DEV_CLIENT_MASK:
+ ret = copy_from_user(&mask2_info,
+ (void __user *)arg,
+ sizeof(struct dev_mask2_info_s));
+
+ if (ret != 0)
+ return -EIO;
+
+ client_debug_mask.mask1 = mask2_info.mask1_value;
+ client_debug_mask.mask2 = mask2_info.mask2_value;
+
+ pr_info("%s: client debug mask has been been received "
+ ":%llx: :%llx:\n",
+ __func__,
+ (unsigned long long)client_debug_mask.mask1,
+ (unsigned long long)client_debug_mask.mask2);
+
+ return ret;
+
+ case ORANGEFS_DEV_CLIENT_STRING:
+ ret = copy_from_user(&client_debug_array_string,
+ (void __user *)arg,
+ ORANGEFS_MAX_DEBUG_STRING_LEN);
+ /*
+ * The real client-core makes an effort to ensure
+ * that actual strings that aren't too long to fit in
+ * this buffer is what we get here. We're going to use
+ * string functions on the stuff we got, so we'll make
+ * this extra effort to try and keep from
+ * flowing out of this buffer when we use the string
+ * functions, even if somehow the stuff we end up
+ * with here is garbage.
+ */
+ client_debug_array_string[ORANGEFS_MAX_DEBUG_STRING_LEN - 1] =
+ '\0';
+
+ if (ret != 0) {
+ pr_info("%s: CLIENT_STRING: copy_from_user failed\n",
+ __func__);
+ return -EIO;
+ }
+
+ pr_info("%s: client debug array string has been received.\n",
+ __func__);
+
+ if (!help_string_initialized) {
+
+ /* Free the "we don't know yet" default string... */
+ kfree(debug_help_string);
+
+ /* build a proper debug help string */
+ if (orangefs_prepare_debugfs_help_string(0)) {
+ gossip_err("%s: no debug help string \n",
+ __func__);
+ return -EIO;
+ }
+
+ /* Replace the boilerplate boot-time debug-help file. */
+ debugfs_remove(help_file_dentry);
+
+ help_file_dentry =
+ debugfs_create_file(
+ ORANGEFS_KMOD_DEBUG_HELP_FILE,
+ 0444,
+ debug_dir,
+ debug_help_string,
+ &debug_help_fops);
+
+ if (!help_file_dentry) {
+ gossip_err("%s: debugfs_create_file failed for"
+ " :%s:!\n",
+ __func__,
+ ORANGEFS_KMOD_DEBUG_HELP_FILE);
+ return -EIO;
+ }
+ }
+
+ debug_mask_to_string(&client_debug_mask, 1);
+
+ debugfs_remove(client_debug_dentry);
+
+ orangefs_client_debug_init();
+
+ help_string_initialized++;
+
+ return ret;
+
+ case ORANGEFS_DEV_DEBUG:
+ ret = copy_from_user(&mask_info,
+ (void __user *)arg,
+ sizeof(mask_info));
+
+ if (ret != 0)
+ return -EIO;
+
+ if (mask_info.mask_type == KERNEL_MASK) {
+ if ((mask_info.mask_value == 0)
+ && (kernel_mask_set_mod_init)) {
+ /*
+ * the kernel debug mask was set when the
+ * kernel module was loaded; don't override
+ * it if the client-core was started without
+ * a value for ORANGEFS_KMODMASK.
+ */
+ return 0;
+ }
+ debug_mask_to_string(&mask_info.mask_value,
+ mask_info.mask_type);
+ gossip_debug_mask = mask_info.mask_value;
+ pr_info("%s: kernel debug mask has been modified to "
+ ":%s: :%llx:\n",
+ __func__,
+ kernel_debug_string,
+ (unsigned long long)gossip_debug_mask);
+ } else if (mask_info.mask_type == CLIENT_MASK) {
+ debug_mask_to_string(&mask_info.mask_value,
+ mask_info.mask_type);
+ pr_info("%s: client debug mask has been modified to"
+ ":%s: :%llx:\n",
+ __func__,
+ client_debug_string,
+ llu(mask_info.mask_value));
+ } else {
+ gossip_lerr("Invalid mask type....\n");
+ return -EINVAL;
+ }
+
+ return ret;
+
+ default:
+ return -ENOIOCTLCMD;
+ }
+ return -ENOIOCTLCMD;
+}
+
+static long orangefs_devreq_ioctl(struct file *file,
+ unsigned int command, unsigned long arg)
+{
+ long ret;
+
+ /* Check for properly constructed commands */
+ ret = check_ioctl_command(command);
+ if (ret < 0)
+ return (int)ret;
+
+ return (int)dispatch_ioctl_command(command, arg);
+}
+
+#ifdef CONFIG_COMPAT /* CONFIG_COMPAT is in .config */
+
+/* Compat structure for the ORANGEFS_DEV_MAP ioctl */
+struct ORANGEFS_dev_map_desc32 {
+ compat_uptr_t ptr;
+ __s32 total_size;
+ __s32 size;
+ __s32 count;
+};
+
+static unsigned long translate_dev_map26(unsigned long args, long *error)
+{
+ struct ORANGEFS_dev_map_desc32 __user *p32 = (void __user *)args;
+ /*
+ * Depending on the architecture, allocate some space on the
+ * user-call-stack based on our expected layout.
+ */
+ struct ORANGEFS_dev_map_desc __user *p =
+ compat_alloc_user_space(sizeof(*p));
+ compat_uptr_t addr;
+
+ *error = 0;
+ /* get the ptr from the 32 bit user-space */
+ if (get_user(addr, &p32->ptr))
+ goto err;
+ /* try to put that into a 64-bit layout */
+ if (put_user(compat_ptr(addr), &p->ptr))
+ goto err;
+ /* copy the remaining fields */
+ if (copy_in_user(&p->total_size, &p32->total_size, sizeof(__s32)))
+ goto err;
+ if (copy_in_user(&p->size, &p32->size, sizeof(__s32)))
+ goto err;
+ if (copy_in_user(&p->count, &p32->count, sizeof(__s32)))
+ goto err;
+ return (unsigned long)p;
+err:
+ *error = -EFAULT;
+ return 0;
+}
+
+/*
+ * 32 bit user-space apps' ioctl handlers when kernel modules
+ * is compiled as a 64 bit one
+ */
+static long orangefs_devreq_compat_ioctl(struct file *filp, unsigned int cmd,
+ unsigned long args)
+{
+ long ret;
+ unsigned long arg = args;
+
+ /* Check for properly constructed commands */
+ ret = check_ioctl_command(cmd);
+ if (ret < 0)
+ return ret;
+ if (cmd == ORANGEFS_DEV_MAP) {
+ /*
+ * convert the arguments to what we expect internally
+ * in kernel space
+ */
+ arg = translate_dev_map26(args, &ret);
+ if (ret < 0) {
+ gossip_err("Could not translate dev map\n");
+ return ret;
+ }
+ }
+ /* no other ioctl requires translation */
+ return dispatch_ioctl_command(cmd, arg);
+}
+
+#endif /* CONFIG_COMPAT is in .config */
+
+/* the assigned character device major number */
+static int orangefs_dev_major;
+
+/*
+ * Initialize orangefs device specific state:
+ * Must be called at module load time only
+ */
+int orangefs_dev_init(void)
+{
+ /* register orangefs-req device */
+ orangefs_dev_major = register_chrdev(0,
+ ORANGEFS_REQDEVICE_NAME,
+ &orangefs_devreq_file_operations);
+ if (orangefs_dev_major < 0) {
+ gossip_debug(GOSSIP_DEV_DEBUG,
+ "Failed to register /dev/%s (error %d)\n",
+ ORANGEFS_REQDEVICE_NAME, orangefs_dev_major);
+ return orangefs_dev_major;
+ }
+
+ gossip_debug(GOSSIP_DEV_DEBUG,
+ "*** /dev/%s character device registered ***\n",
+ ORANGEFS_REQDEVICE_NAME);
+ gossip_debug(GOSSIP_DEV_DEBUG, "'mknod /dev/%s c %d 0'.\n",
+ ORANGEFS_REQDEVICE_NAME, orangefs_dev_major);
+ return 0;
+}
+
+void orangefs_dev_cleanup(void)
+{
+ unregister_chrdev(orangefs_dev_major, ORANGEFS_REQDEVICE_NAME);
+ gossip_debug(GOSSIP_DEV_DEBUG,
+ "*** /dev/%s character device unregistered ***\n",
+ ORANGEFS_REQDEVICE_NAME);
+}
+
+static unsigned int orangefs_devreq_poll(struct file *file,
+ struct poll_table_struct *poll_table)
+{
+ int poll_revent_mask = 0;
+
+ poll_wait(file, &orangefs_request_list_waitq, poll_table);
+
+ if (!list_empty(&orangefs_request_list))
+ poll_revent_mask |= POLL_IN;
+ return poll_revent_mask;
+}
+
+const struct file_operations orangefs_devreq_file_operations = {
+ .owner = THIS_MODULE,
+ .read = orangefs_devreq_read,
+ .write_iter = orangefs_devreq_write_iter,
+ .open = orangefs_devreq_open,
+ .release = orangefs_devreq_release,
+ .unlocked_ioctl = orangefs_devreq_ioctl,
+
+#ifdef CONFIG_COMPAT /* CONFIG_COMPAT is in .config */
+ .compat_ioctl = orangefs_devreq_compat_ioctl,
+#endif
+ .poll = orangefs_devreq_poll
+};
diff --git a/fs/orangefs/dir.c b/fs/orangefs/dir.c
new file mode 100644
index 000000000000..ba7dec40771e
--- /dev/null
+++ b/fs/orangefs/dir.c
@@ -0,0 +1,398 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+
+#include "protocol.h"
+#include "orangefs-kernel.h"
+#include "orangefs-bufmap.h"
+
+/*
+ * decode routine used by kmod to deal with the blob sent from
+ * userspace for readdirs. The blob contains zero or more of these
+ * sub-blobs:
+ * __u32 - represents length of the character string that follows.
+ * string - between 1 and ORANGEFS_NAME_MAX bytes long.
+ * padding - (if needed) to cause the __u32 plus the string to be
+ * eight byte aligned.
+ * khandle - sizeof(khandle) bytes.
+ */
+static long decode_dirents(char *ptr, size_t size,
+ struct orangefs_readdir_response_s *readdir)
+{
+ int i;
+ struct orangefs_readdir_response_s *rd =
+ (struct orangefs_readdir_response_s *) ptr;
+ char *buf = ptr;
+ int khandle_size = sizeof(struct orangefs_khandle);
+ size_t offset = offsetof(struct orangefs_readdir_response_s,
+ dirent_array);
+ /* 8 reflects eight byte alignment */
+ int smallest_blob = khandle_size + 8;
+ __u32 len;
+ int aligned_len;
+ int sizeof_u32 = sizeof(__u32);
+ long ret;
+
+ gossip_debug(GOSSIP_DIR_DEBUG, "%s: size:%zu:\n", __func__, size);
+
+ /* size is = offset on empty dirs, > offset on non-empty dirs... */
+ if (size < offset) {
+ gossip_err("%s: size:%zu: offset:%zu:\n",
+ __func__,
+ size,
+ offset);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if ((size == offset) && (readdir->orangefs_dirent_outcount != 0)) {
+ gossip_err("%s: size:%zu: dirent_outcount:%d:\n",
+ __func__,
+ size,
+ readdir->orangefs_dirent_outcount);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ readdir->token = rd->token;
+ readdir->orangefs_dirent_outcount = rd->orangefs_dirent_outcount;
+ readdir->dirent_array = kcalloc(readdir->orangefs_dirent_outcount,
+ sizeof(*readdir->dirent_array),
+ GFP_KERNEL);
+ if (readdir->dirent_array == NULL) {
+ gossip_err("%s: kcalloc failed.\n", __func__);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ buf += offset;
+ size -= offset;
+
+ for (i = 0; i < readdir->orangefs_dirent_outcount; i++) {
+ if (size < smallest_blob) {
+ gossip_err("%s: size:%zu: smallest_blob:%d:\n",
+ __func__,
+ size,
+ smallest_blob);
+ ret = -EINVAL;
+ goto free;
+ }
+
+ len = *(__u32 *)buf;
+ if ((len < 1) || (len > ORANGEFS_NAME_MAX)) {
+ gossip_err("%s: len:%d:\n", __func__, len);
+ ret = -EINVAL;
+ goto free;
+ }
+
+ gossip_debug(GOSSIP_DIR_DEBUG,
+ "%s: size:%zu: len:%d:\n",
+ __func__,
+ size,
+ len);
+
+ readdir->dirent_array[i].d_name = buf + sizeof_u32;
+ readdir->dirent_array[i].d_length = len;
+
+ /*
+ * Calculate "aligned" length of this string and its
+ * associated __u32 descriptor.
+ */
+ aligned_len = ((sizeof_u32 + len + 1) + 7) & ~7;
+ gossip_debug(GOSSIP_DIR_DEBUG,
+ "%s: aligned_len:%d:\n",
+ __func__,
+ aligned_len);
+
+ /*
+ * The end of the blob should coincide with the end
+ * of the last sub-blob.
+ */
+ if (size < aligned_len + khandle_size) {
+ gossip_err("%s: ran off the end of the blob.\n",
+ __func__);
+ ret = -EINVAL;
+ goto free;
+ }
+ size -= aligned_len + khandle_size;
+
+ buf += aligned_len;
+
+ readdir->dirent_array[i].khandle =
+ *(struct orangefs_khandle *) buf;
+ buf += khandle_size;
+ }
+ ret = buf - ptr;
+ gossip_debug(GOSSIP_DIR_DEBUG, "%s: returning:%ld:\n", __func__, ret);
+ goto out;
+
+free:
+ kfree(readdir->dirent_array);
+ readdir->dirent_array = NULL;
+
+out:
+ return ret;
+}
+
+/*
+ * Read directory entries from an instance of an open directory.
+ */
+static int orangefs_readdir(struct file *file, struct dir_context *ctx)
+{
+ int ret = 0;
+ int buffer_index;
+ /*
+ * ptoken supports Orangefs' distributed directory logic, added
+ * in 2.9.2.
+ */
+ __u64 *ptoken = file->private_data;
+ __u64 pos = 0;
+ ino_t ino = 0;
+ struct dentry *dentry = file->f_path.dentry;
+ struct orangefs_kernel_op_s *new_op = NULL;
+ struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(dentry->d_inode);
+ int buffer_full = 0;
+ struct orangefs_readdir_response_s readdir_response;
+ void *dents_buf;
+ int i = 0;
+ int len = 0;
+ ino_t current_ino = 0;
+ char *current_entry = NULL;
+ long bytes_decoded;
+
+ gossip_debug(GOSSIP_DIR_DEBUG,
+ "%s: ctx->pos:%lld, ptoken = %llu\n",
+ __func__,
+ lld(ctx->pos),
+ llu(*ptoken));
+
+ pos = (__u64) ctx->pos;
+
+ /* are we done? */
+ if (pos == ORANGEFS_READDIR_END) {
+ gossip_debug(GOSSIP_DIR_DEBUG,
+ "Skipping to termination path\n");
+ return 0;
+ }
+
+ gossip_debug(GOSSIP_DIR_DEBUG,
+ "orangefs_readdir called on %s (pos=%llu)\n",
+ dentry->d_name.name, llu(pos));
+
+ memset(&readdir_response, 0, sizeof(readdir_response));
+
+ new_op = op_alloc(ORANGEFS_VFS_OP_READDIR);
+ if (!new_op)
+ return -ENOMEM;
+
+ /*
+ * Only the indices are shared. No memory is actually shared, but the
+ * mechanism is used.
+ */
+ new_op->uses_shared_memory = 1;
+ new_op->upcall.req.readdir.refn = orangefs_inode->refn;
+ new_op->upcall.req.readdir.max_dirent_count =
+ ORANGEFS_MAX_DIRENT_COUNT_READDIR;
+
+ gossip_debug(GOSSIP_DIR_DEBUG,
+ "%s: upcall.req.readdir.refn.khandle: %pU\n",
+ __func__,
+ &new_op->upcall.req.readdir.refn.khandle);
+
+ new_op->upcall.req.readdir.token = *ptoken;
+
+get_new_buffer_index:
+ buffer_index = orangefs_readdir_index_get();
+ if (buffer_index < 0) {
+ ret = buffer_index;
+ gossip_lerr("orangefs_readdir: orangefs_readdir_index_get() failure (%d)\n",
+ ret);
+ goto out_free_op;
+ }
+ new_op->upcall.req.readdir.buf_index = buffer_index;
+
+ ret = service_operation(new_op,
+ "orangefs_readdir",
+ get_interruptible_flag(dentry->d_inode));
+
+ gossip_debug(GOSSIP_DIR_DEBUG,
+ "Readdir downcall status is %d. ret:%d\n",
+ new_op->downcall.status,
+ ret);
+
+ orangefs_readdir_index_put(buffer_index);
+
+ if (ret == -EAGAIN && op_state_purged(new_op)) {
+ /* Client-core indices are invalid after it restarted. */
+ gossip_debug(GOSSIP_DIR_DEBUG,
+ "%s: Getting new buffer_index for retry of readdir..\n",
+ __func__);
+ goto get_new_buffer_index;
+ }
+
+ if (ret == -EIO && op_state_purged(new_op)) {
+ gossip_err("%s: Client is down. Aborting readdir call.\n",
+ __func__);
+ goto out_free_op;
+ }
+
+ if (ret < 0 || new_op->downcall.status != 0) {
+ gossip_debug(GOSSIP_DIR_DEBUG,
+ "Readdir request failed. Status:%d\n",
+ new_op->downcall.status);
+ if (ret >= 0)
+ ret = new_op->downcall.status;
+ goto out_free_op;
+ }
+
+ dents_buf = new_op->downcall.trailer_buf;
+ if (dents_buf == NULL) {
+ gossip_err("Invalid NULL buffer in readdir response\n");
+ ret = -ENOMEM;
+ goto out_free_op;
+ }
+
+ bytes_decoded = decode_dirents(dents_buf, new_op->downcall.trailer_size,
+ &readdir_response);
+ if (bytes_decoded < 0) {
+ ret = bytes_decoded;
+ gossip_err("Could not decode readdir from buffer %d\n", ret);
+ goto out_vfree;
+ }
+
+ if (bytes_decoded != new_op->downcall.trailer_size) {
+ gossip_err("orangefs_readdir: # bytes decoded (%ld) "
+ "!= trailer size (%ld)\n",
+ bytes_decoded,
+ (long)new_op->downcall.trailer_size);
+ ret = -EINVAL;
+ goto out_destroy_handle;
+ }
+
+ /*
+ * orangefs doesn't actually store dot and dot-dot, but
+ * we need to have them represented.
+ */
+ if (pos == 0) {
+ ino = get_ino_from_khandle(dentry->d_inode);
+ gossip_debug(GOSSIP_DIR_DEBUG,
+ "%s: calling dir_emit of \".\" with pos = %llu\n",
+ __func__,
+ llu(pos));
+ ret = dir_emit(ctx, ".", 1, ino, DT_DIR);
+ pos += 1;
+ }
+
+ if (pos == 1) {
+ ino = get_parent_ino_from_dentry(dentry);
+ gossip_debug(GOSSIP_DIR_DEBUG,
+ "%s: calling dir_emit of \"..\" with pos = %llu\n",
+ __func__,
+ llu(pos));
+ ret = dir_emit(ctx, "..", 2, ino, DT_DIR);
+ pos += 1;
+ }
+
+ /*
+ * we stored ORANGEFS_ITERATE_NEXT in ctx->pos last time around
+ * to prevent "finding" dot and dot-dot on any iteration
+ * other than the first.
+ */
+ if (ctx->pos == ORANGEFS_ITERATE_NEXT)
+ ctx->pos = 0;
+
+ gossip_debug(GOSSIP_DIR_DEBUG,
+ "%s: dirent_outcount:%d:\n",
+ __func__,
+ readdir_response.orangefs_dirent_outcount);
+ for (i = ctx->pos;
+ i < readdir_response.orangefs_dirent_outcount;
+ i++) {
+ len = readdir_response.dirent_array[i].d_length;
+ current_entry = readdir_response.dirent_array[i].d_name;
+ current_ino = orangefs_khandle_to_ino(
+ &readdir_response.dirent_array[i].khandle);
+
+ gossip_debug(GOSSIP_DIR_DEBUG,
+ "calling dir_emit for %s with len %d"
+ ", ctx->pos %ld\n",
+ current_entry,
+ len,
+ (unsigned long)ctx->pos);
+ /*
+ * type is unknown. We don't return object type
+ * in the dirent_array. This leaves getdents
+ * clueless about type.
+ */
+ ret =
+ dir_emit(ctx, current_entry, len, current_ino, DT_UNKNOWN);
+ if (!ret)
+ break;
+ ctx->pos++;
+ gossip_debug(GOSSIP_DIR_DEBUG,
+ "%s: ctx->pos:%lld\n",
+ __func__,
+ lld(ctx->pos));
+
+ }
+
+ /*
+ * we ran all the way through the last batch, set up for
+ * getting another batch...
+ */
+ if (ret) {
+ *ptoken = readdir_response.token;
+ ctx->pos = ORANGEFS_ITERATE_NEXT;
+ }
+
+ /*
+ * Did we hit the end of the directory?
+ */
+ if (readdir_response.token == ORANGEFS_READDIR_END &&
+ !buffer_full) {
+ gossip_debug(GOSSIP_DIR_DEBUG,
+ "End of dir detected; setting ctx->pos to ORANGEFS_READDIR_END.\n");
+ ctx->pos = ORANGEFS_READDIR_END;
+ }
+
+out_destroy_handle:
+ /* kfree(NULL) is safe */
+ kfree(readdir_response.dirent_array);
+out_vfree:
+ gossip_debug(GOSSIP_DIR_DEBUG, "vfree %p\n", dents_buf);
+ vfree(dents_buf);
+out_free_op:
+ op_release(new_op);
+ gossip_debug(GOSSIP_DIR_DEBUG, "orangefs_readdir returning %d\n", ret);
+ return ret;
+}
+
+static int orangefs_dir_open(struct inode *inode, struct file *file)
+{
+ __u64 *ptoken;
+
+ file->private_data = kmalloc(sizeof(__u64), GFP_KERNEL);
+ if (!file->private_data)
+ return -ENOMEM;
+
+ ptoken = file->private_data;
+ *ptoken = ORANGEFS_READDIR_START;
+ return 0;
+}
+
+static int orangefs_dir_release(struct inode *inode, struct file *file)
+{
+ orangefs_flush_inode(inode);
+ kfree(file->private_data);
+ return 0;
+}
+
+/** ORANGEFS implementation of VFS directory operations */
+const struct file_operations orangefs_dir_operations = {
+ .read = generic_read_dir,
+ .iterate = orangefs_readdir,
+ .open = orangefs_dir_open,
+ .release = orangefs_dir_release,
+};
diff --git a/fs/orangefs/downcall.h b/fs/orangefs/downcall.h
new file mode 100644
index 000000000000..66b99210f1f9
--- /dev/null
+++ b/fs/orangefs/downcall.h
@@ -0,0 +1,133 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+
+/*
+ * Definitions of downcalls used in Linux kernel module.
+ */
+
+#ifndef __DOWNCALL_H
+#define __DOWNCALL_H
+
+/*
+ * Sanitized the device-client core interaction
+ * for clean 32-64 bit usage
+ */
+struct orangefs_io_response {
+ __s64 amt_complete;
+};
+
+struct orangefs_lookup_response {
+ struct orangefs_object_kref refn;
+};
+
+struct orangefs_create_response {
+ struct orangefs_object_kref refn;
+};
+
+struct orangefs_symlink_response {
+ struct orangefs_object_kref refn;
+};
+
+struct orangefs_getattr_response {
+ struct ORANGEFS_sys_attr_s attributes;
+ char link_target[ORANGEFS_NAME_MAX];
+};
+
+struct orangefs_mkdir_response {
+ struct orangefs_object_kref refn;
+};
+
+/*
+ * duplication of some system interface structures so that I don't have
+ * to allocate extra memory
+ */
+struct orangefs_dirent {
+ char *d_name;
+ int d_length;
+ struct orangefs_khandle khandle;
+};
+
+struct orangefs_statfs_response {
+ __s64 block_size;
+ __s64 blocks_total;
+ __s64 blocks_avail;
+ __s64 files_total;
+ __s64 files_avail;
+};
+
+struct orangefs_fs_mount_response {
+ __s32 fs_id;
+ __s32 id;
+ struct orangefs_khandle root_khandle;
+};
+
+/* the getxattr response is the attribute value */
+struct orangefs_getxattr_response {
+ __s32 val_sz;
+ __s32 __pad1;
+ char val[ORANGEFS_MAX_XATTR_VALUELEN];
+};
+
+/* the listxattr response is an array of attribute names */
+struct orangefs_listxattr_response {
+ __s32 returned_count;
+ __s32 __pad1;
+ __u64 token;
+ char key[ORANGEFS_MAX_XATTR_LISTLEN * ORANGEFS_MAX_XATTR_NAMELEN];
+ __s32 keylen;
+ __s32 __pad2;
+ __s32 lengths[ORANGEFS_MAX_XATTR_LISTLEN];
+};
+
+struct orangefs_param_response {
+ __s64 value;
+};
+
+#define PERF_COUNT_BUF_SIZE 4096
+struct orangefs_perf_count_response {
+ char buffer[PERF_COUNT_BUF_SIZE];
+};
+
+#define FS_KEY_BUF_SIZE 4096
+struct orangefs_fs_key_response {
+ __s32 fs_keylen;
+ __s32 __pad1;
+ char fs_key[FS_KEY_BUF_SIZE];
+};
+
+struct orangefs_downcall_s {
+ __s32 type;
+ __s32 status;
+ /* currently trailer is used only by readdir */
+ __s64 trailer_size;
+ char *trailer_buf;
+
+ union {
+ struct orangefs_io_response io;
+ struct orangefs_lookup_response lookup;
+ struct orangefs_create_response create;
+ struct orangefs_symlink_response sym;
+ struct orangefs_getattr_response getattr;
+ struct orangefs_mkdir_response mkdir;
+ struct orangefs_statfs_response statfs;
+ struct orangefs_fs_mount_response fs_mount;
+ struct orangefs_getxattr_response getxattr;
+ struct orangefs_listxattr_response listxattr;
+ struct orangefs_param_response param;
+ struct orangefs_perf_count_response perf_count;
+ struct orangefs_fs_key_response fs_key;
+ } resp;
+};
+
+struct orangefs_readdir_response_s {
+ __u64 token;
+ __u64 directory_version;
+ __u32 __pad2;
+ __u32 orangefs_dirent_outcount;
+ struct orangefs_dirent *dirent_array;
+};
+
+#endif /* __DOWNCALL_H */
diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c
new file mode 100644
index 000000000000..ae92795ed965
--- /dev/null
+++ b/fs/orangefs/file.c
@@ -0,0 +1,717 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+
+/*
+ * Linux VFS file operations.
+ */
+
+#include "protocol.h"
+#include "orangefs-kernel.h"
+#include "orangefs-bufmap.h"
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+
+/*
+ * Copy to client-core's address space from the buffers specified
+ * by the iovec upto total_size bytes.
+ * NOTE: the iovector can either contain addresses which
+ * can futher be kernel-space or user-space addresses.
+ * or it can pointers to struct page's
+ */
+static int precopy_buffers(int buffer_index,
+ struct iov_iter *iter,
+ size_t total_size)
+{
+ int ret = 0;
+ /*
+ * copy data from application/kernel by pulling it out
+ * of the iovec.
+ */
+
+
+ if (total_size) {
+ ret = orangefs_bufmap_copy_from_iovec(iter,
+ buffer_index,
+ total_size);
+ if (ret < 0)
+ gossip_err("%s: Failed to copy-in buffers. Please make sure that the pvfs2-client is running. %ld\n",
+ __func__,
+ (long)ret);
+ }
+
+ if (ret < 0)
+ gossip_err("%s: Failed to copy-in buffers. Please make sure that the pvfs2-client is running. %ld\n",
+ __func__,
+ (long)ret);
+ return ret;
+}
+
+/*
+ * Copy from client-core's address space to the buffers specified
+ * by the iovec upto total_size bytes.
+ * NOTE: the iovector can either contain addresses which
+ * can futher be kernel-space or user-space addresses.
+ * or it can pointers to struct page's
+ */
+static int postcopy_buffers(int buffer_index,
+ struct iov_iter *iter,
+ size_t total_size)
+{
+ int ret = 0;
+ /*
+ * copy data to application/kernel by pushing it out to
+ * the iovec. NOTE; target buffers can be addresses or
+ * struct page pointers.
+ */
+ if (total_size) {
+ ret = orangefs_bufmap_copy_to_iovec(iter,
+ buffer_index,
+ total_size);
+ if (ret < 0)
+ gossip_err("%s: Failed to copy-out buffers. Please make sure that the pvfs2-client is running (%ld)\n",
+ __func__,
+ (long)ret);
+ }
+ return ret;
+}
+
+/*
+ * Post and wait for the I/O upcall to finish
+ */
+static ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inode,
+ loff_t *offset, struct iov_iter *iter,
+ size_t total_size, loff_t readahead_size)
+{
+ struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
+ struct orangefs_khandle *handle = &orangefs_inode->refn.khandle;
+ struct orangefs_kernel_op_s *new_op = NULL;
+ struct iov_iter saved = *iter;
+ int buffer_index = -1;
+ ssize_t ret;
+
+ new_op = op_alloc(ORANGEFS_VFS_OP_FILE_IO);
+ if (!new_op)
+ return -ENOMEM;
+
+ /* synchronous I/O */
+ new_op->upcall.req.io.readahead_size = readahead_size;
+ new_op->upcall.req.io.io_type = type;
+ new_op->upcall.req.io.refn = orangefs_inode->refn;
+
+populate_shared_memory:
+ /* get a shared buffer index */
+ buffer_index = orangefs_bufmap_get();
+ if (buffer_index < 0) {
+ ret = buffer_index;
+ gossip_debug(GOSSIP_FILE_DEBUG,
+ "%s: orangefs_bufmap_get failure (%zd)\n",
+ __func__, ret);
+ goto out;
+ }
+ gossip_debug(GOSSIP_FILE_DEBUG,
+ "%s(%pU): GET op %p -> buffer_index %d\n",
+ __func__,
+ handle,
+ new_op,
+ buffer_index);
+
+ new_op->uses_shared_memory = 1;
+ new_op->upcall.req.io.buf_index = buffer_index;
+ new_op->upcall.req.io.count = total_size;
+ new_op->upcall.req.io.offset = *offset;
+
+ gossip_debug(GOSSIP_FILE_DEBUG,
+ "%s(%pU): offset: %llu total_size: %zd\n",
+ __func__,
+ handle,
+ llu(*offset),
+ total_size);
+ /*
+ * Stage 1: copy the buffers into client-core's address space
+ * precopy_buffers only pertains to writes.
+ */
+ if (type == ORANGEFS_IO_WRITE) {
+ ret = precopy_buffers(buffer_index,
+ iter,
+ total_size);
+ if (ret < 0)
+ goto out;
+ }
+
+ gossip_debug(GOSSIP_FILE_DEBUG,
+ "%s(%pU): Calling post_io_request with tag (%llu)\n",
+ __func__,
+ handle,
+ llu(new_op->tag));
+
+ /* Stage 2: Service the I/O operation */
+ ret = service_operation(new_op,
+ type == ORANGEFS_IO_WRITE ?
+ "file_write" :
+ "file_read",
+ get_interruptible_flag(inode));
+
+ /*
+ * If service_operation() returns -EAGAIN #and# the operation was
+ * purged from orangefs_request_list or htable_ops_in_progress, then
+ * we know that the client was restarted, causing the shared memory
+ * area to be wiped clean. To restart a write operation in this
+ * case, we must re-copy the data from the user's iovec to a NEW
+ * shared memory location. To restart a read operation, we must get
+ * a new shared memory location.
+ */
+ if (ret == -EAGAIN && op_state_purged(new_op)) {
+ orangefs_bufmap_put(buffer_index);
+ buffer_index = -1;
+ if (type == ORANGEFS_IO_WRITE)
+ *iter = saved;
+ gossip_debug(GOSSIP_FILE_DEBUG,
+ "%s:going to repopulate_shared_memory.\n",
+ __func__);
+ goto populate_shared_memory;
+ }
+
+ if (ret < 0) {
+ if (ret == -EINTR) {
+ /*
+ * We can't return EINTR if any data was written,
+ * it's not POSIX. It is minimally acceptable
+ * to give a partial write, the way NFS does.
+ *
+ * It would be optimal to return all or nothing,
+ * but if a userspace write is bigger than
+ * an IO buffer, and the interrupt occurs
+ * between buffer writes, that would not be
+ * possible.
+ */
+ switch (new_op->op_state - OP_VFS_STATE_GIVEN_UP) {
+ /*
+ * If the op was waiting when the interrupt
+ * occurred, then the client-core did not
+ * trigger the write.
+ */
+ case OP_VFS_STATE_WAITING:
+ if (*offset == 0)
+ ret = -EINTR;
+ else
+ ret = 0;
+ break;
+ /*
+ * If the op was in progress when the interrupt
+ * occurred, then the client-core was able to
+ * trigger the write.
+ */
+ case OP_VFS_STATE_INPROGR:
+ ret = total_size;
+ break;
+ default:
+ gossip_err("%s: unexpected op state :%d:.\n",
+ __func__,
+ new_op->op_state);
+ ret = 0;
+ break;
+ }
+ gossip_debug(GOSSIP_FILE_DEBUG,
+ "%s: got EINTR, state:%d: %p\n",
+ __func__,
+ new_op->op_state,
+ new_op);
+ } else {
+ gossip_err("%s: error in %s handle %pU, returning %zd\n",
+ __func__,
+ type == ORANGEFS_IO_READ ?
+ "read from" : "write to",
+ handle, ret);
+ }
+ if (orangefs_cancel_op_in_progress(new_op))
+ return ret;
+
+ goto out;
+ }
+
+ /*
+ * Stage 3: Post copy buffers from client-core's address space
+ * postcopy_buffers only pertains to reads.
+ */
+ if (type == ORANGEFS_IO_READ) {
+ ret = postcopy_buffers(buffer_index,
+ iter,
+ new_op->downcall.resp.io.amt_complete);
+ if (ret < 0)
+ goto out;
+ }
+ gossip_debug(GOSSIP_FILE_DEBUG,
+ "%s(%pU): Amount %s, returned by the sys-io call:%d\n",
+ __func__,
+ handle,
+ type == ORANGEFS_IO_READ ? "read" : "written",
+ (int)new_op->downcall.resp.io.amt_complete);
+
+ ret = new_op->downcall.resp.io.amt_complete;
+
+out:
+ if (buffer_index >= 0) {
+ orangefs_bufmap_put(buffer_index);
+ gossip_debug(GOSSIP_FILE_DEBUG,
+ "%s(%pU): PUT buffer_index %d\n",
+ __func__, handle, buffer_index);
+ buffer_index = -1;
+ }
+ op_release(new_op);
+ return ret;
+}
+
+/*
+ * Common entry point for read/write/readv/writev
+ * This function will dispatch it to either the direct I/O
+ * or buffered I/O path depending on the mount options and/or
+ * augmented/extended metadata attached to the file.
+ * Note: File extended attributes override any mount options.
+ */
+static ssize_t do_readv_writev(enum ORANGEFS_io_type type, struct file *file,
+ loff_t *offset, struct iov_iter *iter)
+{
+ struct inode *inode = file->f_mapping->host;
+ struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
+ struct orangefs_khandle *handle = &orangefs_inode->refn.khandle;
+ size_t count = iov_iter_count(iter);
+ ssize_t total_count = 0;
+ ssize_t ret = -EINVAL;
+
+ gossip_debug(GOSSIP_FILE_DEBUG,
+ "%s-BEGIN(%pU): count(%d) after estimate_max_iovecs.\n",
+ __func__,
+ handle,
+ (int)count);
+
+ if (type == ORANGEFS_IO_WRITE) {
+ gossip_debug(GOSSIP_FILE_DEBUG,
+ "%s(%pU): proceeding with offset : %llu, "
+ "size %d\n",
+ __func__,
+ handle,
+ llu(*offset),
+ (int)count);
+ }
+
+ if (count == 0) {
+ ret = 0;
+ goto out;
+ }
+
+ while (iov_iter_count(iter)) {
+ size_t each_count = iov_iter_count(iter);
+ size_t amt_complete;
+
+ /* how much to transfer in this loop iteration */
+ if (each_count > orangefs_bufmap_size_query())
+ each_count = orangefs_bufmap_size_query();
+
+ gossip_debug(GOSSIP_FILE_DEBUG,
+ "%s(%pU): size of each_count(%d)\n",
+ __func__,
+ handle,
+ (int)each_count);
+ gossip_debug(GOSSIP_FILE_DEBUG,
+ "%s(%pU): BEFORE wait_for_io: offset is %d\n",
+ __func__,
+ handle,
+ (int)*offset);
+
+ ret = wait_for_direct_io(type, inode, offset, iter,
+ each_count, 0);
+ gossip_debug(GOSSIP_FILE_DEBUG,
+ "%s(%pU): return from wait_for_io:%d\n",
+ __func__,
+ handle,
+ (int)ret);
+
+ if (ret < 0)
+ goto out;
+
+ *offset += ret;
+ total_count += ret;
+ amt_complete = ret;
+
+ gossip_debug(GOSSIP_FILE_DEBUG,
+ "%s(%pU): AFTER wait_for_io: offset is %d\n",
+ __func__,
+ handle,
+ (int)*offset);
+
+ /*
+ * if we got a short I/O operations,
+ * fall out and return what we got so far
+ */
+ if (amt_complete < each_count)
+ break;
+ } /*end while */
+
+out:
+ if (total_count > 0)
+ ret = total_count;
+ if (ret > 0) {
+ if (type == ORANGEFS_IO_READ) {
+ file_accessed(file);
+ } else {
+ SetMtimeFlag(orangefs_inode);
+ inode->i_mtime = CURRENT_TIME;
+ mark_inode_dirty_sync(inode);
+ }
+ }
+
+ gossip_debug(GOSSIP_FILE_DEBUG,
+ "%s(%pU): Value(%d) returned.\n",
+ __func__,
+ handle,
+ (int)ret);
+
+ return ret;
+}
+
+/*
+ * Read data from a specified offset in a file (referenced by inode).
+ * Data may be placed either in a user or kernel buffer.
+ */
+ssize_t orangefs_inode_read(struct inode *inode,
+ struct iov_iter *iter,
+ loff_t *offset,
+ loff_t readahead_size)
+{
+ struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
+ size_t count = iov_iter_count(iter);
+ size_t bufmap_size;
+ ssize_t ret = -EINVAL;
+
+ g_orangefs_stats.reads++;
+
+ bufmap_size = orangefs_bufmap_size_query();
+ if (count > bufmap_size) {
+ gossip_debug(GOSSIP_FILE_DEBUG,
+ "%s: count is too large (%zd/%zd)!\n",
+ __func__, count, bufmap_size);
+ return -EINVAL;
+ }
+
+ gossip_debug(GOSSIP_FILE_DEBUG,
+ "%s(%pU) %zd@%llu\n",
+ __func__,
+ &orangefs_inode->refn.khandle,
+ count,
+ llu(*offset));
+
+ ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, offset, iter,
+ count, readahead_size);
+ if (ret > 0)
+ *offset += ret;
+
+ gossip_debug(GOSSIP_FILE_DEBUG,
+ "%s(%pU): Value(%zd) returned.\n",
+ __func__,
+ &orangefs_inode->refn.khandle,
+ ret);
+
+ return ret;
+}
+
+static ssize_t orangefs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct file *file = iocb->ki_filp;
+ loff_t pos = *(&iocb->ki_pos);
+ ssize_t rc = 0;
+
+ BUG_ON(iocb->private);
+
+ gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_file_read_iter\n");
+
+ g_orangefs_stats.reads++;
+
+ rc = do_readv_writev(ORANGEFS_IO_READ, file, &pos, iter);
+ iocb->ki_pos = pos;
+
+ return rc;
+}
+
+static ssize_t orangefs_file_write_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct file *file = iocb->ki_filp;
+ loff_t pos;
+ ssize_t rc;
+
+ BUG_ON(iocb->private);
+
+ gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_file_write_iter\n");
+
+ mutex_lock(&file->f_mapping->host->i_mutex);
+
+ /* Make sure generic_write_checks sees an up to date inode size. */
+ if (file->f_flags & O_APPEND) {
+ rc = orangefs_inode_getattr(file->f_mapping->host, 0, 1);
+ if (rc == -ESTALE)
+ rc = -EIO;
+ if (rc) {
+ gossip_err("%s: orangefs_inode_getattr failed, "
+ "rc:%zd:.\n", __func__, rc);
+ goto out;
+ }
+ }
+
+ if (file->f_pos > i_size_read(file->f_mapping->host))
+ orangefs_i_size_write(file->f_mapping->host, file->f_pos);
+
+ rc = generic_write_checks(iocb, iter);
+
+ if (rc <= 0) {
+ gossip_err("%s: generic_write_checks failed, rc:%zd:.\n",
+ __func__, rc);
+ goto out;
+ }
+
+ /*
+ * if we are appending, generic_write_checks would have updated
+ * pos to the end of the file, so we will wait till now to set
+ * pos...
+ */
+ pos = *(&iocb->ki_pos);
+
+ rc = do_readv_writev(ORANGEFS_IO_WRITE,
+ file,
+ &pos,
+ iter);
+ if (rc < 0) {
+ gossip_err("%s: do_readv_writev failed, rc:%zd:.\n",
+ __func__, rc);
+ goto out;
+ }
+
+ iocb->ki_pos = pos;
+ g_orangefs_stats.writes++;
+
+out:
+
+ mutex_unlock(&file->f_mapping->host->i_mutex);
+ return rc;
+}
+
+/*
+ * Perform a miscellaneous operation on a file.
+ */
+static long orangefs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ int ret = -ENOTTY;
+ __u64 val = 0;
+ unsigned long uval;
+
+ gossip_debug(GOSSIP_FILE_DEBUG,
+ "orangefs_ioctl: called with cmd %d\n",
+ cmd);
+
+ /*
+ * we understand some general ioctls on files, such as the immutable
+ * and append flags
+ */
+ if (cmd == FS_IOC_GETFLAGS) {
+ val = 0;
+ ret = orangefs_inode_getxattr(file_inode(file),
+ ORANGEFS_XATTR_NAME_DEFAULT_PREFIX,
+ "user.pvfs2.meta_hint",
+ &val, sizeof(val));
+ if (ret < 0 && ret != -ENODATA)
+ return ret;
+ else if (ret == -ENODATA)
+ val = 0;
+ uval = val;
+ gossip_debug(GOSSIP_FILE_DEBUG,
+ "orangefs_ioctl: FS_IOC_GETFLAGS: %llu\n",
+ (unsigned long long)uval);
+ return put_user(uval, (int __user *)arg);
+ } else if (cmd == FS_IOC_SETFLAGS) {
+ ret = 0;
+ if (get_user(uval, (int __user *)arg))
+ return -EFAULT;
+ /*
+ * ORANGEFS_MIRROR_FL is set internally when the mirroring mode
+ * is turned on for a file. The user is not allowed to turn
+ * on this bit, but the bit is present if the user first gets
+ * the flags and then updates the flags with some new
+ * settings. So, we ignore it in the following edit. bligon.
+ */
+ if ((uval & ~ORANGEFS_MIRROR_FL) &
+ (~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NOATIME_FL))) {
+ gossip_err("orangefs_ioctl: the FS_IOC_SETFLAGS only supports setting one of FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NOATIME_FL\n");
+ return -EINVAL;
+ }
+ val = uval;
+ gossip_debug(GOSSIP_FILE_DEBUG,
+ "orangefs_ioctl: FS_IOC_SETFLAGS: %llu\n",
+ (unsigned long long)val);
+ ret = orangefs_inode_setxattr(file_inode(file),
+ ORANGEFS_XATTR_NAME_DEFAULT_PREFIX,
+ "user.pvfs2.meta_hint",
+ &val, sizeof(val), 0);
+ }
+
+ return ret;
+}
+
+/*
+ * Memory map a region of a file.
+ */
+static int orangefs_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ gossip_debug(GOSSIP_FILE_DEBUG,
+ "orangefs_file_mmap: called on %s\n",
+ (file ?
+ (char *)file->f_path.dentry->d_name.name :
+ (char *)"Unknown"));
+
+ /* set the sequential readahead hint */
+ vma->vm_flags |= VM_SEQ_READ;
+ vma->vm_flags &= ~VM_RAND_READ;
+
+ /* Use readonly mmap since we cannot support writable maps. */
+ return generic_file_readonly_mmap(file, vma);
+}
+
+#define mapping_nrpages(idata) ((idata)->nrpages)
+
+/*
+ * Called to notify the module that there are no more references to
+ * this file (i.e. no processes have it open).
+ *
+ * \note Not called when each file is closed.
+ */
+static int orangefs_file_release(struct inode *inode, struct file *file)
+{
+ gossip_debug(GOSSIP_FILE_DEBUG,
+ "orangefs_file_release: called on %s\n",
+ file->f_path.dentry->d_name.name);
+
+ orangefs_flush_inode(inode);
+
+ /*
+ * remove all associated inode pages from the page cache and mmap
+ * readahead cache (if any); this forces an expensive refresh of
+ * data for the next caller of mmap (or 'get_block' accesses)
+ */
+ if (file->f_path.dentry->d_inode &&
+ file->f_path.dentry->d_inode->i_mapping &&
+ mapping_nrpages(&file->f_path.dentry->d_inode->i_data))
+ truncate_inode_pages(file->f_path.dentry->d_inode->i_mapping,
+ 0);
+ return 0;
+}
+
+/*
+ * Push all data for a specific file onto permanent storage.
+ */
+static int orangefs_fsync(struct file *file,
+ loff_t start,
+ loff_t end,
+ int datasync)
+{
+ int ret = -EINVAL;
+ struct orangefs_inode_s *orangefs_inode =
+ ORANGEFS_I(file->f_path.dentry->d_inode);
+ struct orangefs_kernel_op_s *new_op = NULL;
+
+ /* required call */
+ filemap_write_and_wait_range(file->f_mapping, start, end);
+
+ new_op = op_alloc(ORANGEFS_VFS_OP_FSYNC);
+ if (!new_op)
+ return -ENOMEM;
+ new_op->upcall.req.fsync.refn = orangefs_inode->refn;
+
+ ret = service_operation(new_op,
+ "orangefs_fsync",
+ get_interruptible_flag(file->f_path.dentry->d_inode));
+
+ gossip_debug(GOSSIP_FILE_DEBUG,
+ "orangefs_fsync got return value of %d\n",
+ ret);
+
+ op_release(new_op);
+
+ orangefs_flush_inode(file->f_path.dentry->d_inode);
+ return ret;
+}
+
+/*
+ * Change the file pointer position for an instance of an open file.
+ *
+ * \note If .llseek is overriden, we must acquire lock as described in
+ * Documentation/filesystems/Locking.
+ *
+ * Future upgrade could support SEEK_DATA and SEEK_HOLE but would
+ * require much changes to the FS
+ */
+static loff_t orangefs_file_llseek(struct file *file, loff_t offset, int origin)
+{
+ int ret = -EINVAL;
+ struct inode *inode = file_inode(file);
+
+ if (origin == SEEK_END) {
+ /*
+ * revalidate the inode's file size.
+ * NOTE: We are only interested in file size here,
+ * so we set mask accordingly.
+ */
+ ret = orangefs_inode_getattr(file->f_mapping->host, 0, 1);
+ if (ret == -ESTALE)
+ ret = -EIO;
+ if (ret) {
+ gossip_debug(GOSSIP_FILE_DEBUG,
+ "%s:%s:%d calling make bad inode\n",
+ __FILE__,
+ __func__,
+ __LINE__);
+ return ret;
+ }
+ }
+
+ gossip_debug(GOSSIP_FILE_DEBUG,
+ "orangefs_file_llseek: offset is %ld | origin is %d"
+ " | inode size is %lu\n",
+ (long)offset,
+ origin,
+ (unsigned long)i_size_read(inode));
+
+ return generic_file_llseek(file, offset, origin);
+}
+
+/*
+ * Support local locks (locks that only this kernel knows about)
+ * if Orangefs was mounted -o local_lock.
+ */
+static int orangefs_lock(struct file *filp, int cmd, struct file_lock *fl)
+{
+ int rc = -EINVAL;
+
+ if (ORANGEFS_SB(filp->f_inode->i_sb)->flags & ORANGEFS_OPT_LOCAL_LOCK) {
+ if (cmd == F_GETLK) {
+ rc = 0;
+ posix_test_lock(filp, fl);
+ } else {
+ rc = posix_lock_file(filp, fl, NULL);
+ }
+ }
+
+ return rc;
+}
+
+/** ORANGEFS implementation of VFS file operations */
+const struct file_operations orangefs_file_operations = {
+ .llseek = orangefs_file_llseek,
+ .read_iter = orangefs_file_read_iter,
+ .write_iter = orangefs_file_write_iter,
+ .lock = orangefs_lock,
+ .unlocked_ioctl = orangefs_ioctl,
+ .mmap = orangefs_file_mmap,
+ .open = generic_file_open,
+ .release = orangefs_file_release,
+ .fsync = orangefs_fsync,
+};
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
new file mode 100644
index 000000000000..2382e267b49e
--- /dev/null
+++ b/fs/orangefs/inode.c
@@ -0,0 +1,475 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+
+/*
+ * Linux VFS inode operations.
+ */
+
+#include "protocol.h"
+#include "orangefs-kernel.h"
+#include "orangefs-bufmap.h"
+
+static int read_one_page(struct page *page)
+{
+ int ret;
+ int max_block;
+ ssize_t bytes_read = 0;
+ struct inode *inode = page->mapping->host;
+ const __u32 blocksize = PAGE_CACHE_SIZE; /* inode->i_blksize */
+ const __u32 blockbits = PAGE_CACHE_SHIFT; /* inode->i_blkbits */
+ struct iov_iter to;
+ struct bio_vec bv = {.bv_page = page, .bv_len = PAGE_SIZE};
+
+ iov_iter_bvec(&to, ITER_BVEC | READ, &bv, 1, PAGE_SIZE);
+
+ gossip_debug(GOSSIP_INODE_DEBUG,
+ "orangefs_readpage called with page %p\n",
+ page);
+
+ max_block = ((inode->i_size / blocksize) + 1);
+
+ if (page->index < max_block) {
+ loff_t blockptr_offset = (((loff_t) page->index) << blockbits);
+
+ bytes_read = orangefs_inode_read(inode,
+ &to,
+ &blockptr_offset,
+ inode->i_size);
+ }
+ /* this will only zero remaining unread portions of the page data */
+ iov_iter_zero(~0U, &to);
+ /* takes care of potential aliasing */
+ flush_dcache_page(page);
+ if (bytes_read < 0) {
+ ret = bytes_read;
+ SetPageError(page);
+ } else {
+ SetPageUptodate(page);
+ if (PageError(page))
+ ClearPageError(page);
+ ret = 0;
+ }
+ /* unlock the page after the ->readpage() routine completes */
+ unlock_page(page);
+ return ret;
+}
+
+static int orangefs_readpage(struct file *file, struct page *page)
+{
+ return read_one_page(page);
+}
+
+static int orangefs_readpages(struct file *file,
+ struct address_space *mapping,
+ struct list_head *pages,
+ unsigned nr_pages)
+{
+ int page_idx;
+ int ret;
+
+ gossip_debug(GOSSIP_INODE_DEBUG, "orangefs_readpages called\n");
+
+ for (page_idx = 0; page_idx < nr_pages; page_idx++) {
+ struct page *page;
+
+ page = list_entry(pages->prev, struct page, lru);
+ list_del(&page->lru);
+ if (!add_to_page_cache(page,
+ mapping,
+ page->index,
+ GFP_KERNEL)) {
+ ret = read_one_page(page);
+ gossip_debug(GOSSIP_INODE_DEBUG,
+ "failure adding page to cache, read_one_page returned: %d\n",
+ ret);
+ } else {
+ page_cache_release(page);
+ }
+ }
+ BUG_ON(!list_empty(pages));
+ return 0;
+}
+
+static void orangefs_invalidatepage(struct page *page,
+ unsigned int offset,
+ unsigned int length)
+{
+ gossip_debug(GOSSIP_INODE_DEBUG,
+ "orangefs_invalidatepage called on page %p "
+ "(offset is %u)\n",
+ page,
+ offset);
+
+ ClearPageUptodate(page);
+ ClearPageMappedToDisk(page);
+ return;
+
+}
+
+static int orangefs_releasepage(struct page *page, gfp_t foo)
+{
+ gossip_debug(GOSSIP_INODE_DEBUG,
+ "orangefs_releasepage called on page %p\n",
+ page);
+ return 0;
+}
+
+/*
+ * Having a direct_IO entry point in the address_space_operations
+ * struct causes the kernel to allows us to use O_DIRECT on
+ * open. Nothing will ever call this thing, but in the future we
+ * will need to be able to use O_DIRECT on open in order to support
+ * AIO. Modeled after NFS, they do this too.
+ */
+/*
+ * static ssize_t orangefs_direct_IO(int rw,
+ * struct kiocb *iocb,
+ * struct iov_iter *iter,
+ * loff_t offset)
+ *{
+ * gossip_debug(GOSSIP_INODE_DEBUG,
+ * "orangefs_direct_IO: %s\n",
+ * iocb->ki_filp->f_path.dentry->d_name.name);
+ *
+ * return -EINVAL;
+ *}
+ */
+
+struct backing_dev_info orangefs_backing_dev_info = {
+ .name = "orangefs",
+ .ra_pages = 0,
+ .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
+};
+
+/** ORANGEFS2 implementation of address space operations */
+const struct address_space_operations orangefs_address_operations = {
+ .readpage = orangefs_readpage,
+ .readpages = orangefs_readpages,
+ .invalidatepage = orangefs_invalidatepage,
+ .releasepage = orangefs_releasepage,
+/* .direct_IO = orangefs_direct_IO */
+};
+
+static int orangefs_setattr_size(struct inode *inode, struct iattr *iattr)
+{
+ struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
+ struct orangefs_kernel_op_s *new_op;
+ loff_t orig_size;
+ int ret = -EINVAL;
+
+ gossip_debug(GOSSIP_INODE_DEBUG,
+ "%s: %pU: Handle is %pU | fs_id %d | size is %llu\n",
+ __func__,
+ get_khandle_from_ino(inode),
+ &orangefs_inode->refn.khandle,
+ orangefs_inode->refn.fs_id,
+ iattr->ia_size);
+
+ /* Ensure that we have a up to date size, so we know if it changed. */
+ ret = orangefs_inode_getattr(inode, 0, 1);
+ if (ret == -ESTALE)
+ ret = -EIO;
+ if (ret) {
+ gossip_err("%s: orangefs_inode_getattr failed, ret:%d:.\n",
+ __func__, ret);
+ return ret;
+ }
+ orig_size = i_size_read(inode);
+
+ truncate_setsize(inode, iattr->ia_size);
+
+ new_op = op_alloc(ORANGEFS_VFS_OP_TRUNCATE);
+ if (!new_op)
+ return -ENOMEM;
+
+ new_op->upcall.req.truncate.refn = orangefs_inode->refn;
+ new_op->upcall.req.truncate.size = (__s64) iattr->ia_size;
+
+ ret = service_operation(new_op, __func__,
+ get_interruptible_flag(inode));
+
+ /*
+ * the truncate has no downcall members to retrieve, but
+ * the status value tells us if it went through ok or not
+ */
+ gossip_debug(GOSSIP_INODE_DEBUG,
+ "orangefs: orangefs_truncate got return value of %d\n",
+ ret);
+
+ op_release(new_op);
+
+ if (ret != 0)
+ return ret;
+
+ /*
+ * Only change the c/mtime if we are changing the size or we are
+ * explicitly asked to change it. This handles the semantic difference
+ * between truncate() and ftruncate() as implemented in the VFS.
+ *
+ * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
+ * special case where we need to update the times despite not having
+ * these flags set. For all other operations the VFS set these flags
+ * explicitly if it wants a timestamp update.
+ */
+ if (orig_size != i_size_read(inode) &&
+ !(iattr->ia_valid & (ATTR_CTIME | ATTR_MTIME))) {
+ iattr->ia_ctime = iattr->ia_mtime =
+ current_fs_time(inode->i_sb);
+ iattr->ia_valid |= ATTR_CTIME | ATTR_MTIME;
+ }
+
+ return ret;
+}
+
+/*
+ * Change attributes of an object referenced by dentry.
+ */
+int orangefs_setattr(struct dentry *dentry, struct iattr *iattr)
+{
+ int ret = -EINVAL;
+ struct inode *inode = dentry->d_inode;
+
+ gossip_debug(GOSSIP_INODE_DEBUG,
+ "orangefs_setattr: called on %s\n",
+ dentry->d_name.name);
+
+ ret = inode_change_ok(inode, iattr);
+ if (ret)
+ goto out;
+
+ if ((iattr->ia_valid & ATTR_SIZE) &&
+ iattr->ia_size != i_size_read(inode)) {
+ ret = orangefs_setattr_size(inode, iattr);
+ if (ret)
+ goto out;
+ }
+
+ setattr_copy(inode, iattr);
+ mark_inode_dirty(inode);
+
+ ret = orangefs_inode_setattr(inode, iattr);
+ gossip_debug(GOSSIP_INODE_DEBUG,
+ "orangefs_setattr: inode_setattr returned %d\n",
+ ret);
+
+ if (!ret && (iattr->ia_valid & ATTR_MODE))
+ /* change mod on a file that has ACLs */
+ ret = posix_acl_chmod(inode, inode->i_mode);
+
+out:
+ gossip_debug(GOSSIP_INODE_DEBUG, "orangefs_setattr: returning %d\n", ret);
+ return ret;
+}
+
+/*
+ * Obtain attributes of an object given a dentry
+ */
+int orangefs_getattr(struct vfsmount *mnt,
+ struct dentry *dentry,
+ struct kstat *kstat)
+{
+ int ret = -ENOENT;
+ struct inode *inode = dentry->d_inode;
+ struct orangefs_inode_s *orangefs_inode = NULL;
+
+ gossip_debug(GOSSIP_INODE_DEBUG,
+ "orangefs_getattr: called on %s\n",
+ dentry->d_name.name);
+
+ ret = orangefs_inode_getattr(inode, 0, 1);
+ if (ret == 0) {
+ generic_fillattr(inode, kstat);
+
+ /* override block size reported to stat */
+ orangefs_inode = ORANGEFS_I(inode);
+ kstat->blksize = orangefs_inode->blksize;
+ }
+ return ret;
+}
+
+int orangefs_permission(struct inode *inode, int mask)
+{
+ int ret;
+
+ if (mask & MAY_NOT_BLOCK)
+ return -ECHILD;
+
+ gossip_debug(GOSSIP_INODE_DEBUG, "%s: refreshing\n", __func__);
+
+ /* Make sure the permission (and other common attrs) are up to date. */
+ ret = orangefs_inode_getattr(inode, 0, 0);
+ if (ret < 0)
+ return ret;
+
+ return generic_permission(inode, mask);
+}
+
+/* ORANGEDS2 implementation of VFS inode operations for files */
+struct inode_operations orangefs_file_inode_operations = {
+ .get_acl = orangefs_get_acl,
+ .set_acl = orangefs_set_acl,
+ .setattr = orangefs_setattr,
+ .getattr = orangefs_getattr,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .listxattr = orangefs_listxattr,
+ .removexattr = generic_removexattr,
+ .permission = orangefs_permission,
+};
+
+static int orangefs_init_iops(struct inode *inode)
+{
+ inode->i_mapping->a_ops = &orangefs_address_operations;
+
+ switch (inode->i_mode & S_IFMT) {
+ case S_IFREG:
+ inode->i_op = &orangefs_file_inode_operations;
+ inode->i_fop = &orangefs_file_operations;
+ inode->i_blkbits = PAGE_CACHE_SHIFT;
+ break;
+ case S_IFLNK:
+ inode->i_op = &orangefs_symlink_inode_operations;
+ break;
+ case S_IFDIR:
+ inode->i_op = &orangefs_dir_inode_operations;
+ inode->i_fop = &orangefs_dir_operations;
+ break;
+ default:
+ gossip_debug(GOSSIP_INODE_DEBUG,
+ "%s: unsupported mode\n",
+ __func__);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/*
+ * Given a ORANGEFS object identifier (fsid, handle), convert it into a ino_t type
+ * that will be used as a hash-index from where the handle will
+ * be searched for in the VFS hash table of inodes.
+ */
+static inline ino_t orangefs_handle_hash(struct orangefs_object_kref *ref)
+{
+ if (!ref)
+ return 0;
+ return orangefs_khandle_to_ino(&(ref->khandle));
+}
+
+/*
+ * Called to set up an inode from iget5_locked.
+ */
+static int orangefs_set_inode(struct inode *inode, void *data)
+{
+ struct orangefs_object_kref *ref = (struct orangefs_object_kref *) data;
+ ORANGEFS_I(inode)->refn.fs_id = ref->fs_id;
+ ORANGEFS_I(inode)->refn.khandle = ref->khandle;
+ return 0;
+}
+
+/*
+ * Called to determine if handles match.
+ */
+static int orangefs_test_inode(struct inode *inode, void *data)
+{
+ struct orangefs_object_kref *ref = (struct orangefs_object_kref *) data;
+ struct orangefs_inode_s *orangefs_inode = NULL;
+
+ orangefs_inode = ORANGEFS_I(inode);
+ return (!ORANGEFS_khandle_cmp(&(orangefs_inode->refn.khandle), &(ref->khandle))
+ && orangefs_inode->refn.fs_id == ref->fs_id);
+}
+
+/*
+ * Front-end to lookup the inode-cache maintained by the VFS using the ORANGEFS
+ * file handle.
+ *
+ * @sb: the file system super block instance.
+ * @ref: The ORANGEFS object for which we are trying to locate an inode structure.
+ */
+struct inode *orangefs_iget(struct super_block *sb, struct orangefs_object_kref *ref)
+{
+ struct inode *inode = NULL;
+ unsigned long hash;
+ int error;
+
+ hash = orangefs_handle_hash(ref);
+ inode = iget5_locked(sb, hash, orangefs_test_inode, orangefs_set_inode, ref);
+ if (!inode || !(inode->i_state & I_NEW))
+ return inode;
+
+ error = orangefs_inode_getattr(inode, 1, 0);
+ if (error) {
+ iget_failed(inode);
+ return ERR_PTR(error);
+ }
+
+ inode->i_ino = hash; /* needed for stat etc */
+ orangefs_init_iops(inode);
+ unlock_new_inode(inode);
+
+ gossip_debug(GOSSIP_INODE_DEBUG,
+ "iget handle %pU, fsid %d hash %ld i_ino %lu\n",
+ &ref->khandle,
+ ref->fs_id,
+ hash,
+ inode->i_ino);
+
+ return inode;
+}
+
+/*
+ * Allocate an inode for a newly created file and insert it into the inode hash.
+ */
+struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir,
+ int mode, dev_t dev, struct orangefs_object_kref *ref)
+{
+ unsigned long hash = orangefs_handle_hash(ref);
+ struct inode *inode;
+ int error;
+
+ gossip_debug(GOSSIP_INODE_DEBUG,
+ "%s:(sb is %p | MAJOR(dev)=%u | MINOR(dev)=%u mode=%o)\n",
+ __func__,
+ sb,
+ MAJOR(dev),
+ MINOR(dev),
+ mode);
+
+ inode = new_inode(sb);
+ if (!inode)
+ return NULL;
+
+ orangefs_set_inode(inode, ref);
+ inode->i_ino = hash; /* needed for stat etc */
+
+ error = orangefs_inode_getattr(inode, 1, 0);
+ if (error)
+ goto out_iput;
+
+ orangefs_init_iops(inode);
+
+ inode->i_mode = mode;
+ inode->i_uid = current_fsuid();
+ inode->i_gid = current_fsgid();
+ inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ inode->i_size = PAGE_CACHE_SIZE;
+ inode->i_rdev = dev;
+
+ error = insert_inode_locked4(inode, hash, orangefs_test_inode, ref);
+ if (error < 0)
+ goto out_iput;
+
+ gossip_debug(GOSSIP_INODE_DEBUG,
+ "Initializing ACL's for inode %pU\n",
+ get_khandle_from_ino(inode));
+ orangefs_init_acl(inode, dir);
+ return inode;
+
+out_iput:
+ iput(inode);
+ return ERR_PTR(error);
+}
diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c
new file mode 100644
index 000000000000..5a60c508af4e
--- /dev/null
+++ b/fs/orangefs/namei.c
@@ -0,0 +1,462 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+
+/*
+ * Linux VFS namei operations.
+ */
+
+#include "protocol.h"
+#include "orangefs-kernel.h"
+
+/*
+ * Get a newly allocated inode to go with a negative dentry.
+ */
+static int orangefs_create(struct inode *dir,
+ struct dentry *dentry,
+ umode_t mode,
+ bool exclusive)
+{
+ struct orangefs_inode_s *parent = ORANGEFS_I(dir);
+ struct orangefs_kernel_op_s *new_op;
+ struct inode *inode;
+ int ret;
+
+ gossip_debug(GOSSIP_NAME_DEBUG, "%s: %s\n",
+ __func__,
+ dentry->d_name.name);
+
+ new_op = op_alloc(ORANGEFS_VFS_OP_CREATE);
+ if (!new_op)
+ return -ENOMEM;
+
+ new_op->upcall.req.create.parent_refn = parent->refn;
+
+ fill_default_sys_attrs(new_op->upcall.req.create.attributes,
+ ORANGEFS_TYPE_METAFILE, mode);
+
+ strncpy(new_op->upcall.req.create.d_name,
+ dentry->d_name.name, ORANGEFS_NAME_MAX);
+
+ ret = service_operation(new_op, __func__, get_interruptible_flag(dir));
+
+ gossip_debug(GOSSIP_NAME_DEBUG,
+ "%s: %s: handle:%pU: fsid:%d: new_op:%p: ret:%d:\n",
+ __func__,
+ dentry->d_name.name,
+ &new_op->downcall.resp.create.refn.khandle,
+ new_op->downcall.resp.create.refn.fs_id,
+ new_op,
+ ret);
+
+ if (ret < 0)
+ goto out;
+
+ inode = orangefs_new_inode(dir->i_sb, dir, S_IFREG | mode, 0,
+ &new_op->downcall.resp.create.refn);
+ if (IS_ERR(inode)) {
+ gossip_err("%s: Failed to allocate inode for file :%s:\n",
+ __func__,
+ dentry->d_name.name);
+ ret = PTR_ERR(inode);
+ goto out;
+ }
+
+ gossip_debug(GOSSIP_NAME_DEBUG,
+ "%s: Assigned inode :%pU: for file :%s:\n",
+ __func__,
+ get_khandle_from_ino(inode),
+ dentry->d_name.name);
+
+ d_instantiate(dentry, inode);
+ unlock_new_inode(inode);
+
+ gossip_debug(GOSSIP_NAME_DEBUG,
+ "%s: dentry instantiated for %s\n",
+ __func__,
+ dentry->d_name.name);
+
+ SetMtimeFlag(parent);
+ dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb);
+ mark_inode_dirty_sync(dir);
+ ret = 0;
+out:
+ op_release(new_op);
+ gossip_debug(GOSSIP_NAME_DEBUG,
+ "%s: %s: returning %d\n",
+ __func__,
+ dentry->d_name.name,
+ ret);
+ return ret;
+}
+
+/*
+ * Attempt to resolve an object name (dentry->d_name), parent handle, and
+ * fsid into a handle for the object.
+ */
+static struct dentry *orangefs_lookup(struct inode *dir, struct dentry *dentry,
+ unsigned int flags)
+{
+ struct orangefs_inode_s *parent = ORANGEFS_I(dir);
+ struct orangefs_kernel_op_s *new_op;
+ struct inode *inode;
+ struct dentry *res;
+ int ret = -EINVAL;
+
+ /*
+ * in theory we could skip a lookup here (if the intent is to
+ * create) in order to avoid a potentially failed lookup, but
+ * leaving it in can skip a valid lookup and try to create a file
+ * that already exists (e.g. the vfs already handles checking for
+ * -EEXIST on O_EXCL opens, which is broken if we skip this lookup
+ * in the create path)
+ */
+ gossip_debug(GOSSIP_NAME_DEBUG, "%s called on %s\n",
+ __func__, dentry->d_name.name);
+
+ if (dentry->d_name.len > (ORANGEFS_NAME_MAX - 1))
+ return ERR_PTR(-ENAMETOOLONG);
+
+ new_op = op_alloc(ORANGEFS_VFS_OP_LOOKUP);
+ if (!new_op)
+ return ERR_PTR(-ENOMEM);
+
+ new_op->upcall.req.lookup.sym_follow = ORANGEFS_LOOKUP_LINK_NO_FOLLOW;
+
+ gossip_debug(GOSSIP_NAME_DEBUG, "%s:%s:%d using parent %pU\n",
+ __FILE__,
+ __func__,
+ __LINE__,
+ &parent->refn.khandle);
+ new_op->upcall.req.lookup.parent_refn = parent->refn;
+
+ strncpy(new_op->upcall.req.lookup.d_name, dentry->d_name.name,
+ ORANGEFS_NAME_MAX);
+
+ gossip_debug(GOSSIP_NAME_DEBUG,
+ "%s: doing lookup on %s under %pU,%d\n",
+ __func__,
+ new_op->upcall.req.lookup.d_name,
+ &new_op->upcall.req.lookup.parent_refn.khandle,
+ new_op->upcall.req.lookup.parent_refn.fs_id);
+
+ ret = service_operation(new_op, __func__, get_interruptible_flag(dir));
+
+ gossip_debug(GOSSIP_NAME_DEBUG,
+ "Lookup Got %pU, fsid %d (ret=%d)\n",
+ &new_op->downcall.resp.lookup.refn.khandle,
+ new_op->downcall.resp.lookup.refn.fs_id,
+ ret);
+
+ if (ret < 0) {
+ if (ret == -ENOENT) {
+ /*
+ * if no inode was found, add a negative dentry to
+ * dcache anyway; if we don't, we don't hold expected
+ * lookup semantics and we most noticeably break
+ * during directory renames.
+ *
+ * however, if the operation failed or exited, do not
+ * add the dentry (e.g. in the case that a touch is
+ * issued on a file that already exists that was
+ * interrupted during this lookup -- no need to add
+ * another negative dentry for an existing file)
+ */
+
+ gossip_debug(GOSSIP_NAME_DEBUG,
+ "orangefs_lookup: Adding *negative* dentry "
+ "%p for %s\n",
+ dentry,
+ dentry->d_name.name);
+
+ d_add(dentry, NULL);
+ res = NULL;
+ goto out;
+ }
+
+ /* must be a non-recoverable error */
+ res = ERR_PTR(ret);
+ goto out;
+ }
+
+ inode = orangefs_iget(dir->i_sb, &new_op->downcall.resp.lookup.refn);
+ if (IS_ERR(inode)) {
+ gossip_debug(GOSSIP_NAME_DEBUG,
+ "error %ld from iget\n", PTR_ERR(inode));
+ res = ERR_CAST(inode);
+ goto out;
+ }
+
+ gossip_debug(GOSSIP_NAME_DEBUG,
+ "%s:%s:%d "
+ "Found good inode [%lu] with count [%d]\n",
+ __FILE__,
+ __func__,
+ __LINE__,
+ inode->i_ino,
+ (int)atomic_read(&inode->i_count));
+
+ /* update dentry/inode pair into dcache */
+ res = d_splice_alias(inode, dentry);
+
+ gossip_debug(GOSSIP_NAME_DEBUG,
+ "Lookup success (inode ct = %d)\n",
+ (int)atomic_read(&inode->i_count));
+out:
+ op_release(new_op);
+ return res;
+}
+
+/* return 0 on success; non-zero otherwise */
+static int orangefs_unlink(struct inode *dir, struct dentry *dentry)
+{
+ struct inode *inode = dentry->d_inode;
+ struct orangefs_inode_s *parent = ORANGEFS_I(dir);
+ struct orangefs_kernel_op_s *new_op;
+ int ret;
+
+ gossip_debug(GOSSIP_NAME_DEBUG,
+ "%s: called on %s\n"
+ " (inode %pU): Parent is %pU | fs_id %d\n",
+ __func__,
+ dentry->d_name.name,
+ get_khandle_from_ino(inode),
+ &parent->refn.khandle,
+ parent->refn.fs_id);
+
+ new_op = op_alloc(ORANGEFS_VFS_OP_REMOVE);
+ if (!new_op)
+ return -ENOMEM;
+
+ new_op->upcall.req.remove.parent_refn = parent->refn;
+ strncpy(new_op->upcall.req.remove.d_name, dentry->d_name.name,
+ ORANGEFS_NAME_MAX);
+
+ ret = service_operation(new_op, "orangefs_unlink",
+ get_interruptible_flag(inode));
+
+ gossip_debug(GOSSIP_NAME_DEBUG,
+ "%s: service_operation returned:%d:\n",
+ __func__,
+ ret);
+
+ op_release(new_op);
+
+ if (!ret) {
+ drop_nlink(inode);
+
+ SetMtimeFlag(parent);
+ dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb);
+ mark_inode_dirty_sync(dir);
+ }
+ return ret;
+}
+
+static int orangefs_symlink(struct inode *dir,
+ struct dentry *dentry,
+ const char *symname)
+{
+ struct orangefs_inode_s *parent = ORANGEFS_I(dir);
+ struct orangefs_kernel_op_s *new_op;
+ struct inode *inode;
+ int mode = 755;
+ int ret;
+
+ gossip_debug(GOSSIP_NAME_DEBUG, "%s: called\n", __func__);
+
+ if (!symname)
+ return -EINVAL;
+
+ if (strlen(symname)+1 > ORANGEFS_NAME_MAX)
+ return -ENAMETOOLONG;
+
+ new_op = op_alloc(ORANGEFS_VFS_OP_SYMLINK);
+ if (!new_op)
+ return -ENOMEM;
+
+ new_op->upcall.req.sym.parent_refn = parent->refn;
+
+ fill_default_sys_attrs(new_op->upcall.req.sym.attributes,
+ ORANGEFS_TYPE_SYMLINK,
+ mode);
+
+ strncpy(new_op->upcall.req.sym.entry_name,
+ dentry->d_name.name,
+ ORANGEFS_NAME_MAX);
+ strncpy(new_op->upcall.req.sym.target, symname, ORANGEFS_NAME_MAX);
+
+ ret = service_operation(new_op, __func__, get_interruptible_flag(dir));
+
+ gossip_debug(GOSSIP_NAME_DEBUG,
+ "Symlink Got ORANGEFS handle %pU on fsid %d (ret=%d)\n",
+ &new_op->downcall.resp.sym.refn.khandle,
+ new_op->downcall.resp.sym.refn.fs_id, ret);
+
+ if (ret < 0) {
+ gossip_debug(GOSSIP_NAME_DEBUG,
+ "%s: failed with error code %d\n",
+ __func__, ret);
+ goto out;
+ }
+
+ inode = orangefs_new_inode(dir->i_sb, dir, S_IFLNK | mode, 0,
+ &new_op->downcall.resp.sym.refn);
+ if (IS_ERR(inode)) {
+ gossip_err
+ ("*** Failed to allocate orangefs symlink inode\n");
+ ret = PTR_ERR(inode);
+ goto out;
+ }
+
+ gossip_debug(GOSSIP_NAME_DEBUG,
+ "Assigned symlink inode new number of %pU\n",
+ get_khandle_from_ino(inode));
+
+ d_instantiate(dentry, inode);
+ unlock_new_inode(inode);
+
+ gossip_debug(GOSSIP_NAME_DEBUG,
+ "Inode (Symlink) %pU -> %s\n",
+ get_khandle_from_ino(inode),
+ dentry->d_name.name);
+
+ SetMtimeFlag(parent);
+ dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb);
+ mark_inode_dirty_sync(dir);
+ ret = 0;
+out:
+ op_release(new_op);
+ return ret;
+}
+
+static int orangefs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+ struct orangefs_inode_s *parent = ORANGEFS_I(dir);
+ struct orangefs_kernel_op_s *new_op;
+ struct inode *inode;
+ int ret;
+
+ new_op = op_alloc(ORANGEFS_VFS_OP_MKDIR);
+ if (!new_op)
+ return -ENOMEM;
+
+ new_op->upcall.req.mkdir.parent_refn = parent->refn;
+
+ fill_default_sys_attrs(new_op->upcall.req.mkdir.attributes,
+ ORANGEFS_TYPE_DIRECTORY, mode);
+
+ strncpy(new_op->upcall.req.mkdir.d_name,
+ dentry->d_name.name, ORANGEFS_NAME_MAX);
+
+ ret = service_operation(new_op, __func__, get_interruptible_flag(dir));
+
+ gossip_debug(GOSSIP_NAME_DEBUG,
+ "Mkdir Got ORANGEFS handle %pU on fsid %d\n",
+ &new_op->downcall.resp.mkdir.refn.khandle,
+ new_op->downcall.resp.mkdir.refn.fs_id);
+
+ if (ret < 0) {
+ gossip_debug(GOSSIP_NAME_DEBUG,
+ "%s: failed with error code %d\n",
+ __func__, ret);
+ goto out;
+ }
+
+ inode = orangefs_new_inode(dir->i_sb, dir, S_IFDIR | mode, 0,
+ &new_op->downcall.resp.mkdir.refn);
+ if (IS_ERR(inode)) {
+ gossip_err("*** Failed to allocate orangefs dir inode\n");
+ ret = PTR_ERR(inode);
+ goto out;
+ }
+
+ gossip_debug(GOSSIP_NAME_DEBUG,
+ "Assigned dir inode new number of %pU\n",
+ get_khandle_from_ino(inode));
+
+ d_instantiate(dentry, inode);
+ unlock_new_inode(inode);
+
+ gossip_debug(GOSSIP_NAME_DEBUG,
+ "Inode (Directory) %pU -> %s\n",
+ get_khandle_from_ino(inode),
+ dentry->d_name.name);
+
+ /*
+ * NOTE: we have no good way to keep nlink consistent for directories
+ * across clients; keep constant at 1.
+ */
+ SetMtimeFlag(parent);
+ dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb);
+ mark_inode_dirty_sync(dir);
+out:
+ op_release(new_op);
+ return ret;
+}
+
+static int orangefs_rename(struct inode *old_dir,
+ struct dentry *old_dentry,
+ struct inode *new_dir,
+ struct dentry *new_dentry)
+{
+ struct orangefs_kernel_op_s *new_op;
+ int ret;
+
+ gossip_debug(GOSSIP_NAME_DEBUG,
+ "orangefs_rename: called (%s/%s => %s/%s) ct=%d\n",
+ old_dentry->d_parent->d_name.name,
+ old_dentry->d_name.name,
+ new_dentry->d_parent->d_name.name,
+ new_dentry->d_name.name,
+ d_count(new_dentry));
+
+ new_op = op_alloc(ORANGEFS_VFS_OP_RENAME);
+ if (!new_op)
+ return -EINVAL;
+
+ new_op->upcall.req.rename.old_parent_refn = ORANGEFS_I(old_dir)->refn;
+ new_op->upcall.req.rename.new_parent_refn = ORANGEFS_I(new_dir)->refn;
+
+ strncpy(new_op->upcall.req.rename.d_old_name,
+ old_dentry->d_name.name,
+ ORANGEFS_NAME_MAX);
+ strncpy(new_op->upcall.req.rename.d_new_name,
+ new_dentry->d_name.name,
+ ORANGEFS_NAME_MAX);
+
+ ret = service_operation(new_op,
+ "orangefs_rename",
+ get_interruptible_flag(old_dentry->d_inode));
+
+ gossip_debug(GOSSIP_NAME_DEBUG,
+ "orangefs_rename: got downcall status %d\n",
+ ret);
+
+ if (new_dentry->d_inode)
+ new_dentry->d_inode->i_ctime = CURRENT_TIME;
+
+ op_release(new_op);
+ return ret;
+}
+
+/* ORANGEFS implementation of VFS inode operations for directories */
+struct inode_operations orangefs_dir_inode_operations = {
+ .lookup = orangefs_lookup,
+ .get_acl = orangefs_get_acl,
+ .set_acl = orangefs_set_acl,
+ .create = orangefs_create,
+ .unlink = orangefs_unlink,
+ .symlink = orangefs_symlink,
+ .mkdir = orangefs_mkdir,
+ .rmdir = orangefs_unlink,
+ .rename = orangefs_rename,
+ .setattr = orangefs_setattr,
+ .getattr = orangefs_getattr,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .removexattr = generic_removexattr,
+ .listxattr = orangefs_listxattr,
+ .permission = orangefs_permission,
+};
diff --git a/fs/orangefs/orangefs-bufmap.c b/fs/orangefs/orangefs-bufmap.c
new file mode 100644
index 000000000000..1f8acc9f9a88
--- /dev/null
+++ b/fs/orangefs/orangefs-bufmap.c
@@ -0,0 +1,556 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+#include "protocol.h"
+#include "orangefs-kernel.h"
+#include "orangefs-bufmap.h"
+
+struct slot_map {
+ int c;
+ wait_queue_head_t q;
+ int count;
+ unsigned long *map;
+};
+
+static struct slot_map rw_map = {
+ .c = -1,
+ .q = __WAIT_QUEUE_HEAD_INITIALIZER(rw_map.q)
+};
+static struct slot_map readdir_map = {
+ .c = -1,
+ .q = __WAIT_QUEUE_HEAD_INITIALIZER(readdir_map.q)
+};
+
+
+static void install(struct slot_map *m, int count, unsigned long *map)
+{
+ spin_lock(&m->q.lock);
+ m->c = m->count = count;
+ m->map = map;
+ wake_up_all_locked(&m->q);
+ spin_unlock(&m->q.lock);
+}
+
+static void mark_killed(struct slot_map *m)
+{
+ spin_lock(&m->q.lock);
+ m->c -= m->count + 1;
+ spin_unlock(&m->q.lock);
+}
+
+static void run_down(struct slot_map *m)
+{
+ DEFINE_WAIT(wait);
+ spin_lock(&m->q.lock);
+ if (m->c != -1) {
+ for (;;) {
+ if (likely(list_empty(&wait.task_list)))
+ __add_wait_queue_tail(&m->q, &wait);
+ set_current_state(TASK_UNINTERRUPTIBLE);
+
+ if (m->c == -1)
+ break;
+
+ spin_unlock(&m->q.lock);
+ schedule();
+ spin_lock(&m->q.lock);
+ }
+ __remove_wait_queue(&m->q, &wait);
+ __set_current_state(TASK_RUNNING);
+ }
+ m->map = NULL;
+ spin_unlock(&m->q.lock);
+}
+
+static void put(struct slot_map *m, int slot)
+{
+ int v;
+ spin_lock(&m->q.lock);
+ __clear_bit(slot, m->map);
+ v = ++m->c;
+ if (unlikely(v == 1)) /* no free slots -> one free slot */
+ wake_up_locked(&m->q);
+ else if (unlikely(v == -1)) /* finished dying */
+ wake_up_all_locked(&m->q);
+ spin_unlock(&m->q.lock);
+}
+
+static int wait_for_free(struct slot_map *m)
+{
+ long left = slot_timeout_secs * HZ;
+ DEFINE_WAIT(wait);
+
+ do {
+ long n = left, t;
+ if (likely(list_empty(&wait.task_list)))
+ __add_wait_queue_tail_exclusive(&m->q, &wait);
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ if (m->c > 0)
+ break;
+
+ if (m->c < 0) {
+ /* we are waiting for map to be installed */
+ /* it would better be there soon, or we go away */
+ if (n > ORANGEFS_BUFMAP_WAIT_TIMEOUT_SECS * HZ)
+ n = ORANGEFS_BUFMAP_WAIT_TIMEOUT_SECS * HZ;
+ }
+ spin_unlock(&m->q.lock);
+ t = schedule_timeout(n);
+ spin_lock(&m->q.lock);
+ if (unlikely(!t) && n != left && m->c < 0)
+ left = t;
+ else
+ left = t + (left - n);
+ if (unlikely(signal_pending(current)))
+ left = -EINTR;
+ } while (left > 0);
+
+ if (!list_empty(&wait.task_list))
+ list_del(&wait.task_list);
+ else if (left <= 0 && waitqueue_active(&m->q))
+ __wake_up_locked_key(&m->q, TASK_INTERRUPTIBLE, NULL);
+ __set_current_state(TASK_RUNNING);
+
+ if (likely(left > 0))
+ return 0;
+
+ return left < 0 ? -EINTR : -ETIMEDOUT;
+}
+
+static int get(struct slot_map *m)
+{
+ int res = 0;
+ spin_lock(&m->q.lock);
+ if (unlikely(m->c <= 0))
+ res = wait_for_free(m);
+ if (likely(!res)) {
+ m->c--;
+ res = find_first_zero_bit(m->map, m->count);
+ __set_bit(res, m->map);
+ }
+ spin_unlock(&m->q.lock);
+ return res;
+}
+
+/* used to describe mapped buffers */
+struct orangefs_bufmap_desc {
+ void *uaddr; /* user space address pointer */
+ struct page **page_array; /* array of mapped pages */
+ int array_count; /* size of above arrays */
+ struct list_head list_link;
+};
+
+static struct orangefs_bufmap {
+ int desc_size;
+ int desc_shift;
+ int desc_count;
+ int total_size;
+ int page_count;
+
+ struct page **page_array;
+ struct orangefs_bufmap_desc *desc_array;
+
+ /* array to track usage of buffer descriptors */
+ unsigned long *buffer_index_array;
+
+ /* array to track usage of buffer descriptors for readdir */
+#define N DIV_ROUND_UP(ORANGEFS_READDIR_DEFAULT_DESC_COUNT, BITS_PER_LONG)
+ unsigned long readdir_index_array[N];
+#undef N
+} *__orangefs_bufmap;
+
+static DEFINE_SPINLOCK(orangefs_bufmap_lock);
+
+static void
+orangefs_bufmap_unmap(struct orangefs_bufmap *bufmap)
+{
+ int i;
+
+ for (i = 0; i < bufmap->page_count; i++)
+ page_cache_release(bufmap->page_array[i]);
+}
+
+static void
+orangefs_bufmap_free(struct orangefs_bufmap *bufmap)
+{
+ kfree(bufmap->page_array);
+ kfree(bufmap->desc_array);
+ kfree(bufmap->buffer_index_array);
+ kfree(bufmap);
+}
+
+/*
+ * XXX: Can the size and shift change while the caller gives up the
+ * XXX: lock between calling this and doing something useful?
+ */
+
+int orangefs_bufmap_size_query(void)
+{
+ struct orangefs_bufmap *bufmap;
+ int size = 0;
+ spin_lock(&orangefs_bufmap_lock);
+ bufmap = __orangefs_bufmap;
+ if (bufmap)
+ size = bufmap->desc_size;
+ spin_unlock(&orangefs_bufmap_lock);
+ return size;
+}
+
+int orangefs_bufmap_shift_query(void)
+{
+ struct orangefs_bufmap *bufmap;
+ int shift = 0;
+ spin_lock(&orangefs_bufmap_lock);
+ bufmap = __orangefs_bufmap;
+ if (bufmap)
+ shift = bufmap->desc_shift;
+ spin_unlock(&orangefs_bufmap_lock);
+ return shift;
+}
+
+static DECLARE_WAIT_QUEUE_HEAD(bufmap_waitq);
+static DECLARE_WAIT_QUEUE_HEAD(readdir_waitq);
+
+/*
+ * orangefs_get_bufmap_init
+ *
+ * If bufmap_init is 1, then the shared memory system, including the
+ * buffer_index_array, is available. Otherwise, it is not.
+ *
+ * returns the value of bufmap_init
+ */
+int orangefs_get_bufmap_init(void)
+{
+ return __orangefs_bufmap ? 1 : 0;
+}
+
+
+static struct orangefs_bufmap *
+orangefs_bufmap_alloc(struct ORANGEFS_dev_map_desc *user_desc)
+{
+ struct orangefs_bufmap *bufmap;
+
+ bufmap = kzalloc(sizeof(*bufmap), GFP_KERNEL);
+ if (!bufmap)
+ goto out;
+
+ bufmap->total_size = user_desc->total_size;
+ bufmap->desc_count = user_desc->count;
+ bufmap->desc_size = user_desc->size;
+ bufmap->desc_shift = ilog2(bufmap->desc_size);
+
+ bufmap->buffer_index_array =
+ kzalloc(DIV_ROUND_UP(bufmap->desc_count, BITS_PER_LONG), GFP_KERNEL);
+ if (!bufmap->buffer_index_array) {
+ gossip_err("orangefs: could not allocate %d buffer indices\n",
+ bufmap->desc_count);
+ goto out_free_bufmap;
+ }
+
+ bufmap->desc_array =
+ kcalloc(bufmap->desc_count, sizeof(struct orangefs_bufmap_desc),
+ GFP_KERNEL);
+ if (!bufmap->desc_array) {
+ gossip_err("orangefs: could not allocate %d descriptors\n",
+ bufmap->desc_count);
+ goto out_free_index_array;
+ }
+
+ bufmap->page_count = bufmap->total_size / PAGE_SIZE;
+
+ /* allocate storage to track our page mappings */
+ bufmap->page_array =
+ kcalloc(bufmap->page_count, sizeof(struct page *), GFP_KERNEL);
+ if (!bufmap->page_array)
+ goto out_free_desc_array;
+
+ return bufmap;
+
+out_free_desc_array:
+ kfree(bufmap->desc_array);
+out_free_index_array:
+ kfree(bufmap->buffer_index_array);
+out_free_bufmap:
+ kfree(bufmap);
+out:
+ return NULL;
+}
+
+static int
+orangefs_bufmap_map(struct orangefs_bufmap *bufmap,
+ struct ORANGEFS_dev_map_desc *user_desc)
+{
+ int pages_per_desc = bufmap->desc_size / PAGE_SIZE;
+ int offset = 0, ret, i;
+
+ /* map the pages */
+ ret = get_user_pages_fast((unsigned long)user_desc->ptr,
+ bufmap->page_count, 1, bufmap->page_array);
+
+ if (ret < 0)
+ return ret;
+
+ if (ret != bufmap->page_count) {
+ gossip_err("orangefs error: asked for %d pages, only got %d.\n",
+ bufmap->page_count, ret);
+
+ for (i = 0; i < ret; i++) {
+ SetPageError(bufmap->page_array[i]);
+ page_cache_release(bufmap->page_array[i]);
+ }
+ return -ENOMEM;
+ }
+
+ /*
+ * ideally we want to get kernel space pointers for each page, but
+ * we can't kmap that many pages at once if highmem is being used.
+ * so instead, we just kmap/kunmap the page address each time the
+ * kaddr is needed.
+ */
+ for (i = 0; i < bufmap->page_count; i++)
+ flush_dcache_page(bufmap->page_array[i]);
+
+ /* build a list of available descriptors */
+ for (offset = 0, i = 0; i < bufmap->desc_count; i++) {
+ bufmap->desc_array[i].page_array = &bufmap->page_array[offset];
+ bufmap->desc_array[i].array_count = pages_per_desc;
+ bufmap->desc_array[i].uaddr =
+ (user_desc->ptr + (i * pages_per_desc * PAGE_SIZE));
+ offset += pages_per_desc;
+ }
+
+ return 0;
+}
+
+/*
+ * orangefs_bufmap_initialize()
+ *
+ * initializes the mapped buffer interface
+ *
+ * returns 0 on success, -errno on failure
+ */
+int orangefs_bufmap_initialize(struct ORANGEFS_dev_map_desc *user_desc)
+{
+ struct orangefs_bufmap *bufmap;
+ int ret = -EINVAL;
+
+ gossip_debug(GOSSIP_BUFMAP_DEBUG,
+ "orangefs_bufmap_initialize: called (ptr ("
+ "%p) sz (%d) cnt(%d).\n",
+ user_desc->ptr,
+ user_desc->size,
+ user_desc->count);
+
+ /*
+ * sanity check alignment and size of buffer that caller wants to
+ * work with
+ */
+ if (PAGE_ALIGN((unsigned long)user_desc->ptr) !=
+ (unsigned long)user_desc->ptr) {
+ gossip_err("orangefs error: memory alignment (front). %p\n",
+ user_desc->ptr);
+ goto out;
+ }
+
+ if (PAGE_ALIGN(((unsigned long)user_desc->ptr + user_desc->total_size))
+ != (unsigned long)(user_desc->ptr + user_desc->total_size)) {
+ gossip_err("orangefs error: memory alignment (back).(%p + %d)\n",
+ user_desc->ptr,
+ user_desc->total_size);
+ goto out;
+ }
+
+ if (user_desc->total_size != (user_desc->size * user_desc->count)) {
+ gossip_err("orangefs error: user provided an oddly sized buffer: (%d, %d, %d)\n",
+ user_desc->total_size,
+ user_desc->size,
+ user_desc->count);
+ goto out;
+ }
+
+ if ((user_desc->size % PAGE_SIZE) != 0) {
+ gossip_err("orangefs error: bufmap size not page size divisible (%d).\n",
+ user_desc->size);
+ goto out;
+ }
+
+ ret = -ENOMEM;
+ bufmap = orangefs_bufmap_alloc(user_desc);
+ if (!bufmap)
+ goto out;
+
+ ret = orangefs_bufmap_map(bufmap, user_desc);
+ if (ret)
+ goto out_free_bufmap;
+
+
+ spin_lock(&orangefs_bufmap_lock);
+ if (__orangefs_bufmap) {
+ spin_unlock(&orangefs_bufmap_lock);
+ gossip_err("orangefs: error: bufmap already initialized.\n");
+ ret = -EINVAL;
+ goto out_unmap_bufmap;
+ }
+ __orangefs_bufmap = bufmap;
+ install(&rw_map,
+ bufmap->desc_count,
+ bufmap->buffer_index_array);
+ install(&readdir_map,
+ ORANGEFS_READDIR_DEFAULT_DESC_COUNT,
+ bufmap->readdir_index_array);
+ spin_unlock(&orangefs_bufmap_lock);
+
+ gossip_debug(GOSSIP_BUFMAP_DEBUG,
+ "orangefs_bufmap_initialize: exiting normally\n");
+ return 0;
+
+out_unmap_bufmap:
+ orangefs_bufmap_unmap(bufmap);
+out_free_bufmap:
+ orangefs_bufmap_free(bufmap);
+out:
+ return ret;
+}
+
+/*
+ * orangefs_bufmap_finalize()
+ *
+ * shuts down the mapped buffer interface and releases any resources
+ * associated with it
+ *
+ * no return value
+ */
+void orangefs_bufmap_finalize(void)
+{
+ struct orangefs_bufmap *bufmap = __orangefs_bufmap;
+ if (!bufmap)
+ return;
+ gossip_debug(GOSSIP_BUFMAP_DEBUG, "orangefs_bufmap_finalize: called\n");
+ mark_killed(&rw_map);
+ mark_killed(&readdir_map);
+ gossip_debug(GOSSIP_BUFMAP_DEBUG,
+ "orangefs_bufmap_finalize: exiting normally\n");
+}
+
+void orangefs_bufmap_run_down(void)
+{
+ struct orangefs_bufmap *bufmap = __orangefs_bufmap;
+ if (!bufmap)
+ return;
+ run_down(&rw_map);
+ run_down(&readdir_map);
+ spin_lock(&orangefs_bufmap_lock);
+ __orangefs_bufmap = NULL;
+ spin_unlock(&orangefs_bufmap_lock);
+ orangefs_bufmap_unmap(bufmap);
+ orangefs_bufmap_free(bufmap);
+}
+
+/*
+ * orangefs_bufmap_get()
+ *
+ * gets a free mapped buffer descriptor, will sleep until one becomes
+ * available if necessary
+ *
+ * returns slot on success, -errno on failure
+ */
+int orangefs_bufmap_get(void)
+{
+ return get(&rw_map);
+}
+
+/*
+ * orangefs_bufmap_put()
+ *
+ * returns a mapped buffer descriptor to the collection
+ *
+ * no return value
+ */
+void orangefs_bufmap_put(int buffer_index)
+{
+ put(&rw_map, buffer_index);
+}
+
+/*
+ * orangefs_readdir_index_get()
+ *
+ * gets a free descriptor, will sleep until one becomes
+ * available if necessary.
+ * Although the readdir buffers are not mapped into kernel space
+ * we could do that at a later point of time. Regardless, these
+ * indices are used by the client-core.
+ *
+ * returns slot on success, -errno on failure
+ */
+int orangefs_readdir_index_get(void)
+{
+ return get(&readdir_map);
+}
+
+void orangefs_readdir_index_put(int buffer_index)
+{
+ put(&readdir_map, buffer_index);
+}
+
+/*
+ * we've been handed an iovec, we need to copy it to
+ * the shared memory descriptor at "buffer_index".
+ */
+int orangefs_bufmap_copy_from_iovec(struct iov_iter *iter,
+ int buffer_index,
+ size_t size)
+{
+ struct orangefs_bufmap_desc *to;
+ int i;
+
+ gossip_debug(GOSSIP_BUFMAP_DEBUG,
+ "%s: buffer_index:%d: size:%zu:\n",
+ __func__, buffer_index, size);
+
+ to = &__orangefs_bufmap->desc_array[buffer_index];
+ for (i = 0; size; i++) {
+ struct page *page = to->page_array[i];
+ size_t n = size;
+ if (n > PAGE_SIZE)
+ n = PAGE_SIZE;
+ n = copy_page_from_iter(page, 0, n, iter);
+ if (!n)
+ return -EFAULT;
+ size -= n;
+ }
+ return 0;
+
+}
+
+/*
+ * we've been handed an iovec, we need to fill it from
+ * the shared memory descriptor at "buffer_index".
+ */
+int orangefs_bufmap_copy_to_iovec(struct iov_iter *iter,
+ int buffer_index,
+ size_t size)
+{
+ struct orangefs_bufmap_desc *from;
+ int i;
+
+ from = &__orangefs_bufmap->desc_array[buffer_index];
+ gossip_debug(GOSSIP_BUFMAP_DEBUG,
+ "%s: buffer_index:%d: size:%zu:\n",
+ __func__, buffer_index, size);
+
+
+ for (i = 0; size; i++) {
+ struct page *page = from->page_array[i];
+ size_t n = size;
+ if (n > PAGE_SIZE)
+ n = PAGE_SIZE;
+ n = copy_page_to_iter(page, 0, n, iter);
+ if (!n)
+ return -EFAULT;
+ size -= n;
+ }
+ return 0;
+}
diff --git a/fs/orangefs/orangefs-bufmap.h b/fs/orangefs/orangefs-bufmap.h
new file mode 100644
index 000000000000..71f64f4057b5
--- /dev/null
+++ b/fs/orangefs/orangefs-bufmap.h
@@ -0,0 +1,36 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+
+#ifndef __ORANGEFS_BUFMAP_H
+#define __ORANGEFS_BUFMAP_H
+
+int orangefs_bufmap_size_query(void);
+
+int orangefs_bufmap_shift_query(void);
+
+int orangefs_bufmap_initialize(struct ORANGEFS_dev_map_desc *user_desc);
+
+void orangefs_bufmap_finalize(void);
+
+void orangefs_bufmap_run_down(void);
+
+int orangefs_bufmap_get(void);
+
+void orangefs_bufmap_put(int buffer_index);
+
+int orangefs_readdir_index_get(void);
+
+void orangefs_readdir_index_put(int buffer_index);
+
+int orangefs_bufmap_copy_from_iovec(struct iov_iter *iter,
+ int buffer_index,
+ size_t size);
+
+int orangefs_bufmap_copy_to_iovec(struct iov_iter *iter,
+ int buffer_index,
+ size_t size);
+
+#endif /* __ORANGEFS_BUFMAP_H */
diff --git a/fs/orangefs/orangefs-cache.c b/fs/orangefs/orangefs-cache.c
new file mode 100644
index 000000000000..900a2e38e11b
--- /dev/null
+++ b/fs/orangefs/orangefs-cache.c
@@ -0,0 +1,161 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+
+#include "protocol.h"
+#include "orangefs-kernel.h"
+
+/* tags assigned to kernel upcall operations */
+static __u64 next_tag_value;
+static DEFINE_SPINLOCK(next_tag_value_lock);
+
+/* the orangefs memory caches */
+
+/* a cache for orangefs upcall/downcall operations */
+static struct kmem_cache *op_cache;
+
+int op_cache_initialize(void)
+{
+ op_cache = kmem_cache_create("orangefs_op_cache",
+ sizeof(struct orangefs_kernel_op_s),
+ 0,
+ ORANGEFS_CACHE_CREATE_FLAGS,
+ NULL);
+
+ if (!op_cache) {
+ gossip_err("Cannot create orangefs_op_cache\n");
+ return -ENOMEM;
+ }
+
+ /* initialize our atomic tag counter */
+ spin_lock(&next_tag_value_lock);
+ next_tag_value = 100;
+ spin_unlock(&next_tag_value_lock);
+ return 0;
+}
+
+int op_cache_finalize(void)
+{
+ kmem_cache_destroy(op_cache);
+ return 0;
+}
+
+char *get_opname_string(struct orangefs_kernel_op_s *new_op)
+{
+ if (new_op) {
+ __s32 type = new_op->upcall.type;
+
+ if (type == ORANGEFS_VFS_OP_FILE_IO)
+ return "OP_FILE_IO";
+ else if (type == ORANGEFS_VFS_OP_LOOKUP)
+ return "OP_LOOKUP";
+ else if (type == ORANGEFS_VFS_OP_CREATE)
+ return "OP_CREATE";
+ else if (type == ORANGEFS_VFS_OP_GETATTR)
+ return "OP_GETATTR";
+ else if (type == ORANGEFS_VFS_OP_REMOVE)
+ return "OP_REMOVE";
+ else if (type == ORANGEFS_VFS_OP_MKDIR)
+ return "OP_MKDIR";
+ else if (type == ORANGEFS_VFS_OP_READDIR)
+ return "OP_READDIR";
+ else if (type == ORANGEFS_VFS_OP_READDIRPLUS)
+ return "OP_READDIRPLUS";
+ else if (type == ORANGEFS_VFS_OP_SETATTR)
+ return "OP_SETATTR";
+ else if (type == ORANGEFS_VFS_OP_SYMLINK)
+ return "OP_SYMLINK";
+ else if (type == ORANGEFS_VFS_OP_RENAME)
+ return "OP_RENAME";
+ else if (type == ORANGEFS_VFS_OP_STATFS)
+ return "OP_STATFS";
+ else if (type == ORANGEFS_VFS_OP_TRUNCATE)
+ return "OP_TRUNCATE";
+ else if (type == ORANGEFS_VFS_OP_MMAP_RA_FLUSH)
+ return "OP_MMAP_RA_FLUSH";
+ else if (type == ORANGEFS_VFS_OP_FS_MOUNT)
+ return "OP_FS_MOUNT";
+ else if (type == ORANGEFS_VFS_OP_FS_UMOUNT)
+ return "OP_FS_UMOUNT";
+ else if (type == ORANGEFS_VFS_OP_GETXATTR)
+ return "OP_GETXATTR";
+ else if (type == ORANGEFS_VFS_OP_SETXATTR)
+ return "OP_SETXATTR";
+ else if (type == ORANGEFS_VFS_OP_LISTXATTR)
+ return "OP_LISTXATTR";
+ else if (type == ORANGEFS_VFS_OP_REMOVEXATTR)
+ return "OP_REMOVEXATTR";
+ else if (type == ORANGEFS_VFS_OP_PARAM)
+ return "OP_PARAM";
+ else if (type == ORANGEFS_VFS_OP_PERF_COUNT)
+ return "OP_PERF_COUNT";
+ else if (type == ORANGEFS_VFS_OP_CANCEL)
+ return "OP_CANCEL";
+ else if (type == ORANGEFS_VFS_OP_FSYNC)
+ return "OP_FSYNC";
+ else if (type == ORANGEFS_VFS_OP_FSKEY)
+ return "OP_FSKEY";
+ }
+ return "OP_UNKNOWN?";
+}
+
+void orangefs_new_tag(struct orangefs_kernel_op_s *op)
+{
+ spin_lock(&next_tag_value_lock);
+ op->tag = next_tag_value++;
+ if (next_tag_value == 0)
+ next_tag_value = 100;
+ spin_unlock(&next_tag_value_lock);
+}
+
+struct orangefs_kernel_op_s *op_alloc(__s32 type)
+{
+ struct orangefs_kernel_op_s *new_op = NULL;
+
+ new_op = kmem_cache_zalloc(op_cache, GFP_KERNEL);
+ if (new_op) {
+ INIT_LIST_HEAD(&new_op->list);
+ spin_lock_init(&new_op->lock);
+ init_completion(&new_op->waitq);
+
+ new_op->upcall.type = ORANGEFS_VFS_OP_INVALID;
+ new_op->downcall.type = ORANGEFS_VFS_OP_INVALID;
+ new_op->downcall.status = -1;
+
+ new_op->op_state = OP_VFS_STATE_UNKNOWN;
+
+ /* initialize the op specific tag and upcall credentials */
+ orangefs_new_tag(new_op);
+ new_op->upcall.type = type;
+ new_op->attempts = 0;
+ gossip_debug(GOSSIP_CACHE_DEBUG,
+ "Alloced OP (%p: %llu %s)\n",
+ new_op,
+ llu(new_op->tag),
+ get_opname_string(new_op));
+
+ new_op->upcall.uid = from_kuid(current_user_ns(),
+ current_fsuid());
+
+ new_op->upcall.gid = from_kgid(current_user_ns(),
+ current_fsgid());
+ } else {
+ gossip_err("op_alloc: kmem_cache_zalloc failed!\n");
+ }
+ return new_op;
+}
+
+void op_release(struct orangefs_kernel_op_s *orangefs_op)
+{
+ if (orangefs_op) {
+ gossip_debug(GOSSIP_CACHE_DEBUG,
+ "Releasing OP (%p: %llu)\n",
+ orangefs_op,
+ llu(orangefs_op->tag));
+ kmem_cache_free(op_cache, orangefs_op);
+ } else {
+ gossip_err("NULL pointer in op_release\n");
+ }
+}
diff --git a/fs/orangefs/orangefs-debug.h b/fs/orangefs/orangefs-debug.h
new file mode 100644
index 000000000000..387db17cde2b
--- /dev/null
+++ b/fs/orangefs/orangefs-debug.h
@@ -0,0 +1,92 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+
+/* This file just defines debugging masks to be used with the gossip
+ * logging utility. All debugging masks for ORANGEFS are kept here to make
+ * sure we don't have collisions.
+ */
+
+#ifndef __ORANGEFS_DEBUG_H
+#define __ORANGEFS_DEBUG_H
+
+#ifdef __KERNEL__
+#include <linux/types.h>
+#else
+#include <stdint.h>
+#endif
+
+#define GOSSIP_NO_DEBUG (__u64)0
+
+#define GOSSIP_SUPER_DEBUG ((__u64)1 << 0)
+#define GOSSIP_INODE_DEBUG ((__u64)1 << 1)
+#define GOSSIP_FILE_DEBUG ((__u64)1 << 2)
+#define GOSSIP_DIR_DEBUG ((__u64)1 << 3)
+#define GOSSIP_UTILS_DEBUG ((__u64)1 << 4)
+#define GOSSIP_WAIT_DEBUG ((__u64)1 << 5)
+#define GOSSIP_ACL_DEBUG ((__u64)1 << 6)
+#define GOSSIP_DCACHE_DEBUG ((__u64)1 << 7)
+#define GOSSIP_DEV_DEBUG ((__u64)1 << 8)
+#define GOSSIP_NAME_DEBUG ((__u64)1 << 9)
+#define GOSSIP_BUFMAP_DEBUG ((__u64)1 << 10)
+#define GOSSIP_CACHE_DEBUG ((__u64)1 << 11)
+#define GOSSIP_DEBUGFS_DEBUG ((__u64)1 << 12)
+#define GOSSIP_XATTR_DEBUG ((__u64)1 << 13)
+#define GOSSIP_INIT_DEBUG ((__u64)1 << 14)
+#define GOSSIP_SYSFS_DEBUG ((__u64)1 << 15)
+
+#define GOSSIP_MAX_NR 16
+#define GOSSIP_MAX_DEBUG (((__u64)1 << GOSSIP_MAX_NR) - 1)
+
+/*function prototypes*/
+__u64 ORANGEFS_kmod_eventlog_to_mask(const char *event_logging);
+__u64 ORANGEFS_debug_eventlog_to_mask(const char *event_logging);
+char *ORANGEFS_debug_mask_to_eventlog(__u64 mask);
+char *ORANGEFS_kmod_mask_to_eventlog(__u64 mask);
+
+/* a private internal type */
+struct __keyword_mask_s {
+ const char *keyword;
+ __u64 mask_val;
+};
+
+/*
+ * Map all kmod keywords to kmod debug masks here. Keep this
+ * structure "packed":
+ *
+ * "all" is always last...
+ *
+ * keyword mask_val index
+ * foo 1 0
+ * bar 2 1
+ * baz 4 2
+ * qux 8 3
+ * . . .
+ */
+static struct __keyword_mask_s s_kmod_keyword_mask_map[] = {
+ {"super", GOSSIP_SUPER_DEBUG},
+ {"inode", GOSSIP_INODE_DEBUG},
+ {"file", GOSSIP_FILE_DEBUG},
+ {"dir", GOSSIP_DIR_DEBUG},
+ {"utils", GOSSIP_UTILS_DEBUG},
+ {"wait", GOSSIP_WAIT_DEBUG},
+ {"acl", GOSSIP_ACL_DEBUG},
+ {"dcache", GOSSIP_DCACHE_DEBUG},
+ {"dev", GOSSIP_DEV_DEBUG},
+ {"name", GOSSIP_NAME_DEBUG},
+ {"bufmap", GOSSIP_BUFMAP_DEBUG},
+ {"cache", GOSSIP_CACHE_DEBUG},
+ {"debugfs", GOSSIP_DEBUGFS_DEBUG},
+ {"xattr", GOSSIP_XATTR_DEBUG},
+ {"init", GOSSIP_INIT_DEBUG},
+ {"sysfs", GOSSIP_SYSFS_DEBUG},
+ {"none", GOSSIP_NO_DEBUG},
+ {"all", GOSSIP_MAX_DEBUG}
+};
+
+static const int num_kmod_keyword_mask_map = (int)
+ (sizeof(s_kmod_keyword_mask_map) / sizeof(struct __keyword_mask_s));
+
+#endif /* __ORANGEFS_DEBUG_H */
diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c
new file mode 100644
index 000000000000..19670b8b4053
--- /dev/null
+++ b/fs/orangefs/orangefs-debugfs.c
@@ -0,0 +1,455 @@
+/*
+ * What: /sys/kernel/debug/orangefs/debug-help
+ * Date: June 2015
+ * Contact: Mike Marshall <hubcap@omnibond.com>
+ * Description:
+ * List of client and kernel debug keywords.
+ *
+ *
+ * What: /sys/kernel/debug/orangefs/client-debug
+ * Date: June 2015
+ * Contact: Mike Marshall <hubcap@omnibond.com>
+ * Description:
+ * Debug setting for "the client", the userspace
+ * helper for the kernel module.
+ *
+ *
+ * What: /sys/kernel/debug/orangefs/kernel-debug
+ * Date: June 2015
+ * Contact: Mike Marshall <hubcap@omnibond.com>
+ * Description:
+ * Debug setting for the orangefs kernel module.
+ *
+ * Any of the keywords, or comma-separated lists
+ * of keywords, from debug-help can be catted to
+ * client-debug or kernel-debug.
+ *
+ * "none", "all" and "verbose" are special keywords
+ * for client-debug. Setting client-debug to "all"
+ * is kind of like trying to drink water from a
+ * fire hose, "verbose" triggers most of the same
+ * output except for the constant flow of output
+ * from the main wait loop.
+ *
+ * "none" and "all" are similar settings for kernel-debug
+ * no need for a "verbose".
+ */
+#include <linux/debugfs.h>
+#include <linux/slab.h>
+
+#include <linux/uaccess.h>
+
+#include "orangefs-debugfs.h"
+#include "protocol.h"
+#include "orangefs-kernel.h"
+
+static int orangefs_debug_disabled = 1;
+
+static int orangefs_debug_help_open(struct inode *, struct file *);
+
+const struct file_operations debug_help_fops = {
+ .open = orangefs_debug_help_open,
+ .read = seq_read,
+ .release = seq_release,
+ .llseek = seq_lseek,
+};
+
+static void *help_start(struct seq_file *, loff_t *);
+static void *help_next(struct seq_file *, void *, loff_t *);
+static void help_stop(struct seq_file *, void *);
+static int help_show(struct seq_file *, void *);
+
+static const struct seq_operations help_debug_ops = {
+ .start = help_start,
+ .next = help_next,
+ .stop = help_stop,
+ .show = help_show,
+};
+
+/*
+ * Used to protect data in ORANGEFS_KMOD_DEBUG_FILE and
+ * ORANGEFS_KMOD_DEBUG_FILE.
+ */
+static DEFINE_MUTEX(orangefs_debug_lock);
+
+int orangefs_debug_open(struct inode *, struct file *);
+
+static ssize_t orangefs_debug_read(struct file *,
+ char __user *,
+ size_t,
+ loff_t *);
+
+static ssize_t orangefs_debug_write(struct file *,
+ const char __user *,
+ size_t,
+ loff_t *);
+
+static const struct file_operations kernel_debug_fops = {
+ .open = orangefs_debug_open,
+ .read = orangefs_debug_read,
+ .write = orangefs_debug_write,
+ .llseek = generic_file_llseek,
+};
+
+/*
+ * initialize kmod debug operations, create orangefs debugfs dir and
+ * ORANGEFS_KMOD_DEBUG_HELP_FILE.
+ */
+int orangefs_debugfs_init(void)
+{
+
+ int rc = -ENOMEM;
+
+ debug_dir = debugfs_create_dir("orangefs", NULL);
+ if (!debug_dir) {
+ pr_info("%s: debugfs_create_dir failed.\n", __func__);
+ goto out;
+ }
+
+ help_file_dentry = debugfs_create_file(ORANGEFS_KMOD_DEBUG_HELP_FILE,
+ 0444,
+ debug_dir,
+ debug_help_string,
+ &debug_help_fops);
+ if (!help_file_dentry) {
+ pr_info("%s: debugfs_create_file failed.\n", __func__);
+ goto out;
+ }
+
+ orangefs_debug_disabled = 0;
+ rc = 0;
+
+out:
+
+ return rc;
+}
+
+void orangefs_debugfs_cleanup(void)
+{
+ if (debug_dir)
+ debugfs_remove_recursive(debug_dir);
+}
+
+/* open ORANGEFS_KMOD_DEBUG_HELP_FILE */
+static int orangefs_debug_help_open(struct inode *inode, struct file *file)
+{
+ int rc = -ENODEV;
+ int ret;
+
+ gossip_debug(GOSSIP_DEBUGFS_DEBUG,
+ "orangefs_debug_help_open: start\n");
+
+ if (orangefs_debug_disabled)
+ goto out;
+
+ ret = seq_open(file, &help_debug_ops);
+ if (ret)
+ goto out;
+
+ ((struct seq_file *)(file->private_data))->private = inode->i_private;
+
+ rc = 0;
+
+out:
+ gossip_debug(GOSSIP_DEBUGFS_DEBUG,
+ "orangefs_debug_help_open: rc:%d:\n",
+ rc);
+ return rc;
+}
+
+/*
+ * I think start always gets called again after stop. Start
+ * needs to return NULL when it is done. The whole "payload"
+ * in this case is a single (long) string, so by the second
+ * time we get to start (pos = 1), we're done.
+ */
+static void *help_start(struct seq_file *m, loff_t *pos)
+{
+ void *payload = NULL;
+
+ gossip_debug(GOSSIP_DEBUGFS_DEBUG, "help_start: start\n");
+
+ if (*pos == 0)
+ payload = m->private;
+
+ return payload;
+}
+
+static void *help_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ gossip_debug(GOSSIP_DEBUGFS_DEBUG, "help_next: start\n");
+
+ return NULL;
+}
+
+static void help_stop(struct seq_file *m, void *p)
+{
+ gossip_debug(GOSSIP_DEBUGFS_DEBUG, "help_stop: start\n");
+}
+
+static int help_show(struct seq_file *m, void *v)
+{
+ gossip_debug(GOSSIP_DEBUGFS_DEBUG, "help_show: start\n");
+
+ seq_puts(m, v);
+
+ return 0;
+}
+
+/*
+ * initialize the kernel-debug file.
+ */
+int orangefs_kernel_debug_init(void)
+{
+ int rc = -ENOMEM;
+ struct dentry *ret;
+ char *k_buffer = NULL;
+
+ gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: start\n", __func__);
+
+ k_buffer = kzalloc(ORANGEFS_MAX_DEBUG_STRING_LEN, GFP_KERNEL);
+ if (!k_buffer)
+ goto out;
+
+ if (strlen(kernel_debug_string) + 1 < ORANGEFS_MAX_DEBUG_STRING_LEN) {
+ strcpy(k_buffer, kernel_debug_string);
+ strcat(k_buffer, "\n");
+ } else {
+ strcpy(k_buffer, "none\n");
+ pr_info("%s: overflow 1!\n", __func__);
+ }
+
+ ret = debugfs_create_file(ORANGEFS_KMOD_DEBUG_FILE,
+ 0444,
+ debug_dir,
+ k_buffer,
+ &kernel_debug_fops);
+ if (!ret) {
+ pr_info("%s: failed to create %s.\n",
+ __func__,
+ ORANGEFS_KMOD_DEBUG_FILE);
+ goto out;
+ }
+
+ rc = 0;
+
+out:
+
+ gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: rc:%d:\n", __func__, rc);
+ return rc;
+}
+
+/*
+ * initialize the client-debug file.
+ */
+int orangefs_client_debug_init(void)
+{
+
+ int rc = -ENOMEM;
+ char *c_buffer = NULL;
+
+ gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: start\n", __func__);
+
+ c_buffer = kzalloc(ORANGEFS_MAX_DEBUG_STRING_LEN, GFP_KERNEL);
+ if (!c_buffer)
+ goto out;
+
+ if (strlen(client_debug_string) + 1 < ORANGEFS_MAX_DEBUG_STRING_LEN) {
+ strcpy(c_buffer, client_debug_string);
+ strcat(c_buffer, "\n");
+ } else {
+ strcpy(c_buffer, "none\n");
+ pr_info("%s: overflow! 2\n", __func__);
+ }
+
+ client_debug_dentry = debugfs_create_file(ORANGEFS_CLIENT_DEBUG_FILE,
+ 0444,
+ debug_dir,
+ c_buffer,
+ &kernel_debug_fops);
+ if (!client_debug_dentry) {
+ pr_info("%s: failed to create updated %s.\n",
+ __func__,
+ ORANGEFS_CLIENT_DEBUG_FILE);
+ goto out;
+ }
+
+ rc = 0;
+
+out:
+
+ gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: rc:%d:\n", __func__, rc);
+ return rc;
+}
+
+/* open ORANGEFS_KMOD_DEBUG_FILE or ORANGEFS_CLIENT_DEBUG_FILE.*/
+int orangefs_debug_open(struct inode *inode, struct file *file)
+{
+ int rc = -ENODEV;
+
+ gossip_debug(GOSSIP_DEBUGFS_DEBUG,
+ "%s: orangefs_debug_disabled: %d\n",
+ __func__,
+ orangefs_debug_disabled);
+
+ if (orangefs_debug_disabled)
+ goto out;
+
+ rc = 0;
+ mutex_lock(&orangefs_debug_lock);
+ file->private_data = inode->i_private;
+ mutex_unlock(&orangefs_debug_lock);
+
+out:
+ gossip_debug(GOSSIP_DEBUGFS_DEBUG,
+ "orangefs_debug_open: rc: %d\n",
+ rc);
+ return rc;
+}
+
+static ssize_t orangefs_debug_read(struct file *file,
+ char __user *ubuf,
+ size_t count,
+ loff_t *ppos)
+{
+ char *buf;
+ int sprintf_ret;
+ ssize_t read_ret = -ENOMEM;
+
+ gossip_debug(GOSSIP_DEBUGFS_DEBUG, "orangefs_debug_read: start\n");
+
+ buf = kmalloc(ORANGEFS_MAX_DEBUG_STRING_LEN, GFP_KERNEL);
+ if (!buf)
+ goto out;
+
+ mutex_lock(&orangefs_debug_lock);
+ sprintf_ret = sprintf(buf, "%s", (char *)file->private_data);
+ mutex_unlock(&orangefs_debug_lock);
+
+ read_ret = simple_read_from_buffer(ubuf, count, ppos, buf, sprintf_ret);
+
+ kfree(buf);
+
+out:
+ gossip_debug(GOSSIP_DEBUGFS_DEBUG,
+ "orangefs_debug_read: ret: %zu\n",
+ read_ret);
+
+ return read_ret;
+}
+
+static ssize_t orangefs_debug_write(struct file *file,
+ const char __user *ubuf,
+ size_t count,
+ loff_t *ppos)
+{
+ char *buf;
+ int rc = -EFAULT;
+ size_t silly = 0;
+ char *debug_string;
+ struct orangefs_kernel_op_s *new_op = NULL;
+ struct client_debug_mask c_mask = { NULL, 0, 0 };
+
+ gossip_debug(GOSSIP_DEBUGFS_DEBUG,
+ "orangefs_debug_write: %s\n",
+ file->f_path.dentry->d_name.name);
+
+ /*
+ * Thwart users who try to jamb a ridiculous number
+ * of bytes into the debug file...
+ */
+ if (count > ORANGEFS_MAX_DEBUG_STRING_LEN + 1) {
+ silly = count;
+ count = ORANGEFS_MAX_DEBUG_STRING_LEN + 1;
+ }
+
+ buf = kzalloc(ORANGEFS_MAX_DEBUG_STRING_LEN, GFP_KERNEL);
+ if (!buf)
+ goto out;
+
+ if (copy_from_user(buf, ubuf, count - 1)) {
+ gossip_debug(GOSSIP_DEBUGFS_DEBUG,
+ "%s: copy_from_user failed!\n",
+ __func__);
+ goto out;
+ }
+
+ /*
+ * Map the keyword string from userspace into a valid debug mask.
+ * The mapping process involves mapping the human-inputted string
+ * into a valid mask, and then rebuilding the string from the
+ * verified valid mask.
+ *
+ * A service operation is required to set a new client-side
+ * debug mask.
+ */
+ if (!strcmp(file->f_path.dentry->d_name.name,
+ ORANGEFS_KMOD_DEBUG_FILE)) {
+ debug_string_to_mask(buf, &gossip_debug_mask, 0);
+ debug_mask_to_string(&gossip_debug_mask, 0);
+ debug_string = kernel_debug_string;
+ gossip_debug(GOSSIP_DEBUGFS_DEBUG,
+ "New kernel debug string is %s\n",
+ kernel_debug_string);
+ } else {
+ /* Can't reset client debug mask if client is not running. */
+ if (is_daemon_in_service()) {
+ pr_info("%s: Client not running :%d:\n",
+ __func__,
+ is_daemon_in_service());
+ goto out;
+ }
+
+ debug_string_to_mask(buf, &c_mask, 1);
+ debug_mask_to_string(&c_mask, 1);
+ debug_string = client_debug_string;
+
+ new_op = op_alloc(ORANGEFS_VFS_OP_PARAM);
+ if (!new_op) {
+ pr_info("%s: op_alloc failed!\n", __func__);
+ goto out;
+ }
+
+ new_op->upcall.req.param.op =
+ ORANGEFS_PARAM_REQUEST_OP_TWO_MASK_VALUES;
+ new_op->upcall.req.param.type = ORANGEFS_PARAM_REQUEST_SET;
+ memset(new_op->upcall.req.param.s_value,
+ 0,
+ ORANGEFS_MAX_DEBUG_STRING_LEN);
+ sprintf(new_op->upcall.req.param.s_value,
+ "%llx %llx\n",
+ c_mask.mask1,
+ c_mask.mask2);
+
+ /* service_operation returns 0 on success... */
+ rc = service_operation(new_op,
+ "orangefs_param",
+ ORANGEFS_OP_INTERRUPTIBLE);
+
+ if (rc)
+ gossip_debug(GOSSIP_DEBUGFS_DEBUG,
+ "%s: service_operation failed! rc:%d:\n",
+ __func__,
+ rc);
+
+ op_release(new_op);
+ }
+
+ mutex_lock(&orangefs_debug_lock);
+ memset(file->f_inode->i_private, 0, ORANGEFS_MAX_DEBUG_STRING_LEN);
+ sprintf((char *)file->f_inode->i_private, "%s\n", debug_string);
+ mutex_unlock(&orangefs_debug_lock);
+
+ *ppos += count;
+ if (silly)
+ rc = silly;
+ else
+ rc = count;
+
+out:
+ gossip_debug(GOSSIP_DEBUGFS_DEBUG,
+ "orangefs_debug_write: rc: %d\n",
+ rc);
+ kfree(buf);
+ return rc;
+}
diff --git a/fs/orangefs/orangefs-debugfs.h b/fs/orangefs/orangefs-debugfs.h
new file mode 100644
index 000000000000..e4828c0e3ef9
--- /dev/null
+++ b/fs/orangefs/orangefs-debugfs.h
@@ -0,0 +1,3 @@
+int orangefs_debugfs_init(void);
+int orangefs_kernel_debug_init(void);
+void orangefs_debugfs_cleanup(void);
diff --git a/fs/orangefs/orangefs-dev-proto.h b/fs/orangefs/orangefs-dev-proto.h
new file mode 100644
index 000000000000..9eac9d9a3f3a
--- /dev/null
+++ b/fs/orangefs/orangefs-dev-proto.h
@@ -0,0 +1,62 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+
+#ifndef _ORANGEFS_DEV_PROTO_H
+#define _ORANGEFS_DEV_PROTO_H
+
+/*
+ * types and constants shared between user space and kernel space for
+ * device interaction using a common protocol
+ */
+
+/*
+ * valid orangefs kernel operation types
+ */
+#define ORANGEFS_VFS_OP_INVALID 0xFF000000
+#define ORANGEFS_VFS_OP_FILE_IO 0xFF000001
+#define ORANGEFS_VFS_OP_LOOKUP 0xFF000002
+#define ORANGEFS_VFS_OP_CREATE 0xFF000003
+#define ORANGEFS_VFS_OP_GETATTR 0xFF000004
+#define ORANGEFS_VFS_OP_REMOVE 0xFF000005
+#define ORANGEFS_VFS_OP_MKDIR 0xFF000006
+#define ORANGEFS_VFS_OP_READDIR 0xFF000007
+#define ORANGEFS_VFS_OP_SETATTR 0xFF000008
+#define ORANGEFS_VFS_OP_SYMLINK 0xFF000009
+#define ORANGEFS_VFS_OP_RENAME 0xFF00000A
+#define ORANGEFS_VFS_OP_STATFS 0xFF00000B
+#define ORANGEFS_VFS_OP_TRUNCATE 0xFF00000C
+#define ORANGEFS_VFS_OP_MMAP_RA_FLUSH 0xFF00000D
+#define ORANGEFS_VFS_OP_FS_MOUNT 0xFF00000E
+#define ORANGEFS_VFS_OP_FS_UMOUNT 0xFF00000F
+#define ORANGEFS_VFS_OP_GETXATTR 0xFF000010
+#define ORANGEFS_VFS_OP_SETXATTR 0xFF000011
+#define ORANGEFS_VFS_OP_LISTXATTR 0xFF000012
+#define ORANGEFS_VFS_OP_REMOVEXATTR 0xFF000013
+#define ORANGEFS_VFS_OP_PARAM 0xFF000014
+#define ORANGEFS_VFS_OP_PERF_COUNT 0xFF000015
+#define ORANGEFS_VFS_OP_CANCEL 0xFF00EE00
+#define ORANGEFS_VFS_OP_FSYNC 0xFF00EE01
+#define ORANGEFS_VFS_OP_FSKEY 0xFF00EE02
+#define ORANGEFS_VFS_OP_READDIRPLUS 0xFF00EE03
+
+/*
+ * Misc constants. Please retain them as multiples of 8!
+ * Otherwise 32-64 bit interactions will be messed up :)
+ */
+#define ORANGEFS_MAX_DEBUG_STRING_LEN 0x00000400
+#define ORANGEFS_MAX_DEBUG_ARRAY_LEN 0x00000800
+
+/*
+ * The maximum number of directory entries in a single request is 96.
+ * XXX: Why can this not be higher. The client-side code can handle up to 512.
+ * XXX: What happens if we expect more than the client can return?
+ */
+#define ORANGEFS_MAX_DIRENT_COUNT_READDIR 96
+
+#include "upcall.h"
+#include "downcall.h"
+
+#endif
diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h
new file mode 100644
index 000000000000..a9925e296ceb
--- /dev/null
+++ b/fs/orangefs/orangefs-kernel.h
@@ -0,0 +1,623 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+
+/*
+ * The ORANGEFS Linux kernel support allows ORANGEFS volumes to be mounted and
+ * accessed through the Linux VFS (i.e. using standard I/O system calls).
+ * This support is only needed on clients that wish to mount the file system.
+ *
+ */
+
+/*
+ * Declarations and macros for the ORANGEFS Linux kernel support.
+ */
+
+#ifndef __ORANGEFSKERNEL_H
+#define __ORANGEFSKERNEL_H
+
+#include <linux/kernel.h>
+#include <linux/moduleparam.h>
+#include <linux/statfs.h>
+#include <linux/backing-dev.h>
+#include <linux/device.h>
+#include <linux/mpage.h>
+#include <linux/namei.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/fs.h>
+#include <linux/vmalloc.h>
+
+#include <linux/aio.h>
+#include <linux/posix_acl.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/compat.h>
+#include <linux/mount.h>
+#include <linux/uaccess.h>
+#include <linux/atomic.h>
+#include <linux/uio.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/wait.h>
+#include <linux/dcache.h>
+#include <linux/pagemap.h>
+#include <linux/poll.h>
+#include <linux/rwsem.h>
+#include <linux/xattr.h>
+#include <linux/exportfs.h>
+
+#include <asm/unaligned.h>
+
+#include "orangefs-dev-proto.h"
+
+#ifdef ORANGEFS_KERNEL_DEBUG
+#define ORANGEFS_DEFAULT_OP_TIMEOUT_SECS 10
+#else
+#define ORANGEFS_DEFAULT_OP_TIMEOUT_SECS 20
+#endif
+
+#define ORANGEFS_BUFMAP_WAIT_TIMEOUT_SECS 30
+
+#define ORANGEFS_DEFAULT_SLOT_TIMEOUT_SECS 900 /* 15 minutes */
+
+#define ORANGEFS_REQDEVICE_NAME "pvfs2-req"
+
+#define ORANGEFS_DEVREQ_MAGIC 0x20030529
+#define ORANGEFS_LINK_MAX 0x000000FF
+#define ORANGEFS_PURGE_RETRY_COUNT 0x00000005
+#define ORANGEFS_MAX_NUM_OPTIONS 0x00000004
+#define ORANGEFS_MAX_MOUNT_OPT_LEN 0x00000080
+#define ORANGEFS_MAX_FSKEY_LEN 64
+
+#define MAX_DEV_REQ_UPSIZE (2 * sizeof(__s32) + \
+sizeof(__u64) + sizeof(struct orangefs_upcall_s))
+#define MAX_DEV_REQ_DOWNSIZE (2 * sizeof(__s32) + \
+sizeof(__u64) + sizeof(struct orangefs_downcall_s))
+
+/*
+ * valid orangefs kernel operation states
+ *
+ * unknown - op was just initialized
+ * waiting - op is on request_list (upward bound)
+ * inprogr - op is in progress (waiting for downcall)
+ * serviced - op has matching downcall; ok
+ * purged - op has to start a timer since client-core
+ * exited uncleanly before servicing op
+ * given up - submitter has given up waiting for it
+ */
+enum orangefs_vfs_op_states {
+ OP_VFS_STATE_UNKNOWN = 0,
+ OP_VFS_STATE_WAITING = 1,
+ OP_VFS_STATE_INPROGR = 2,
+ OP_VFS_STATE_SERVICED = 4,
+ OP_VFS_STATE_PURGED = 8,
+ OP_VFS_STATE_GIVEN_UP = 16,
+};
+
+/*
+ * An array of client_debug_mask will be built to hold debug keyword/mask
+ * values fetched from userspace.
+ */
+struct client_debug_mask {
+ char *keyword;
+ __u64 mask1;
+ __u64 mask2;
+};
+
+/*
+ * orangefs kernel memory related flags
+ */
+
+#if ((defined ORANGEFS_KERNEL_DEBUG) && (defined CONFIG_DEBUG_SLAB))
+#define ORANGEFS_CACHE_CREATE_FLAGS SLAB_RED_ZONE
+#else
+#define ORANGEFS_CACHE_CREATE_FLAGS 0
+#endif /* ((defined ORANGEFS_KERNEL_DEBUG) && (defined CONFIG_DEBUG_SLAB)) */
+
+/* orangefs xattr and acl related defines */
+#define ORANGEFS_XATTR_INDEX_POSIX_ACL_ACCESS 1
+#define ORANGEFS_XATTR_INDEX_POSIX_ACL_DEFAULT 2
+#define ORANGEFS_XATTR_INDEX_TRUSTED 3
+#define ORANGEFS_XATTR_INDEX_DEFAULT 4
+
+#define ORANGEFS_XATTR_NAME_ACL_ACCESS XATTR_NAME_POSIX_ACL_ACCESS
+#define ORANGEFS_XATTR_NAME_ACL_DEFAULT XATTR_NAME_POSIX_ACL_DEFAULT
+#define ORANGEFS_XATTR_NAME_TRUSTED_PREFIX "trusted."
+#define ORANGEFS_XATTR_NAME_DEFAULT_PREFIX ""
+
+/* these functions are defined in orangefs-utils.c */
+int orangefs_prepare_cdm_array(char *debug_array_string);
+int orangefs_prepare_debugfs_help_string(int);
+
+/* defined in orangefs-debugfs.c */
+int orangefs_client_debug_init(void);
+
+void debug_string_to_mask(char *, void *, int);
+void do_c_mask(int, char *, struct client_debug_mask **);
+void do_k_mask(int, char *, __u64 **);
+
+void debug_mask_to_string(void *, int);
+void do_k_string(void *, int);
+void do_c_string(void *, int);
+int check_amalgam_keyword(void *, int);
+int keyword_is_amalgam(char *);
+
+/*these variables are defined in orangefs-mod.c */
+extern char kernel_debug_string[ORANGEFS_MAX_DEBUG_STRING_LEN];
+extern char client_debug_string[ORANGEFS_MAX_DEBUG_STRING_LEN];
+extern char client_debug_array_string[ORANGEFS_MAX_DEBUG_STRING_LEN];
+extern unsigned int kernel_mask_set_mod_init;
+
+extern int orangefs_init_acl(struct inode *inode, struct inode *dir);
+extern const struct xattr_handler *orangefs_xattr_handlers[];
+
+extern struct posix_acl *orangefs_get_acl(struct inode *inode, int type);
+extern int orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+
+/*
+ * Redefine xtvec structure so that we could move helper functions out of
+ * the define
+ */
+struct xtvec {
+ __kernel_off_t xtv_off; /* must be off_t */
+ __kernel_size_t xtv_len; /* must be size_t */
+};
+
+/*
+ * orangefs data structures
+ */
+struct orangefs_kernel_op_s {
+ enum orangefs_vfs_op_states op_state;
+ __u64 tag;
+
+ /*
+ * Set uses_shared_memory to non zero if this operation uses
+ * shared memory. If true, then a retry on the op must also
+ * get a new shared memory buffer and re-populate it.
+ * Cancels don't care - it only matters for service_operation()
+ * retry logics and cancels don't go through it anymore. It
+ * safely stays non-zero when we use it as slot_to_free.
+ */
+ union {
+ int uses_shared_memory;
+ int slot_to_free;
+ };
+
+ struct orangefs_upcall_s upcall;
+ struct orangefs_downcall_s downcall;
+
+ struct completion waitq;
+ spinlock_t lock;
+
+ int attempts;
+
+ struct list_head list;
+};
+
+#define set_op_state_waiting(op) ((op)->op_state = OP_VFS_STATE_WAITING)
+#define set_op_state_inprogress(op) ((op)->op_state = OP_VFS_STATE_INPROGR)
+#define set_op_state_given_up(op) ((op)->op_state = OP_VFS_STATE_GIVEN_UP)
+static inline void set_op_state_serviced(struct orangefs_kernel_op_s *op)
+{
+ op->op_state = OP_VFS_STATE_SERVICED;
+ complete(&op->waitq);
+}
+
+#define op_state_waiting(op) ((op)->op_state & OP_VFS_STATE_WAITING)
+#define op_state_in_progress(op) ((op)->op_state & OP_VFS_STATE_INPROGR)
+#define op_state_serviced(op) ((op)->op_state & OP_VFS_STATE_SERVICED)
+#define op_state_purged(op) ((op)->op_state & OP_VFS_STATE_PURGED)
+#define op_state_given_up(op) ((op)->op_state & OP_VFS_STATE_GIVEN_UP)
+#define op_is_cancel(op) ((op)->upcall.type == ORANGEFS_VFS_OP_CANCEL)
+
+void op_release(struct orangefs_kernel_op_s *op);
+
+extern void orangefs_bufmap_put(int);
+static inline void put_cancel(struct orangefs_kernel_op_s *op)
+{
+ orangefs_bufmap_put(op->slot_to_free);
+ op_release(op);
+}
+
+static inline void set_op_state_purged(struct orangefs_kernel_op_s *op)
+{
+ spin_lock(&op->lock);
+ if (unlikely(op_is_cancel(op))) {
+ list_del_init(&op->list);
+ spin_unlock(&op->lock);
+ put_cancel(op);
+ } else {
+ op->op_state |= OP_VFS_STATE_PURGED;
+ complete(&op->waitq);
+ spin_unlock(&op->lock);
+ }
+}
+
+/* per inode private orangefs info */
+struct orangefs_inode_s {
+ struct orangefs_object_kref refn;
+ char link_target[ORANGEFS_NAME_MAX];
+ __s64 blksize;
+ /*
+ * Reading/Writing Extended attributes need to acquire the appropriate
+ * reader/writer semaphore on the orangefs_inode_s structure.
+ */
+ struct rw_semaphore xattr_sem;
+
+ struct inode vfs_inode;
+ sector_t last_failed_block_index_read;
+
+ /*
+ * State of in-memory attributes not yet flushed to disk associated
+ * with this object
+ */
+ unsigned long pinode_flags;
+};
+
+#define P_ATIME_FLAG 0
+#define P_MTIME_FLAG 1
+#define P_CTIME_FLAG 2
+#define P_MODE_FLAG 3
+
+#define ClearAtimeFlag(pinode) clear_bit(P_ATIME_FLAG, &(pinode)->pinode_flags)
+#define SetAtimeFlag(pinode) set_bit(P_ATIME_FLAG, &(pinode)->pinode_flags)
+#define AtimeFlag(pinode) test_bit(P_ATIME_FLAG, &(pinode)->pinode_flags)
+
+#define ClearMtimeFlag(pinode) clear_bit(P_MTIME_FLAG, &(pinode)->pinode_flags)
+#define SetMtimeFlag(pinode) set_bit(P_MTIME_FLAG, &(pinode)->pinode_flags)
+#define MtimeFlag(pinode) test_bit(P_MTIME_FLAG, &(pinode)->pinode_flags)
+
+#define ClearCtimeFlag(pinode) clear_bit(P_CTIME_FLAG, &(pinode)->pinode_flags)
+#define SetCtimeFlag(pinode) set_bit(P_CTIME_FLAG, &(pinode)->pinode_flags)
+#define CtimeFlag(pinode) test_bit(P_CTIME_FLAG, &(pinode)->pinode_flags)
+
+#define ClearModeFlag(pinode) clear_bit(P_MODE_FLAG, &(pinode)->pinode_flags)
+#define SetModeFlag(pinode) set_bit(P_MODE_FLAG, &(pinode)->pinode_flags)
+#define ModeFlag(pinode) test_bit(P_MODE_FLAG, &(pinode)->pinode_flags)
+
+/* per superblock private orangefs info */
+struct orangefs_sb_info_s {
+ struct orangefs_khandle root_khandle;
+ __s32 fs_id;
+ int id;
+ int flags;
+#define ORANGEFS_OPT_INTR 0x01
+#define ORANGEFS_OPT_LOCAL_LOCK 0x02
+ char devname[ORANGEFS_MAX_SERVER_ADDR_LEN];
+ struct super_block *sb;
+ int mount_pending;
+ struct list_head list;
+};
+
+/*
+ * structure that holds the state of any async I/O operation issued
+ * through the VFS. Needed especially to handle cancellation requests
+ * or even completion notification so that the VFS client-side daemon
+ * can free up its vfs_request slots.
+ */
+struct orangefs_kiocb_s {
+ /* the pointer to the task that initiated the AIO */
+ struct task_struct *tsk;
+
+ /* pointer to the kiocb that kicked this operation */
+ struct kiocb *kiocb;
+
+ /* buffer index that was used for the I/O */
+ struct orangefs_bufmap *bufmap;
+ int buffer_index;
+
+ /* orangefs kernel operation type */
+ struct orangefs_kernel_op_s *op;
+
+ /* The user space buffers from/to which I/O is being staged */
+ struct iovec *iov;
+
+ /* number of elements in the iovector */
+ unsigned long nr_segs;
+
+ /* set to indicate the type of the operation */
+ int rw;
+
+ /* file offset */
+ loff_t offset;
+
+ /* and the count in bytes */
+ size_t bytes_to_be_copied;
+
+ ssize_t bytes_copied;
+ int needs_cleanup;
+};
+
+struct orangefs_stats {
+ unsigned long cache_hits;
+ unsigned long cache_misses;
+ unsigned long reads;
+ unsigned long writes;
+};
+
+extern struct orangefs_stats g_orangefs_stats;
+
+/*
+ * NOTE: See Documentation/filesystems/porting for information
+ * on implementing FOO_I and properly accessing fs private data
+ */
+static inline struct orangefs_inode_s *ORANGEFS_I(struct inode *inode)
+{
+ return container_of(inode, struct orangefs_inode_s, vfs_inode);
+}
+
+static inline struct orangefs_sb_info_s *ORANGEFS_SB(struct super_block *sb)
+{
+ return (struct orangefs_sb_info_s *) sb->s_fs_info;
+}
+
+/* ino_t descends from "unsigned long", 8 bytes, 64 bits. */
+static inline ino_t orangefs_khandle_to_ino(struct orangefs_khandle *khandle)
+{
+ union {
+ unsigned char u[8];
+ __u64 ino;
+ } ihandle;
+
+ ihandle.u[0] = khandle->u[0] ^ khandle->u[4];
+ ihandle.u[1] = khandle->u[1] ^ khandle->u[5];
+ ihandle.u[2] = khandle->u[2] ^ khandle->u[6];
+ ihandle.u[3] = khandle->u[3] ^ khandle->u[7];
+ ihandle.u[4] = khandle->u[12] ^ khandle->u[8];
+ ihandle.u[5] = khandle->u[13] ^ khandle->u[9];
+ ihandle.u[6] = khandle->u[14] ^ khandle->u[10];
+ ihandle.u[7] = khandle->u[15] ^ khandle->u[11];
+
+ return ihandle.ino;
+}
+
+static inline struct orangefs_khandle *get_khandle_from_ino(struct inode *inode)
+{
+ return &(ORANGEFS_I(inode)->refn.khandle);
+}
+
+static inline __s32 get_fsid_from_ino(struct inode *inode)
+{
+ return ORANGEFS_I(inode)->refn.fs_id;
+}
+
+static inline ino_t get_ino_from_khandle(struct inode *inode)
+{
+ struct orangefs_khandle *khandle;
+ ino_t ino;
+
+ khandle = get_khandle_from_ino(inode);
+ ino = orangefs_khandle_to_ino(khandle);
+ return ino;
+}
+
+static inline ino_t get_parent_ino_from_dentry(struct dentry *dentry)
+{
+ return get_ino_from_khandle(dentry->d_parent->d_inode);
+}
+
+static inline int is_root_handle(struct inode *inode)
+{
+ gossip_debug(GOSSIP_DCACHE_DEBUG,
+ "%s: root handle: %pU, this handle: %pU:\n",
+ __func__,
+ &ORANGEFS_SB(inode->i_sb)->root_khandle,
+ get_khandle_from_ino(inode));
+
+ if (ORANGEFS_khandle_cmp(&(ORANGEFS_SB(inode->i_sb)->root_khandle),
+ get_khandle_from_ino(inode)))
+ return 0;
+ else
+ return 1;
+}
+
+static inline int match_handle(struct orangefs_khandle resp_handle,
+ struct inode *inode)
+{
+ gossip_debug(GOSSIP_DCACHE_DEBUG,
+ "%s: one handle: %pU, another handle:%pU:\n",
+ __func__,
+ &resp_handle,
+ get_khandle_from_ino(inode));
+
+ if (ORANGEFS_khandle_cmp(&resp_handle, get_khandle_from_ino(inode)))
+ return 0;
+ else
+ return 1;
+}
+
+/*
+ * defined in orangefs-cache.c
+ */
+int op_cache_initialize(void);
+int op_cache_finalize(void);
+struct orangefs_kernel_op_s *op_alloc(__s32 type);
+void orangefs_new_tag(struct orangefs_kernel_op_s *op);
+char *get_opname_string(struct orangefs_kernel_op_s *new_op);
+
+int orangefs_inode_cache_initialize(void);
+int orangefs_inode_cache_finalize(void);
+
+/*
+ * defined in orangefs-mod.c
+ */
+void purge_inprogress_ops(void);
+
+/*
+ * defined in waitqueue.c
+ */
+void purge_waiting_ops(void);
+
+/*
+ * defined in super.c
+ */
+struct dentry *orangefs_mount(struct file_system_type *fst,
+ int flags,
+ const char *devname,
+ void *data);
+
+void orangefs_kill_sb(struct super_block *sb);
+int orangefs_remount(struct orangefs_sb_info_s *);
+
+int fsid_key_table_initialize(void);
+void fsid_key_table_finalize(void);
+
+/*
+ * defined in inode.c
+ */
+__u32 convert_to_orangefs_mask(unsigned long lite_mask);
+struct inode *orangefs_new_inode(struct super_block *sb,
+ struct inode *dir,
+ int mode,
+ dev_t dev,
+ struct orangefs_object_kref *ref);
+
+int orangefs_setattr(struct dentry *dentry, struct iattr *iattr);
+
+int orangefs_getattr(struct vfsmount *mnt,
+ struct dentry *dentry,
+ struct kstat *kstat);
+
+int orangefs_permission(struct inode *inode, int mask);
+
+/*
+ * defined in xattr.c
+ */
+int orangefs_setxattr(struct dentry *dentry,
+ const char *name,
+ const void *value,
+ size_t size,
+ int flags);
+
+ssize_t orangefs_getxattr(struct dentry *dentry,
+ const char *name,
+ void *buffer,
+ size_t size);
+
+ssize_t orangefs_listxattr(struct dentry *dentry, char *buffer, size_t size);
+
+/*
+ * defined in namei.c
+ */
+struct inode *orangefs_iget(struct super_block *sb,
+ struct orangefs_object_kref *ref);
+
+ssize_t orangefs_inode_read(struct inode *inode,
+ struct iov_iter *iter,
+ loff_t *offset,
+ loff_t readahead_size);
+
+/*
+ * defined in devorangefs-req.c
+ */
+int orangefs_dev_init(void);
+void orangefs_dev_cleanup(void);
+int is_daemon_in_service(void);
+bool __is_daemon_in_service(void);
+
+/*
+ * defined in orangefs-utils.c
+ */
+__s32 fsid_of_op(struct orangefs_kernel_op_s *op);
+
+int orangefs_flush_inode(struct inode *inode);
+
+ssize_t orangefs_inode_getxattr(struct inode *inode,
+ const char *prefix,
+ const char *name,
+ void *buffer,
+ size_t size);
+
+int orangefs_inode_setxattr(struct inode *inode,
+ const char *prefix,
+ const char *name,
+ const void *value,
+ size_t size,
+ int flags);
+
+int orangefs_inode_getattr(struct inode *inode, int new, int size);
+
+int orangefs_inode_check_changed(struct inode *inode);
+
+int orangefs_inode_setattr(struct inode *inode, struct iattr *iattr);
+
+void orangefs_make_bad_inode(struct inode *inode);
+
+int orangefs_unmount_sb(struct super_block *sb);
+
+bool orangefs_cancel_op_in_progress(struct orangefs_kernel_op_s *op);
+
+int orangefs_normalize_to_errno(__s32 error_code);
+
+extern struct mutex devreq_mutex;
+extern struct mutex request_mutex;
+extern int debug;
+extern int op_timeout_secs;
+extern int slot_timeout_secs;
+extern struct list_head orangefs_superblocks;
+extern spinlock_t orangefs_superblocks_lock;
+extern struct list_head orangefs_request_list;
+extern spinlock_t orangefs_request_list_lock;
+extern wait_queue_head_t orangefs_request_list_waitq;
+extern struct list_head *htable_ops_in_progress;
+extern spinlock_t htable_ops_in_progress_lock;
+extern int hash_table_size;
+
+extern const struct address_space_operations orangefs_address_operations;
+extern struct backing_dev_info orangefs_backing_dev_info;
+extern struct inode_operations orangefs_file_inode_operations;
+extern const struct file_operations orangefs_file_operations;
+extern struct inode_operations orangefs_symlink_inode_operations;
+extern struct inode_operations orangefs_dir_inode_operations;
+extern const struct file_operations orangefs_dir_operations;
+extern const struct dentry_operations orangefs_dentry_operations;
+extern const struct file_operations orangefs_devreq_file_operations;
+
+extern wait_queue_head_t orangefs_bufmap_init_waitq;
+
+/*
+ * misc convenience macros
+ */
+
+#define ORANGEFS_OP_INTERRUPTIBLE 1 /* service_operation() is interruptible */
+#define ORANGEFS_OP_PRIORITY 2 /* service_operation() is high priority */
+#define ORANGEFS_OP_CANCELLATION 4 /* this is a cancellation */
+#define ORANGEFS_OP_NO_MUTEX 8 /* don't acquire request_mutex */
+#define ORANGEFS_OP_ASYNC 16 /* Queue it, but don't wait */
+
+int service_operation(struct orangefs_kernel_op_s *op,
+ const char *op_name,
+ int flags);
+
+#define get_interruptible_flag(inode) \
+ ((ORANGEFS_SB(inode->i_sb)->flags & ORANGEFS_OPT_INTR) ? \
+ ORANGEFS_OP_INTERRUPTIBLE : 0)
+
+#define fill_default_sys_attrs(sys_attr, type, mode) \
+do { \
+ sys_attr.owner = from_kuid(current_user_ns(), current_fsuid()); \
+ sys_attr.group = from_kgid(current_user_ns(), current_fsgid()); \
+ sys_attr.perms = ORANGEFS_util_translate_mode(mode); \
+ sys_attr.mtime = 0; \
+ sys_attr.atime = 0; \
+ sys_attr.ctime = 0; \
+ sys_attr.mask = ORANGEFS_ATTR_SYS_ALL_SETABLE; \
+} while (0)
+
+static inline void orangefs_i_size_write(struct inode *inode, loff_t i_size)
+{
+#if BITS_PER_LONG == 32 && defined(CONFIG_SMP)
+ mutex_lock(&inode->i_mutex);
+#endif
+ i_size_write(inode, i_size);
+#if BITS_PER_LONG == 32 && defined(CONFIG_SMP)
+ mutex_unlock(&inode->i_mutex);
+#endif
+}
+
+#endif /* __ORANGEFSKERNEL_H */
diff --git a/fs/orangefs/orangefs-mod.c b/fs/orangefs/orangefs-mod.c
new file mode 100644
index 000000000000..6f072a8c0de1
--- /dev/null
+++ b/fs/orangefs/orangefs-mod.c
@@ -0,0 +1,293 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * Changes by Acxiom Corporation to add proc file handler for pvfs2 client
+ * parameters, Copyright Acxiom Corporation, 2005.
+ *
+ * See COPYING in top-level directory.
+ */
+
+#include "protocol.h"
+#include "orangefs-kernel.h"
+#include "orangefs-debugfs.h"
+#include "orangefs-sysfs.h"
+
+/* ORANGEFS_VERSION is a ./configure define */
+#ifndef ORANGEFS_VERSION
+#define ORANGEFS_VERSION "upstream"
+#endif
+
+/*
+ * global variables declared here
+ */
+
+/* array of client debug keyword/mask values */
+struct client_debug_mask *cdm_array;
+int cdm_element_count;
+
+char kernel_debug_string[ORANGEFS_MAX_DEBUG_STRING_LEN] = "none";
+char client_debug_string[ORANGEFS_MAX_DEBUG_STRING_LEN];
+char client_debug_array_string[ORANGEFS_MAX_DEBUG_STRING_LEN];
+
+char *debug_help_string;
+int help_string_initialized;
+struct dentry *help_file_dentry;
+struct dentry *client_debug_dentry;
+struct dentry *debug_dir;
+int client_verbose_index;
+int client_all_index;
+struct orangefs_stats g_orangefs_stats;
+
+/* the size of the hash tables for ops in progress */
+int hash_table_size = 509;
+
+static ulong module_parm_debug_mask;
+__u64 gossip_debug_mask;
+struct client_debug_mask client_debug_mask = { NULL, 0, 0 };
+unsigned int kernel_mask_set_mod_init; /* implicitly false */
+int op_timeout_secs = ORANGEFS_DEFAULT_OP_TIMEOUT_SECS;
+int slot_timeout_secs = ORANGEFS_DEFAULT_SLOT_TIMEOUT_SECS;
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("ORANGEFS Development Team");
+MODULE_DESCRIPTION("The Linux Kernel VFS interface to ORANGEFS");
+MODULE_PARM_DESC(module_parm_debug_mask, "debugging level (see orangefs-debug.h for values)");
+MODULE_PARM_DESC(op_timeout_secs, "Operation timeout in seconds");
+MODULE_PARM_DESC(slot_timeout_secs, "Slot timeout in seconds");
+MODULE_PARM_DESC(hash_table_size,
+ "size of hash table for operations in progress");
+
+static struct file_system_type orangefs_fs_type = {
+ .name = "pvfs2",
+ .mount = orangefs_mount,
+ .kill_sb = orangefs_kill_sb,
+ .owner = THIS_MODULE,
+};
+
+module_param(hash_table_size, int, 0);
+module_param(module_parm_debug_mask, ulong, 0644);
+module_param(op_timeout_secs, int, 0);
+module_param(slot_timeout_secs, int, 0);
+
+/* synchronizes the request device file */
+DEFINE_MUTEX(devreq_mutex);
+
+/*
+ * Blocks non-priority requests from being queued for servicing. This
+ * could be used for protecting the request list data structure, but
+ * for now it's only being used to stall the op addition to the request
+ * list
+ */
+DEFINE_MUTEX(request_mutex);
+
+/* hash table for storing operations waiting for matching downcall */
+struct list_head *htable_ops_in_progress;
+DEFINE_SPINLOCK(htable_ops_in_progress_lock);
+
+/* list for queueing upcall operations */
+LIST_HEAD(orangefs_request_list);
+
+/* used to protect the above orangefs_request_list */
+DEFINE_SPINLOCK(orangefs_request_list_lock);
+
+/* used for incoming request notification */
+DECLARE_WAIT_QUEUE_HEAD(orangefs_request_list_waitq);
+
+static int __init orangefs_init(void)
+{
+ int ret = -1;
+ __u32 i = 0;
+
+ /* convert input debug mask to a 64-bit unsigned integer */
+ gossip_debug_mask = (unsigned long long) module_parm_debug_mask;
+
+ /*
+ * set the kernel's gossip debug string; invalid mask values will
+ * be ignored.
+ */
+ debug_mask_to_string(&gossip_debug_mask, 0);
+
+ /* remove any invalid values from the mask */
+ debug_string_to_mask(kernel_debug_string, &gossip_debug_mask, 0);
+
+ /*
+ * if the mask has a non-zero value, then indicate that the mask
+ * was set when the kernel module was loaded. The orangefs dev ioctl
+ * command will look at this boolean to determine if the kernel's
+ * debug mask should be overwritten when the client-core is started.
+ */
+ if (gossip_debug_mask != 0)
+ kernel_mask_set_mod_init = true;
+
+ pr_info("%s: called with debug mask: :%s: :%llx:\n",
+ __func__,
+ kernel_debug_string,
+ (unsigned long long)gossip_debug_mask);
+
+ ret = bdi_init(&orangefs_backing_dev_info);
+
+ if (ret)
+ return ret;
+
+ if (op_timeout_secs < 0)
+ op_timeout_secs = 0;
+
+ if (slot_timeout_secs < 0)
+ slot_timeout_secs = 0;
+
+ /* initialize global book keeping data structures */
+ ret = op_cache_initialize();
+ if (ret < 0)
+ goto err;
+
+ ret = orangefs_inode_cache_initialize();
+ if (ret < 0)
+ goto cleanup_op;
+
+ htable_ops_in_progress =
+ kcalloc(hash_table_size, sizeof(struct list_head), GFP_KERNEL);
+ if (!htable_ops_in_progress) {
+ gossip_err("Failed to initialize op hashtable");
+ ret = -ENOMEM;
+ goto cleanup_inode;
+ }
+
+ /* initialize a doubly linked at each hash table index */
+ for (i = 0; i < hash_table_size; i++)
+ INIT_LIST_HEAD(&htable_ops_in_progress[i]);
+
+ ret = fsid_key_table_initialize();
+ if (ret < 0)
+ goto cleanup_progress_table;
+
+ /*
+ * Build the contents of /sys/kernel/debug/orangefs/debug-help
+ * from the keywords in the kernel keyword/mask array.
+ *
+ * The keywords in the client keyword/mask array are
+ * unknown at boot time.
+ *
+ * orangefs_prepare_debugfs_help_string will be used again
+ * later to rebuild the debug-help file after the client starts
+ * and passes along the needed info. The argument signifies
+ * which time orangefs_prepare_debugfs_help_string is being
+ * called.
+ */
+ ret = orangefs_prepare_debugfs_help_string(1);
+ if (ret)
+ goto cleanup_key_table;
+
+ ret = orangefs_debugfs_init();
+ if (ret)
+ goto debugfs_init_failed;
+
+ ret = orangefs_kernel_debug_init();
+ if (ret)
+ goto kernel_debug_init_failed;
+
+ ret = orangefs_sysfs_init();
+ if (ret)
+ goto sysfs_init_failed;
+
+ /* Initialize the orangefsdev subsystem. */
+ ret = orangefs_dev_init();
+ if (ret < 0) {
+ gossip_err("%s: could not initialize device subsystem %d!\n",
+ __func__,
+ ret);
+ goto cleanup_device;
+ }
+
+ ret = register_filesystem(&orangefs_fs_type);
+ if (ret == 0) {
+ pr_info("orangefs: module version %s loaded\n", ORANGEFS_VERSION);
+ ret = 0;
+ goto out;
+ }
+
+ orangefs_sysfs_exit();
+
+cleanup_device:
+ orangefs_dev_cleanup();
+
+sysfs_init_failed:
+
+kernel_debug_init_failed:
+
+debugfs_init_failed:
+ orangefs_debugfs_cleanup();
+
+cleanup_key_table:
+ fsid_key_table_finalize();
+
+cleanup_progress_table:
+ kfree(htable_ops_in_progress);
+
+cleanup_inode:
+ orangefs_inode_cache_finalize();
+
+cleanup_op:
+ op_cache_finalize();
+
+err:
+ bdi_destroy(&orangefs_backing_dev_info);
+
+out:
+ return ret;
+}
+
+static void __exit orangefs_exit(void)
+{
+ int i = 0;
+ gossip_debug(GOSSIP_INIT_DEBUG, "orangefs: orangefs_exit called\n");
+
+ unregister_filesystem(&orangefs_fs_type);
+ orangefs_debugfs_cleanup();
+ orangefs_sysfs_exit();
+ fsid_key_table_finalize();
+ orangefs_dev_cleanup();
+ BUG_ON(!list_empty(&orangefs_request_list));
+ for (i = 0; i < hash_table_size; i++)
+ BUG_ON(!list_empty(&htable_ops_in_progress[i]));
+
+ orangefs_inode_cache_finalize();
+ op_cache_finalize();
+
+ kfree(htable_ops_in_progress);
+
+ bdi_destroy(&orangefs_backing_dev_info);
+
+ pr_info("orangefs: module version %s unloaded\n", ORANGEFS_VERSION);
+}
+
+/*
+ * What we do in this function is to walk the list of operations
+ * that are in progress in the hash table and mark them as purged as well.
+ */
+void purge_inprogress_ops(void)
+{
+ int i;
+
+ for (i = 0; i < hash_table_size; i++) {
+ struct orangefs_kernel_op_s *op;
+ struct orangefs_kernel_op_s *next;
+
+ spin_lock(&htable_ops_in_progress_lock);
+ list_for_each_entry_safe(op,
+ next,
+ &htable_ops_in_progress[i],
+ list) {
+ set_op_state_purged(op);
+ gossip_debug(GOSSIP_DEV_DEBUG,
+ "%s: op:%s: op_state:%d: process:%s:\n",
+ __func__,
+ get_opname_string(op),
+ op->op_state,
+ current->comm);
+ }
+ spin_unlock(&htable_ops_in_progress_lock);
+ }
+}
+
+module_init(orangefs_init);
+module_exit(orangefs_exit);
diff --git a/fs/orangefs/orangefs-sysfs.c b/fs/orangefs/orangefs-sysfs.c
new file mode 100644
index 000000000000..5c03113e3ad2
--- /dev/null
+++ b/fs/orangefs/orangefs-sysfs.c
@@ -0,0 +1,1772 @@
+/*
+ * Documentation/ABI/stable/orangefs-sysfs:
+ *
+ * What: /sys/fs/orangefs/perf_counter_reset
+ * Date: June 2015
+ * Contact: Mike Marshall <hubcap@omnibond.com>
+ * Description:
+ * echo a 0 or a 1 into perf_counter_reset to
+ * reset all the counters in
+ * /sys/fs/orangefs/perf_counters
+ * except ones with PINT_PERF_PRESERVE set.
+ *
+ *
+ * What: /sys/fs/orangefs/perf_counters/...
+ * Date: Jun 2015
+ * Contact: Mike Marshall <hubcap@omnibond.com>
+ * Description:
+ * Counters and settings for various caches.
+ * Read only.
+ *
+ *
+ * What: /sys/fs/orangefs/perf_time_interval_secs
+ * Date: Jun 2015
+ * Contact: Mike Marshall <hubcap@omnibond.com>
+ * Description:
+ * Length of perf counter intervals in
+ * seconds.
+ *
+ *
+ * What: /sys/fs/orangefs/perf_history_size
+ * Date: Jun 2015
+ * Contact: Mike Marshall <hubcap@omnibond.com>
+ * Description:
+ * The perf_counters cache statistics have N, or
+ * perf_history_size, samples. The default is
+ * one.
+ *
+ * Every perf_time_interval_secs the (first)
+ * samples are reset.
+ *
+ * If N is greater than one, the "current" set
+ * of samples is reset, and the samples from the
+ * other N-1 intervals remain available.
+ *
+ *
+ * What: /sys/fs/orangefs/op_timeout_secs
+ * Date: Jun 2015
+ * Contact: Mike Marshall <hubcap@omnibond.com>
+ * Description:
+ * Service operation timeout in seconds.
+ *
+ *
+ * What: /sys/fs/orangefs/slot_timeout_secs
+ * Date: Jun 2015
+ * Contact: Mike Marshall <hubcap@omnibond.com>
+ * Description:
+ * "Slot" timeout in seconds. A "slot"
+ * is an indexed buffer in the shared
+ * memory segment used for communication
+ * between the kernel module and userspace.
+ * Slots are requested and waited for,
+ * the wait times out after slot_timeout_secs.
+ *
+ *
+ * What: /sys/fs/orangefs/acache/...
+ * Date: Jun 2015
+ * Contact: Mike Marshall <hubcap@omnibond.com>
+ * Description:
+ * Attribute cache configurable settings.
+ *
+ *
+ * What: /sys/fs/orangefs/ncache/...
+ * Date: Jun 2015
+ * Contact: Mike Marshall <hubcap@omnibond.com>
+ * Description:
+ * Name cache configurable settings.
+ *
+ *
+ * What: /sys/fs/orangefs/capcache/...
+ * Date: Jun 2015
+ * Contact: Mike Marshall <hubcap@omnibond.com>
+ * Description:
+ * Capability cache configurable settings.
+ *
+ *
+ * What: /sys/fs/orangefs/ccache/...
+ * Date: Jun 2015
+ * Contact: Mike Marshall <hubcap@omnibond.com>
+ * Description:
+ * Credential cache configurable settings.
+ *
+ */
+
+#include <linux/fs.h>
+#include <linux/kobject.h>
+#include <linux/string.h>
+#include <linux/sysfs.h>
+#include <linux/module.h>
+#include <linux/init.h>
+
+#include "protocol.h"
+#include "orangefs-kernel.h"
+#include "orangefs-sysfs.h"
+
+#define ORANGEFS_KOBJ_ID "orangefs"
+#define ACACHE_KOBJ_ID "acache"
+#define CAPCACHE_KOBJ_ID "capcache"
+#define CCACHE_KOBJ_ID "ccache"
+#define NCACHE_KOBJ_ID "ncache"
+#define PC_KOBJ_ID "pc"
+#define STATS_KOBJ_ID "stats"
+
+struct orangefs_obj {
+ struct kobject kobj;
+ int op_timeout_secs;
+ int perf_counter_reset;
+ int perf_history_size;
+ int perf_time_interval_secs;
+ int slot_timeout_secs;
+};
+
+struct acache_orangefs_obj {
+ struct kobject kobj;
+ int hard_limit;
+ int reclaim_percentage;
+ int soft_limit;
+ int timeout_msecs;
+};
+
+struct capcache_orangefs_obj {
+ struct kobject kobj;
+ int hard_limit;
+ int reclaim_percentage;
+ int soft_limit;
+ int timeout_secs;
+};
+
+struct ccache_orangefs_obj {
+ struct kobject kobj;
+ int hard_limit;
+ int reclaim_percentage;
+ int soft_limit;
+ int timeout_secs;
+};
+
+struct ncache_orangefs_obj {
+ struct kobject kobj;
+ int hard_limit;
+ int reclaim_percentage;
+ int soft_limit;
+ int timeout_msecs;
+};
+
+struct pc_orangefs_obj {
+ struct kobject kobj;
+ char *acache;
+ char *capcache;
+ char *ncache;
+};
+
+struct stats_orangefs_obj {
+ struct kobject kobj;
+ int reads;
+ int writes;
+};
+
+struct orangefs_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct orangefs_obj *orangefs_obj,
+ struct orangefs_attribute *attr,
+ char *buf);
+ ssize_t (*store)(struct orangefs_obj *orangefs_obj,
+ struct orangefs_attribute *attr,
+ const char *buf,
+ size_t count);
+};
+
+struct acache_orangefs_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct acache_orangefs_obj *acache_orangefs_obj,
+ struct acache_orangefs_attribute *attr,
+ char *buf);
+ ssize_t (*store)(struct acache_orangefs_obj *acache_orangefs_obj,
+ struct acache_orangefs_attribute *attr,
+ const char *buf,
+ size_t count);
+};
+
+struct capcache_orangefs_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct capcache_orangefs_obj *capcache_orangefs_obj,
+ struct capcache_orangefs_attribute *attr,
+ char *buf);
+ ssize_t (*store)(struct capcache_orangefs_obj *capcache_orangefs_obj,
+ struct capcache_orangefs_attribute *attr,
+ const char *buf,
+ size_t count);
+};
+
+struct ccache_orangefs_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct ccache_orangefs_obj *ccache_orangefs_obj,
+ struct ccache_orangefs_attribute *attr,
+ char *buf);
+ ssize_t (*store)(struct ccache_orangefs_obj *ccache_orangefs_obj,
+ struct ccache_orangefs_attribute *attr,
+ const char *buf,
+ size_t count);
+};
+
+struct ncache_orangefs_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct ncache_orangefs_obj *ncache_orangefs_obj,
+ struct ncache_orangefs_attribute *attr,
+ char *buf);
+ ssize_t (*store)(struct ncache_orangefs_obj *ncache_orangefs_obj,
+ struct ncache_orangefs_attribute *attr,
+ const char *buf,
+ size_t count);
+};
+
+struct pc_orangefs_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct pc_orangefs_obj *pc_orangefs_obj,
+ struct pc_orangefs_attribute *attr,
+ char *buf);
+ ssize_t (*store)(struct pc_orangefs_obj *pc_orangefs_obj,
+ struct pc_orangefs_attribute *attr,
+ const char *buf,
+ size_t count);
+};
+
+struct stats_orangefs_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct stats_orangefs_obj *stats_orangefs_obj,
+ struct stats_orangefs_attribute *attr,
+ char *buf);
+ ssize_t (*store)(struct stats_orangefs_obj *stats_orangefs_obj,
+ struct stats_orangefs_attribute *attr,
+ const char *buf,
+ size_t count);
+};
+
+static ssize_t orangefs_attr_show(struct kobject *kobj,
+ struct attribute *attr,
+ char *buf)
+{
+ struct orangefs_attribute *attribute;
+ struct orangefs_obj *orangefs_obj;
+ int rc;
+
+ attribute = container_of(attr, struct orangefs_attribute, attr);
+ orangefs_obj = container_of(kobj, struct orangefs_obj, kobj);
+
+ if (!attribute->show) {
+ rc = -EIO;
+ goto out;
+ }
+
+ rc = attribute->show(orangefs_obj, attribute, buf);
+
+out:
+ return rc;
+}
+
+static ssize_t orangefs_attr_store(struct kobject *kobj,
+ struct attribute *attr,
+ const char *buf,
+ size_t len)
+{
+ struct orangefs_attribute *attribute;
+ struct orangefs_obj *orangefs_obj;
+ int rc;
+
+ gossip_debug(GOSSIP_SYSFS_DEBUG,
+ "orangefs_attr_store: start\n");
+
+ attribute = container_of(attr, struct orangefs_attribute, attr);
+ orangefs_obj = container_of(kobj, struct orangefs_obj, kobj);
+
+ if (!attribute->store) {
+ rc = -EIO;
+ goto out;
+ }
+
+ rc = attribute->store(orangefs_obj, attribute, buf, len);
+
+out:
+ return rc;
+}
+
+static const struct sysfs_ops orangefs_sysfs_ops = {
+ .show = orangefs_attr_show,
+ .store = orangefs_attr_store,
+};
+
+static ssize_t acache_orangefs_attr_show(struct kobject *kobj,
+ struct attribute *attr,
+ char *buf)
+{
+ struct acache_orangefs_attribute *attribute;
+ struct acache_orangefs_obj *acache_orangefs_obj;
+ int rc;
+
+ attribute = container_of(attr, struct acache_orangefs_attribute, attr);
+ acache_orangefs_obj =
+ container_of(kobj, struct acache_orangefs_obj, kobj);
+
+ if (!attribute->show) {
+ rc = -EIO;
+ goto out;
+ }
+
+ rc = attribute->show(acache_orangefs_obj, attribute, buf);
+
+out:
+ return rc;
+}
+
+static ssize_t acache_orangefs_attr_store(struct kobject *kobj,
+ struct attribute *attr,
+ const char *buf,
+ size_t len)
+{
+ struct acache_orangefs_attribute *attribute;
+ struct acache_orangefs_obj *acache_orangefs_obj;
+ int rc;
+
+ gossip_debug(GOSSIP_SYSFS_DEBUG,
+ "acache_orangefs_attr_store: start\n");
+
+ attribute = container_of(attr, struct acache_orangefs_attribute, attr);
+ acache_orangefs_obj =
+ container_of(kobj, struct acache_orangefs_obj, kobj);
+
+ if (!attribute->store) {
+ rc = -EIO;
+ goto out;
+ }
+
+ rc = attribute->store(acache_orangefs_obj, attribute, buf, len);
+
+out:
+ return rc;
+}
+
+static const struct sysfs_ops acache_orangefs_sysfs_ops = {
+ .show = acache_orangefs_attr_show,
+ .store = acache_orangefs_attr_store,
+};
+
+static ssize_t capcache_orangefs_attr_show(struct kobject *kobj,
+ struct attribute *attr,
+ char *buf)
+{
+ struct capcache_orangefs_attribute *attribute;
+ struct capcache_orangefs_obj *capcache_orangefs_obj;
+ int rc;
+
+ attribute =
+ container_of(attr, struct capcache_orangefs_attribute, attr);
+ capcache_orangefs_obj =
+ container_of(kobj, struct capcache_orangefs_obj, kobj);
+
+ if (!attribute->show) {
+ rc = -EIO;
+ goto out;
+ }
+
+ rc = attribute->show(capcache_orangefs_obj, attribute, buf);
+
+out:
+ return rc;
+}
+
+static ssize_t capcache_orangefs_attr_store(struct kobject *kobj,
+ struct attribute *attr,
+ const char *buf,
+ size_t len)
+{
+ struct capcache_orangefs_attribute *attribute;
+ struct capcache_orangefs_obj *capcache_orangefs_obj;
+ int rc;
+
+ gossip_debug(GOSSIP_SYSFS_DEBUG,
+ "capcache_orangefs_attr_store: start\n");
+
+ attribute =
+ container_of(attr, struct capcache_orangefs_attribute, attr);
+ capcache_orangefs_obj =
+ container_of(kobj, struct capcache_orangefs_obj, kobj);
+
+ if (!attribute->store) {
+ rc = -EIO;
+ goto out;
+ }
+
+ rc = attribute->store(capcache_orangefs_obj, attribute, buf, len);
+
+out:
+ return rc;
+}
+
+static const struct sysfs_ops capcache_orangefs_sysfs_ops = {
+ .show = capcache_orangefs_attr_show,
+ .store = capcache_orangefs_attr_store,
+};
+
+static ssize_t ccache_orangefs_attr_show(struct kobject *kobj,
+ struct attribute *attr,
+ char *buf)
+{
+ struct ccache_orangefs_attribute *attribute;
+ struct ccache_orangefs_obj *ccache_orangefs_obj;
+ int rc;
+
+ attribute =
+ container_of(attr, struct ccache_orangefs_attribute, attr);
+ ccache_orangefs_obj =
+ container_of(kobj, struct ccache_orangefs_obj, kobj);
+
+ if (!attribute->show) {
+ rc = -EIO;
+ goto out;
+ }
+
+ rc = attribute->show(ccache_orangefs_obj, attribute, buf);
+
+out:
+ return rc;
+}
+
+static ssize_t ccache_orangefs_attr_store(struct kobject *kobj,
+ struct attribute *attr,
+ const char *buf,
+ size_t len)
+{
+ struct ccache_orangefs_attribute *attribute;
+ struct ccache_orangefs_obj *ccache_orangefs_obj;
+ int rc;
+
+ gossip_debug(GOSSIP_SYSFS_DEBUG,
+ "ccache_orangefs_attr_store: start\n");
+
+ attribute =
+ container_of(attr, struct ccache_orangefs_attribute, attr);
+ ccache_orangefs_obj =
+ container_of(kobj, struct ccache_orangefs_obj, kobj);
+
+ if (!attribute->store) {
+ rc = -EIO;
+ goto out;
+ }
+
+ rc = attribute->store(ccache_orangefs_obj, attribute, buf, len);
+
+out:
+ return rc;
+}
+
+static const struct sysfs_ops ccache_orangefs_sysfs_ops = {
+ .show = ccache_orangefs_attr_show,
+ .store = ccache_orangefs_attr_store,
+};
+
+static ssize_t ncache_orangefs_attr_show(struct kobject *kobj,
+ struct attribute *attr,
+ char *buf)
+{
+ struct ncache_orangefs_attribute *attribute;
+ struct ncache_orangefs_obj *ncache_orangefs_obj;
+ int rc;
+
+ attribute = container_of(attr, struct ncache_orangefs_attribute, attr);
+ ncache_orangefs_obj =
+ container_of(kobj, struct ncache_orangefs_obj, kobj);
+
+ if (!attribute->show) {
+ rc = -EIO;
+ goto out;
+ }
+
+ rc = attribute->show(ncache_orangefs_obj, attribute, buf);
+
+out:
+ return rc;
+}
+
+static ssize_t ncache_orangefs_attr_store(struct kobject *kobj,
+ struct attribute *attr,
+ const char *buf,
+ size_t len)
+{
+ struct ncache_orangefs_attribute *attribute;
+ struct ncache_orangefs_obj *ncache_orangefs_obj;
+ int rc;
+
+ gossip_debug(GOSSIP_SYSFS_DEBUG,
+ "ncache_orangefs_attr_store: start\n");
+
+ attribute = container_of(attr, struct ncache_orangefs_attribute, attr);
+ ncache_orangefs_obj =
+ container_of(kobj, struct ncache_orangefs_obj, kobj);
+
+ if (!attribute->store) {
+ rc = -EIO;
+ goto out;
+ }
+
+ rc = attribute->store(ncache_orangefs_obj, attribute, buf, len);
+
+out:
+ return rc;
+}
+
+static const struct sysfs_ops ncache_orangefs_sysfs_ops = {
+ .show = ncache_orangefs_attr_show,
+ .store = ncache_orangefs_attr_store,
+};
+
+static ssize_t pc_orangefs_attr_show(struct kobject *kobj,
+ struct attribute *attr,
+ char *buf)
+{
+ struct pc_orangefs_attribute *attribute;
+ struct pc_orangefs_obj *pc_orangefs_obj;
+ int rc;
+
+ attribute = container_of(attr, struct pc_orangefs_attribute, attr);
+ pc_orangefs_obj =
+ container_of(kobj, struct pc_orangefs_obj, kobj);
+
+ if (!attribute->show) {
+ rc = -EIO;
+ goto out;
+ }
+
+ rc = attribute->show(pc_orangefs_obj, attribute, buf);
+
+out:
+ return rc;
+}
+
+static const struct sysfs_ops pc_orangefs_sysfs_ops = {
+ .show = pc_orangefs_attr_show,
+};
+
+static ssize_t stats_orangefs_attr_show(struct kobject *kobj,
+ struct attribute *attr,
+ char *buf)
+{
+ struct stats_orangefs_attribute *attribute;
+ struct stats_orangefs_obj *stats_orangefs_obj;
+ int rc;
+
+ attribute = container_of(attr, struct stats_orangefs_attribute, attr);
+ stats_orangefs_obj =
+ container_of(kobj, struct stats_orangefs_obj, kobj);
+
+ if (!attribute->show) {
+ rc = -EIO;
+ goto out;
+ }
+
+ rc = attribute->show(stats_orangefs_obj, attribute, buf);
+
+out:
+ return rc;
+}
+
+static const struct sysfs_ops stats_orangefs_sysfs_ops = {
+ .show = stats_orangefs_attr_show,
+};
+
+static void orangefs_release(struct kobject *kobj)
+{
+ struct orangefs_obj *orangefs_obj;
+
+ orangefs_obj = container_of(kobj, struct orangefs_obj, kobj);
+ kfree(orangefs_obj);
+}
+
+static void acache_orangefs_release(struct kobject *kobj)
+{
+ struct acache_orangefs_obj *acache_orangefs_obj;
+
+ acache_orangefs_obj =
+ container_of(kobj, struct acache_orangefs_obj, kobj);
+ kfree(acache_orangefs_obj);
+}
+
+static void capcache_orangefs_release(struct kobject *kobj)
+{
+ struct capcache_orangefs_obj *capcache_orangefs_obj;
+
+ capcache_orangefs_obj =
+ container_of(kobj, struct capcache_orangefs_obj, kobj);
+ kfree(capcache_orangefs_obj);
+}
+
+static void ccache_orangefs_release(struct kobject *kobj)
+{
+ struct ccache_orangefs_obj *ccache_orangefs_obj;
+
+ ccache_orangefs_obj =
+ container_of(kobj, struct ccache_orangefs_obj, kobj);
+ kfree(ccache_orangefs_obj);
+}
+
+static void ncache_orangefs_release(struct kobject *kobj)
+{
+ struct ncache_orangefs_obj *ncache_orangefs_obj;
+
+ ncache_orangefs_obj =
+ container_of(kobj, struct ncache_orangefs_obj, kobj);
+ kfree(ncache_orangefs_obj);
+}
+
+static void pc_orangefs_release(struct kobject *kobj)
+{
+ struct pc_orangefs_obj *pc_orangefs_obj;
+
+ pc_orangefs_obj =
+ container_of(kobj, struct pc_orangefs_obj, kobj);
+ kfree(pc_orangefs_obj);
+}
+
+static void stats_orangefs_release(struct kobject *kobj)
+{
+ struct stats_orangefs_obj *stats_orangefs_obj;
+
+ stats_orangefs_obj =
+ container_of(kobj, struct stats_orangefs_obj, kobj);
+ kfree(stats_orangefs_obj);
+}
+
+static ssize_t sysfs_int_show(char *kobj_id, char *buf, void *attr)
+{
+ int rc = -EIO;
+ struct orangefs_attribute *orangefs_attr;
+ struct stats_orangefs_attribute *stats_orangefs_attr;
+
+ gossip_debug(GOSSIP_SYSFS_DEBUG, "sysfs_int_show: id:%s:\n", kobj_id);
+
+ if (!strcmp(kobj_id, ORANGEFS_KOBJ_ID)) {
+ orangefs_attr = (struct orangefs_attribute *)attr;
+
+ if (!strcmp(orangefs_attr->attr.name, "op_timeout_secs")) {
+ rc = scnprintf(buf,
+ PAGE_SIZE,
+ "%d\n",
+ op_timeout_secs);
+ goto out;
+ } else if (!strcmp(orangefs_attr->attr.name,
+ "slot_timeout_secs")) {
+ rc = scnprintf(buf,
+ PAGE_SIZE,
+ "%d\n",
+ slot_timeout_secs);
+ goto out;
+ } else {
+ goto out;
+ }
+
+ } else if (!strcmp(kobj_id, STATS_KOBJ_ID)) {
+ stats_orangefs_attr = (struct stats_orangefs_attribute *)attr;
+
+ if (!strcmp(stats_orangefs_attr->attr.name, "reads")) {
+ rc = scnprintf(buf,
+ PAGE_SIZE,
+ "%lu\n",
+ g_orangefs_stats.reads);
+ goto out;
+ } else if (!strcmp(stats_orangefs_attr->attr.name, "writes")) {
+ rc = scnprintf(buf,
+ PAGE_SIZE,
+ "%lu\n",
+ g_orangefs_stats.writes);
+ goto out;
+ } else {
+ goto out;
+ }
+ }
+
+out:
+
+ return rc;
+}
+
+static ssize_t int_orangefs_show(struct orangefs_obj *orangefs_obj,
+ struct orangefs_attribute *attr,
+ char *buf)
+{
+ int rc;
+
+ gossip_debug(GOSSIP_SYSFS_DEBUG,
+ "int_orangefs_show:start attr->attr.name:%s:\n",
+ attr->attr.name);
+
+ rc = sysfs_int_show(ORANGEFS_KOBJ_ID, buf, (void *) attr);
+
+ return rc;
+}
+
+static ssize_t int_stats_show(struct stats_orangefs_obj *stats_orangefs_obj,
+ struct stats_orangefs_attribute *attr,
+ char *buf)
+{
+ int rc;
+
+ gossip_debug(GOSSIP_SYSFS_DEBUG,
+ "int_stats_show:start attr->attr.name:%s:\n",
+ attr->attr.name);
+
+ rc = sysfs_int_show(STATS_KOBJ_ID, buf, (void *) attr);
+
+ return rc;
+}
+
+static ssize_t int_store(struct orangefs_obj *orangefs_obj,
+ struct orangefs_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ int rc = 0;
+
+ gossip_debug(GOSSIP_SYSFS_DEBUG,
+ "int_store: start attr->attr.name:%s: buf:%s:\n",
+ attr->attr.name, buf);
+
+ if (!strcmp(attr->attr.name, "op_timeout_secs")) {
+ rc = kstrtoint(buf, 0, &op_timeout_secs);
+ goto out;
+ } else if (!strcmp(attr->attr.name, "slot_timeout_secs")) {
+ rc = kstrtoint(buf, 0, &slot_timeout_secs);
+ goto out;
+ } else {
+ goto out;
+ }
+
+out:
+ if (rc)
+ rc = -EINVAL;
+ else
+ rc = count;
+
+ return rc;
+}
+
+/*
+ * obtain attribute values from userspace with a service operation.
+ */
+static int sysfs_service_op_show(char *kobj_id, char *buf, void *attr)
+{
+ struct orangefs_kernel_op_s *new_op = NULL;
+ int rc = 0;
+ char *ser_op_type = NULL;
+ struct orangefs_attribute *orangefs_attr;
+ struct acache_orangefs_attribute *acache_attr;
+ struct capcache_orangefs_attribute *capcache_attr;
+ struct ccache_orangefs_attribute *ccache_attr;
+ struct ncache_orangefs_attribute *ncache_attr;
+ struct pc_orangefs_attribute *pc_attr;
+ __u32 op_alloc_type;
+
+ gossip_debug(GOSSIP_SYSFS_DEBUG,
+ "sysfs_service_op_show: id:%s:\n",
+ kobj_id);
+
+ if (strcmp(kobj_id, PC_KOBJ_ID))
+ op_alloc_type = ORANGEFS_VFS_OP_PARAM;
+ else
+ op_alloc_type = ORANGEFS_VFS_OP_PERF_COUNT;
+
+ new_op = op_alloc(op_alloc_type);
+ if (!new_op)
+ return -ENOMEM;
+
+ /* Can't do a service_operation if the client is not running... */
+ rc = is_daemon_in_service();
+ if (rc) {
+ pr_info("%s: Client not running :%d:\n",
+ __func__,
+ is_daemon_in_service());
+ goto out;
+ }
+
+ if (strcmp(kobj_id, PC_KOBJ_ID))
+ new_op->upcall.req.param.type = ORANGEFS_PARAM_REQUEST_GET;
+
+ if (!strcmp(kobj_id, ORANGEFS_KOBJ_ID)) {
+ orangefs_attr = (struct orangefs_attribute *)attr;
+
+ if (!strcmp(orangefs_attr->attr.name, "perf_history_size"))
+ new_op->upcall.req.param.op =
+ ORANGEFS_PARAM_REQUEST_OP_PERF_HISTORY_SIZE;
+ else if (!strcmp(orangefs_attr->attr.name,
+ "perf_time_interval_secs"))
+ new_op->upcall.req.param.op =
+ ORANGEFS_PARAM_REQUEST_OP_PERF_TIME_INTERVAL_SECS;
+ else if (!strcmp(orangefs_attr->attr.name,
+ "perf_counter_reset"))
+ new_op->upcall.req.param.op =
+ ORANGEFS_PARAM_REQUEST_OP_PERF_RESET;
+
+ } else if (!strcmp(kobj_id, ACACHE_KOBJ_ID)) {
+ acache_attr = (struct acache_orangefs_attribute *)attr;
+
+ if (!strcmp(acache_attr->attr.name, "timeout_msecs"))
+ new_op->upcall.req.param.op =
+ ORANGEFS_PARAM_REQUEST_OP_ACACHE_TIMEOUT_MSECS;
+
+ if (!strcmp(acache_attr->attr.name, "hard_limit"))
+ new_op->upcall.req.param.op =
+ ORANGEFS_PARAM_REQUEST_OP_ACACHE_HARD_LIMIT;
+
+ if (!strcmp(acache_attr->attr.name, "soft_limit"))
+ new_op->upcall.req.param.op =
+ ORANGEFS_PARAM_REQUEST_OP_ACACHE_SOFT_LIMIT;
+
+ if (!strcmp(acache_attr->attr.name, "reclaim_percentage"))
+ new_op->upcall.req.param.op =
+ ORANGEFS_PARAM_REQUEST_OP_ACACHE_RECLAIM_PERCENTAGE;
+
+ } else if (!strcmp(kobj_id, CAPCACHE_KOBJ_ID)) {
+ capcache_attr = (struct capcache_orangefs_attribute *)attr;
+
+ if (!strcmp(capcache_attr->attr.name, "timeout_secs"))
+ new_op->upcall.req.param.op =
+ ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_TIMEOUT_SECS;
+
+ if (!strcmp(capcache_attr->attr.name, "hard_limit"))
+ new_op->upcall.req.param.op =
+ ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_HARD_LIMIT;
+
+ if (!strcmp(capcache_attr->attr.name, "soft_limit"))
+ new_op->upcall.req.param.op =
+ ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_SOFT_LIMIT;
+
+ if (!strcmp(capcache_attr->attr.name, "reclaim_percentage"))
+ new_op->upcall.req.param.op =
+ ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_RECLAIM_PERCENTAGE;
+
+ } else if (!strcmp(kobj_id, CCACHE_KOBJ_ID)) {
+ ccache_attr = (struct ccache_orangefs_attribute *)attr;
+
+ if (!strcmp(ccache_attr->attr.name, "timeout_secs"))
+ new_op->upcall.req.param.op =
+ ORANGEFS_PARAM_REQUEST_OP_CCACHE_TIMEOUT_SECS;
+
+ if (!strcmp(ccache_attr->attr.name, "hard_limit"))
+ new_op->upcall.req.param.op =
+ ORANGEFS_PARAM_REQUEST_OP_CCACHE_HARD_LIMIT;
+
+ if (!strcmp(ccache_attr->attr.name, "soft_limit"))
+ new_op->upcall.req.param.op =
+ ORANGEFS_PARAM_REQUEST_OP_CCACHE_SOFT_LIMIT;
+
+ if (!strcmp(ccache_attr->attr.name, "reclaim_percentage"))
+ new_op->upcall.req.param.op =
+ ORANGEFS_PARAM_REQUEST_OP_CCACHE_RECLAIM_PERCENTAGE;
+
+ } else if (!strcmp(kobj_id, NCACHE_KOBJ_ID)) {
+ ncache_attr = (struct ncache_orangefs_attribute *)attr;
+
+ if (!strcmp(ncache_attr->attr.name, "timeout_msecs"))
+ new_op->upcall.req.param.op =
+ ORANGEFS_PARAM_REQUEST_OP_NCACHE_TIMEOUT_MSECS;
+
+ if (!strcmp(ncache_attr->attr.name, "hard_limit"))
+ new_op->upcall.req.param.op =
+ ORANGEFS_PARAM_REQUEST_OP_NCACHE_HARD_LIMIT;
+
+ if (!strcmp(ncache_attr->attr.name, "soft_limit"))
+ new_op->upcall.req.param.op =
+ ORANGEFS_PARAM_REQUEST_OP_NCACHE_SOFT_LIMIT;
+
+ if (!strcmp(ncache_attr->attr.name, "reclaim_percentage"))
+ new_op->upcall.req.param.op =
+ ORANGEFS_PARAM_REQUEST_OP_NCACHE_RECLAIM_PERCENTAGE;
+
+ } else if (!strcmp(kobj_id, PC_KOBJ_ID)) {
+ pc_attr = (struct pc_orangefs_attribute *)attr;
+
+ if (!strcmp(pc_attr->attr.name, ACACHE_KOBJ_ID))
+ new_op->upcall.req.perf_count.type =
+ ORANGEFS_PERF_COUNT_REQUEST_ACACHE;
+
+ if (!strcmp(pc_attr->attr.name, CAPCACHE_KOBJ_ID))
+ new_op->upcall.req.perf_count.type =
+ ORANGEFS_PERF_COUNT_REQUEST_CAPCACHE;
+
+ if (!strcmp(pc_attr->attr.name, NCACHE_KOBJ_ID))
+ new_op->upcall.req.perf_count.type =
+ ORANGEFS_PERF_COUNT_REQUEST_NCACHE;
+
+ } else {
+ gossip_err("sysfs_service_op_show: unknown kobj_id:%s:\n",
+ kobj_id);
+ rc = -EINVAL;
+ goto out;
+ }
+
+
+ if (strcmp(kobj_id, PC_KOBJ_ID))
+ ser_op_type = "orangefs_param";
+ else
+ ser_op_type = "orangefs_perf_count";
+
+ /*
+ * The service_operation will return an errno return code on
+ * error, and zero on success.
+ */
+ rc = service_operation(new_op, ser_op_type, ORANGEFS_OP_INTERRUPTIBLE);
+
+out:
+ if (!rc) {
+ if (strcmp(kobj_id, PC_KOBJ_ID)) {
+ rc = scnprintf(buf,
+ PAGE_SIZE,
+ "%d\n",
+ (int)new_op->downcall.resp.param.value);
+ } else {
+ rc = scnprintf(
+ buf,
+ PAGE_SIZE,
+ "%s",
+ new_op->downcall.resp.perf_count.buffer);
+ }
+ }
+
+ op_release(new_op);
+
+ return rc;
+
+}
+
+static ssize_t service_orangefs_show(struct orangefs_obj *orangefs_obj,
+ struct orangefs_attribute *attr,
+ char *buf)
+{
+ int rc = 0;
+
+ rc = sysfs_service_op_show(ORANGEFS_KOBJ_ID, buf, (void *)attr);
+
+ return rc;
+}
+
+static ssize_t
+ service_acache_show(struct acache_orangefs_obj *acache_orangefs_obj,
+ struct acache_orangefs_attribute *attr,
+ char *buf)
+{
+ int rc = 0;
+
+ rc = sysfs_service_op_show(ACACHE_KOBJ_ID, buf, (void *)attr);
+
+ return rc;
+}
+
+static ssize_t service_capcache_show(struct capcache_orangefs_obj
+ *capcache_orangefs_obj,
+ struct capcache_orangefs_attribute *attr,
+ char *buf)
+{
+ int rc = 0;
+
+ rc = sysfs_service_op_show(CAPCACHE_KOBJ_ID, buf, (void *)attr);
+
+ return rc;
+}
+
+static ssize_t service_ccache_show(struct ccache_orangefs_obj
+ *ccache_orangefs_obj,
+ struct ccache_orangefs_attribute *attr,
+ char *buf)
+{
+ int rc = 0;
+
+ rc = sysfs_service_op_show(CCACHE_KOBJ_ID, buf, (void *)attr);
+
+ return rc;
+}
+
+static ssize_t
+ service_ncache_show(struct ncache_orangefs_obj *ncache_orangefs_obj,
+ struct ncache_orangefs_attribute *attr,
+ char *buf)
+{
+ int rc = 0;
+
+ rc = sysfs_service_op_show(NCACHE_KOBJ_ID, buf, (void *)attr);
+
+ return rc;
+}
+
+static ssize_t
+ service_pc_show(struct pc_orangefs_obj *pc_orangefs_obj,
+ struct pc_orangefs_attribute *attr,
+ char *buf)
+{
+ int rc = 0;
+
+ rc = sysfs_service_op_show(PC_KOBJ_ID, buf, (void *)attr);
+
+ return rc;
+}
+
+/*
+ * pass attribute values back to userspace with a service operation.
+ *
+ * We have to do a memory allocation, an sscanf and a service operation.
+ * And we have to evaluate what the user entered, to make sure the
+ * value is within the range supported by the attribute. So, there's
+ * a lot of return code checking and mapping going on here.
+ *
+ * We want to return 1 if we think everything went OK, and
+ * EINVAL if not.
+ */
+static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
+{
+ struct orangefs_kernel_op_s *new_op = NULL;
+ int val = 0;
+ int rc = 0;
+ struct orangefs_attribute *orangefs_attr;
+ struct acache_orangefs_attribute *acache_attr;
+ struct capcache_orangefs_attribute *capcache_attr;
+ struct ccache_orangefs_attribute *ccache_attr;
+ struct ncache_orangefs_attribute *ncache_attr;
+
+ gossip_debug(GOSSIP_SYSFS_DEBUG,
+ "sysfs_service_op_store: id:%s:\n",
+ kobj_id);
+
+ new_op = op_alloc(ORANGEFS_VFS_OP_PARAM);
+ if (!new_op)
+ return -EINVAL; /* sic */
+
+ /* Can't do a service_operation if the client is not running... */
+ rc = is_daemon_in_service();
+ if (rc) {
+ pr_info("%s: Client not running :%d:\n",
+ __func__,
+ is_daemon_in_service());
+ goto out;
+ }
+
+ /*
+ * The value we want to send back to userspace is in buf.
+ */
+ rc = kstrtoint(buf, 0, &val);
+ if (rc)
+ goto out;
+
+ if (!strcmp(kobj_id, ORANGEFS_KOBJ_ID)) {
+ orangefs_attr = (struct orangefs_attribute *)attr;
+
+ if (!strcmp(orangefs_attr->attr.name, "perf_history_size")) {
+ if (val > 0) {
+ new_op->upcall.req.param.op =
+ ORANGEFS_PARAM_REQUEST_OP_PERF_HISTORY_SIZE;
+ } else {
+ rc = 0;
+ goto out;
+ }
+ } else if (!strcmp(orangefs_attr->attr.name,
+ "perf_time_interval_secs")) {
+ if (val > 0) {
+ new_op->upcall.req.param.op =
+ ORANGEFS_PARAM_REQUEST_OP_PERF_TIME_INTERVAL_SECS;
+ } else {
+ rc = 0;
+ goto out;
+ }
+ } else if (!strcmp(orangefs_attr->attr.name,
+ "perf_counter_reset")) {
+ if ((val == 0) || (val == 1)) {
+ new_op->upcall.req.param.op =
+ ORANGEFS_PARAM_REQUEST_OP_PERF_RESET;
+ } else {
+ rc = 0;
+ goto out;
+ }
+ }
+
+ } else if (!strcmp(kobj_id, ACACHE_KOBJ_ID)) {
+ acache_attr = (struct acache_orangefs_attribute *)attr;
+
+ if (!strcmp(acache_attr->attr.name, "hard_limit")) {
+ if (val > -1) {
+ new_op->upcall.req.param.op =
+ ORANGEFS_PARAM_REQUEST_OP_ACACHE_HARD_LIMIT;
+ } else {
+ rc = 0;
+ goto out;
+ }
+ } else if (!strcmp(acache_attr->attr.name, "soft_limit")) {
+ if (val > -1) {
+ new_op->upcall.req.param.op =
+ ORANGEFS_PARAM_REQUEST_OP_ACACHE_SOFT_LIMIT;
+ } else {
+ rc = 0;
+ goto out;
+ }
+ } else if (!strcmp(acache_attr->attr.name,
+ "reclaim_percentage")) {
+ if ((val > -1) && (val < 101)) {
+ new_op->upcall.req.param.op =
+ ORANGEFS_PARAM_REQUEST_OP_ACACHE_RECLAIM_PERCENTAGE;
+ } else {
+ rc = 0;
+ goto out;
+ }
+ } else if (!strcmp(acache_attr->attr.name, "timeout_msecs")) {
+ if (val > -1) {
+ new_op->upcall.req.param.op =
+ ORANGEFS_PARAM_REQUEST_OP_ACACHE_TIMEOUT_MSECS;
+ } else {
+ rc = 0;
+ goto out;
+ }
+ }
+
+ } else if (!strcmp(kobj_id, CAPCACHE_KOBJ_ID)) {
+ capcache_attr = (struct capcache_orangefs_attribute *)attr;
+
+ if (!strcmp(capcache_attr->attr.name, "hard_limit")) {
+ if (val > -1) {
+ new_op->upcall.req.param.op =
+ ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_HARD_LIMIT;
+ } else {
+ rc = 0;
+ goto out;
+ }
+ } else if (!strcmp(capcache_attr->attr.name, "soft_limit")) {
+ if (val > -1) {
+ new_op->upcall.req.param.op =
+ ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_SOFT_LIMIT;
+ } else {
+ rc = 0;
+ goto out;
+ }
+ } else if (!strcmp(capcache_attr->attr.name,
+ "reclaim_percentage")) {
+ if ((val > -1) && (val < 101)) {
+ new_op->upcall.req.param.op =
+ ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_RECLAIM_PERCENTAGE;
+ } else {
+ rc = 0;
+ goto out;
+ }
+ } else if (!strcmp(capcache_attr->attr.name, "timeout_secs")) {
+ if (val > -1) {
+ new_op->upcall.req.param.op =
+ ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_TIMEOUT_SECS;
+ } else {
+ rc = 0;
+ goto out;
+ }
+ }
+
+ } else if (!strcmp(kobj_id, CCACHE_KOBJ_ID)) {
+ ccache_attr = (struct ccache_orangefs_attribute *)attr;
+
+ if (!strcmp(ccache_attr->attr.name, "hard_limit")) {
+ if (val > -1) {
+ new_op->upcall.req.param.op =
+ ORANGEFS_PARAM_REQUEST_OP_CCACHE_HARD_LIMIT;
+ } else {
+ rc = 0;
+ goto out;
+ }
+ } else if (!strcmp(ccache_attr->attr.name, "soft_limit")) {
+ if (val > -1) {
+ new_op->upcall.req.param.op =
+ ORANGEFS_PARAM_REQUEST_OP_CCACHE_SOFT_LIMIT;
+ } else {
+ rc = 0;
+ goto out;
+ }
+ } else if (!strcmp(ccache_attr->attr.name,
+ "reclaim_percentage")) {
+ if ((val > -1) && (val < 101)) {
+ new_op->upcall.req.param.op =
+ ORANGEFS_PARAM_REQUEST_OP_CCACHE_RECLAIM_PERCENTAGE;
+ } else {
+ rc = 0;
+ goto out;
+ }
+ } else if (!strcmp(ccache_attr->attr.name, "timeout_secs")) {
+ if (val > -1) {
+ new_op->upcall.req.param.op =
+ ORANGEFS_PARAM_REQUEST_OP_CCACHE_TIMEOUT_SECS;
+ } else {
+ rc = 0;
+ goto out;
+ }
+ }
+
+ } else if (!strcmp(kobj_id, NCACHE_KOBJ_ID)) {
+ ncache_attr = (struct ncache_orangefs_attribute *)attr;
+
+ if (!strcmp(ncache_attr->attr.name, "hard_limit")) {
+ if (val > -1) {
+ new_op->upcall.req.param.op =
+ ORANGEFS_PARAM_REQUEST_OP_NCACHE_HARD_LIMIT;
+ } else {
+ rc = 0;
+ goto out;
+ }
+ } else if (!strcmp(ncache_attr->attr.name, "soft_limit")) {
+ if (val > -1) {
+ new_op->upcall.req.param.op =
+ ORANGEFS_PARAM_REQUEST_OP_NCACHE_SOFT_LIMIT;
+ } else {
+ rc = 0;
+ goto out;
+ }
+ } else if (!strcmp(ncache_attr->attr.name,
+ "reclaim_percentage")) {
+ if ((val > -1) && (val < 101)) {
+ new_op->upcall.req.param.op =
+ ORANGEFS_PARAM_REQUEST_OP_NCACHE_RECLAIM_PERCENTAGE;
+ } else {
+ rc = 0;
+ goto out;
+ }
+ } else if (!strcmp(ncache_attr->attr.name, "timeout_msecs")) {
+ if (val > -1) {
+ new_op->upcall.req.param.op =
+ ORANGEFS_PARAM_REQUEST_OP_NCACHE_TIMEOUT_MSECS;
+ } else {
+ rc = 0;
+ goto out;
+ }
+ }
+
+ } else {
+ gossip_err("sysfs_service_op_store: unknown kobj_id:%s:\n",
+ kobj_id);
+ rc = -EINVAL;
+ goto out;
+ }
+
+ new_op->upcall.req.param.type = ORANGEFS_PARAM_REQUEST_SET;
+
+ new_op->upcall.req.param.value = val;
+
+ /*
+ * The service_operation will return a errno return code on
+ * error, and zero on success.
+ */
+ rc = service_operation(new_op, "orangefs_param", ORANGEFS_OP_INTERRUPTIBLE);
+
+ if (rc < 0) {
+ gossip_err("sysfs_service_op_store: service op returned:%d:\n",
+ rc);
+ rc = 0;
+ } else {
+ rc = 1;
+ }
+
+out:
+ op_release(new_op);
+
+ if (rc == -ENOMEM || rc == 0)
+ rc = -EINVAL;
+
+ return rc;
+}
+
+static ssize_t
+ service_orangefs_store(struct orangefs_obj *orangefs_obj,
+ struct orangefs_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ int rc = 0;
+
+ rc = sysfs_service_op_store(ORANGEFS_KOBJ_ID, buf, (void *) attr);
+
+ /* rc should have an errno value if the service_op went bad. */
+ if (rc == 1)
+ rc = count;
+
+ return rc;
+}
+
+static ssize_t
+ service_acache_store(struct acache_orangefs_obj *acache_orangefs_obj,
+ struct acache_orangefs_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ int rc = 0;
+
+ rc = sysfs_service_op_store(ACACHE_KOBJ_ID, buf, (void *) attr);
+
+ /* rc should have an errno value if the service_op went bad. */
+ if (rc == 1)
+ rc = count;
+
+ return rc;
+}
+
+static ssize_t
+ service_capcache_store(struct capcache_orangefs_obj
+ *capcache_orangefs_obj,
+ struct capcache_orangefs_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ int rc = 0;
+
+ rc = sysfs_service_op_store(CAPCACHE_KOBJ_ID, buf, (void *) attr);
+
+ /* rc should have an errno value if the service_op went bad. */
+ if (rc == 1)
+ rc = count;
+
+ return rc;
+}
+
+static ssize_t service_ccache_store(struct ccache_orangefs_obj
+ *ccache_orangefs_obj,
+ struct ccache_orangefs_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ int rc = 0;
+
+ rc = sysfs_service_op_store(CCACHE_KOBJ_ID, buf, (void *) attr);
+
+ /* rc should have an errno value if the service_op went bad. */
+ if (rc == 1)
+ rc = count;
+
+ return rc;
+}
+
+static ssize_t
+ service_ncache_store(struct ncache_orangefs_obj *ncache_orangefs_obj,
+ struct ncache_orangefs_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ int rc = 0;
+
+ rc = sysfs_service_op_store(NCACHE_KOBJ_ID, buf, (void *) attr);
+
+ /* rc should have an errno value if the service_op went bad. */
+ if (rc == 1)
+ rc = count;
+
+ return rc;
+}
+
+static struct orangefs_attribute op_timeout_secs_attribute =
+ __ATTR(op_timeout_secs, 0664, int_orangefs_show, int_store);
+
+static struct orangefs_attribute slot_timeout_secs_attribute =
+ __ATTR(slot_timeout_secs, 0664, int_orangefs_show, int_store);
+
+static struct orangefs_attribute perf_counter_reset_attribute =
+ __ATTR(perf_counter_reset,
+ 0664,
+ service_orangefs_show,
+ service_orangefs_store);
+
+static struct orangefs_attribute perf_history_size_attribute =
+ __ATTR(perf_history_size,
+ 0664,
+ service_orangefs_show,
+ service_orangefs_store);
+
+static struct orangefs_attribute perf_time_interval_secs_attribute =
+ __ATTR(perf_time_interval_secs,
+ 0664,
+ service_orangefs_show,
+ service_orangefs_store);
+
+static struct attribute *orangefs_default_attrs[] = {
+ &op_timeout_secs_attribute.attr,
+ &slot_timeout_secs_attribute.attr,
+ &perf_counter_reset_attribute.attr,
+ &perf_history_size_attribute.attr,
+ &perf_time_interval_secs_attribute.attr,
+ NULL,
+};
+
+static struct kobj_type orangefs_ktype = {
+ .sysfs_ops = &orangefs_sysfs_ops,
+ .release = orangefs_release,
+ .default_attrs = orangefs_default_attrs,
+};
+
+static struct acache_orangefs_attribute acache_hard_limit_attribute =
+ __ATTR(hard_limit,
+ 0664,
+ service_acache_show,
+ service_acache_store);
+
+static struct acache_orangefs_attribute acache_reclaim_percent_attribute =
+ __ATTR(reclaim_percentage,
+ 0664,
+ service_acache_show,
+ service_acache_store);
+
+static struct acache_orangefs_attribute acache_soft_limit_attribute =
+ __ATTR(soft_limit,
+ 0664,
+ service_acache_show,
+ service_acache_store);
+
+static struct acache_orangefs_attribute acache_timeout_msecs_attribute =
+ __ATTR(timeout_msecs,
+ 0664,
+ service_acache_show,
+ service_acache_store);
+
+static struct attribute *acache_orangefs_default_attrs[] = {
+ &acache_hard_limit_attribute.attr,
+ &acache_reclaim_percent_attribute.attr,
+ &acache_soft_limit_attribute.attr,
+ &acache_timeout_msecs_attribute.attr,
+ NULL,
+};
+
+static struct kobj_type acache_orangefs_ktype = {
+ .sysfs_ops = &acache_orangefs_sysfs_ops,
+ .release = acache_orangefs_release,
+ .default_attrs = acache_orangefs_default_attrs,
+};
+
+static struct capcache_orangefs_attribute capcache_hard_limit_attribute =
+ __ATTR(hard_limit,
+ 0664,
+ service_capcache_show,
+ service_capcache_store);
+
+static struct capcache_orangefs_attribute capcache_reclaim_percent_attribute =
+ __ATTR(reclaim_percentage,
+ 0664,
+ service_capcache_show,
+ service_capcache_store);
+
+static struct capcache_orangefs_attribute capcache_soft_limit_attribute =
+ __ATTR(soft_limit,
+ 0664,
+ service_capcache_show,
+ service_capcache_store);
+
+static struct capcache_orangefs_attribute capcache_timeout_secs_attribute =
+ __ATTR(timeout_secs,
+ 0664,
+ service_capcache_show,
+ service_capcache_store);
+
+static struct attribute *capcache_orangefs_default_attrs[] = {
+ &capcache_hard_limit_attribute.attr,
+ &capcache_reclaim_percent_attribute.attr,
+ &capcache_soft_limit_attribute.attr,
+ &capcache_timeout_secs_attribute.attr,
+ NULL,
+};
+
+static struct kobj_type capcache_orangefs_ktype = {
+ .sysfs_ops = &capcache_orangefs_sysfs_ops,
+ .release = capcache_orangefs_release,
+ .default_attrs = capcache_orangefs_default_attrs,
+};
+
+static struct ccache_orangefs_attribute ccache_hard_limit_attribute =
+ __ATTR(hard_limit,
+ 0664,
+ service_ccache_show,
+ service_ccache_store);
+
+static struct ccache_orangefs_attribute ccache_reclaim_percent_attribute =
+ __ATTR(reclaim_percentage,
+ 0664,
+ service_ccache_show,
+ service_ccache_store);
+
+static struct ccache_orangefs_attribute ccache_soft_limit_attribute =
+ __ATTR(soft_limit,
+ 0664,
+ service_ccache_show,
+ service_ccache_store);
+
+static struct ccache_orangefs_attribute ccache_timeout_secs_attribute =
+ __ATTR(timeout_secs,
+ 0664,
+ service_ccache_show,
+ service_ccache_store);
+
+static struct attribute *ccache_orangefs_default_attrs[] = {
+ &ccache_hard_limit_attribute.attr,
+ &ccache_reclaim_percent_attribute.attr,
+ &ccache_soft_limit_attribute.attr,
+ &ccache_timeout_secs_attribute.attr,
+ NULL,
+};
+
+static struct kobj_type ccache_orangefs_ktype = {
+ .sysfs_ops = &ccache_orangefs_sysfs_ops,
+ .release = ccache_orangefs_release,
+ .default_attrs = ccache_orangefs_default_attrs,
+};
+
+static struct ncache_orangefs_attribute ncache_hard_limit_attribute =
+ __ATTR(hard_limit,
+ 0664,
+ service_ncache_show,
+ service_ncache_store);
+
+static struct ncache_orangefs_attribute ncache_reclaim_percent_attribute =
+ __ATTR(reclaim_percentage,
+ 0664,
+ service_ncache_show,
+ service_ncache_store);
+
+static struct ncache_orangefs_attribute ncache_soft_limit_attribute =
+ __ATTR(soft_limit,
+ 0664,
+ service_ncache_show,
+ service_ncache_store);
+
+static struct ncache_orangefs_attribute ncache_timeout_msecs_attribute =
+ __ATTR(timeout_msecs,
+ 0664,
+ service_ncache_show,
+ service_ncache_store);
+
+static struct attribute *ncache_orangefs_default_attrs[] = {
+ &ncache_hard_limit_attribute.attr,
+ &ncache_reclaim_percent_attribute.attr,
+ &ncache_soft_limit_attribute.attr,
+ &ncache_timeout_msecs_attribute.attr,
+ NULL,
+};
+
+static struct kobj_type ncache_orangefs_ktype = {
+ .sysfs_ops = &ncache_orangefs_sysfs_ops,
+ .release = ncache_orangefs_release,
+ .default_attrs = ncache_orangefs_default_attrs,
+};
+
+static struct pc_orangefs_attribute pc_acache_attribute =
+ __ATTR(acache,
+ 0664,
+ service_pc_show,
+ NULL);
+
+static struct pc_orangefs_attribute pc_capcache_attribute =
+ __ATTR(capcache,
+ 0664,
+ service_pc_show,
+ NULL);
+
+static struct pc_orangefs_attribute pc_ncache_attribute =
+ __ATTR(ncache,
+ 0664,
+ service_pc_show,
+ NULL);
+
+static struct attribute *pc_orangefs_default_attrs[] = {
+ &pc_acache_attribute.attr,
+ &pc_capcache_attribute.attr,
+ &pc_ncache_attribute.attr,
+ NULL,
+};
+
+static struct kobj_type pc_orangefs_ktype = {
+ .sysfs_ops = &pc_orangefs_sysfs_ops,
+ .release = pc_orangefs_release,
+ .default_attrs = pc_orangefs_default_attrs,
+};
+
+static struct stats_orangefs_attribute stats_reads_attribute =
+ __ATTR(reads,
+ 0664,
+ int_stats_show,
+ NULL);
+
+static struct stats_orangefs_attribute stats_writes_attribute =
+ __ATTR(writes,
+ 0664,
+ int_stats_show,
+ NULL);
+
+static struct attribute *stats_orangefs_default_attrs[] = {
+ &stats_reads_attribute.attr,
+ &stats_writes_attribute.attr,
+ NULL,
+};
+
+static struct kobj_type stats_orangefs_ktype = {
+ .sysfs_ops = &stats_orangefs_sysfs_ops,
+ .release = stats_orangefs_release,
+ .default_attrs = stats_orangefs_default_attrs,
+};
+
+static struct orangefs_obj *orangefs_obj;
+static struct acache_orangefs_obj *acache_orangefs_obj;
+static struct capcache_orangefs_obj *capcache_orangefs_obj;
+static struct ccache_orangefs_obj *ccache_orangefs_obj;
+static struct ncache_orangefs_obj *ncache_orangefs_obj;
+static struct pc_orangefs_obj *pc_orangefs_obj;
+static struct stats_orangefs_obj *stats_orangefs_obj;
+
+int orangefs_sysfs_init(void)
+{
+ int rc = -EINVAL;
+
+ gossip_debug(GOSSIP_SYSFS_DEBUG, "orangefs_sysfs_init: start\n");
+
+ /* create /sys/fs/orangefs. */
+ orangefs_obj = kzalloc(sizeof(*orangefs_obj), GFP_KERNEL);
+ if (!orangefs_obj)
+ goto out;
+
+ rc = kobject_init_and_add(&orangefs_obj->kobj,
+ &orangefs_ktype,
+ fs_kobj,
+ ORANGEFS_KOBJ_ID);
+
+ if (rc)
+ goto ofs_obj_bail;
+
+ kobject_uevent(&orangefs_obj->kobj, KOBJ_ADD);
+
+ /* create /sys/fs/orangefs/acache. */
+ acache_orangefs_obj = kzalloc(sizeof(*acache_orangefs_obj), GFP_KERNEL);
+ if (!acache_orangefs_obj) {
+ rc = -EINVAL;
+ goto ofs_obj_bail;
+ }
+
+ rc = kobject_init_and_add(&acache_orangefs_obj->kobj,
+ &acache_orangefs_ktype,
+ &orangefs_obj->kobj,
+ ACACHE_KOBJ_ID);
+
+ if (rc)
+ goto acache_obj_bail;
+
+ kobject_uevent(&acache_orangefs_obj->kobj, KOBJ_ADD);
+
+ /* create /sys/fs/orangefs/capcache. */
+ capcache_orangefs_obj =
+ kzalloc(sizeof(*capcache_orangefs_obj), GFP_KERNEL);
+ if (!capcache_orangefs_obj) {
+ rc = -EINVAL;
+ goto acache_obj_bail;
+ }
+
+ rc = kobject_init_and_add(&capcache_orangefs_obj->kobj,
+ &capcache_orangefs_ktype,
+ &orangefs_obj->kobj,
+ CAPCACHE_KOBJ_ID);
+ if (rc)
+ goto capcache_obj_bail;
+
+ kobject_uevent(&capcache_orangefs_obj->kobj, KOBJ_ADD);
+
+ /* create /sys/fs/orangefs/ccache. */
+ ccache_orangefs_obj =
+ kzalloc(sizeof(*ccache_orangefs_obj), GFP_KERNEL);
+ if (!ccache_orangefs_obj) {
+ rc = -EINVAL;
+ goto capcache_obj_bail;
+ }
+
+ rc = kobject_init_and_add(&ccache_orangefs_obj->kobj,
+ &ccache_orangefs_ktype,
+ &orangefs_obj->kobj,
+ CCACHE_KOBJ_ID);
+ if (rc)
+ goto ccache_obj_bail;
+
+ kobject_uevent(&ccache_orangefs_obj->kobj, KOBJ_ADD);
+
+ /* create /sys/fs/orangefs/ncache. */
+ ncache_orangefs_obj = kzalloc(sizeof(*ncache_orangefs_obj), GFP_KERNEL);
+ if (!ncache_orangefs_obj) {
+ rc = -EINVAL;
+ goto ccache_obj_bail;
+ }
+
+ rc = kobject_init_and_add(&ncache_orangefs_obj->kobj,
+ &ncache_orangefs_ktype,
+ &orangefs_obj->kobj,
+ NCACHE_KOBJ_ID);
+
+ if (rc)
+ goto ncache_obj_bail;
+
+ kobject_uevent(&ncache_orangefs_obj->kobj, KOBJ_ADD);
+
+ /* create /sys/fs/orangefs/perf_counters. */
+ pc_orangefs_obj = kzalloc(sizeof(*pc_orangefs_obj), GFP_KERNEL);
+ if (!pc_orangefs_obj) {
+ rc = -EINVAL;
+ goto ncache_obj_bail;
+ }
+
+ rc = kobject_init_and_add(&pc_orangefs_obj->kobj,
+ &pc_orangefs_ktype,
+ &orangefs_obj->kobj,
+ "perf_counters");
+
+ if (rc)
+ goto pc_obj_bail;
+
+ kobject_uevent(&pc_orangefs_obj->kobj, KOBJ_ADD);
+
+ /* create /sys/fs/orangefs/stats. */
+ stats_orangefs_obj = kzalloc(sizeof(*stats_orangefs_obj), GFP_KERNEL);
+ if (!stats_orangefs_obj) {
+ rc = -EINVAL;
+ goto pc_obj_bail;
+ }
+
+ rc = kobject_init_and_add(&stats_orangefs_obj->kobj,
+ &stats_orangefs_ktype,
+ &orangefs_obj->kobj,
+ STATS_KOBJ_ID);
+
+ if (rc)
+ goto stats_obj_bail;
+
+ kobject_uevent(&stats_orangefs_obj->kobj, KOBJ_ADD);
+ goto out;
+
+stats_obj_bail:
+ kobject_put(&stats_orangefs_obj->kobj);
+
+pc_obj_bail:
+ kobject_put(&pc_orangefs_obj->kobj);
+
+ncache_obj_bail:
+ kobject_put(&ncache_orangefs_obj->kobj);
+
+ccache_obj_bail:
+ kobject_put(&ccache_orangefs_obj->kobj);
+
+capcache_obj_bail:
+ kobject_put(&capcache_orangefs_obj->kobj);
+
+acache_obj_bail:
+ kobject_put(&acache_orangefs_obj->kobj);
+
+ofs_obj_bail:
+ kobject_put(&orangefs_obj->kobj);
+out:
+ return rc;
+}
+
+void orangefs_sysfs_exit(void)
+{
+ gossip_debug(GOSSIP_SYSFS_DEBUG, "orangefs_sysfs_exit: start\n");
+
+ kobject_put(&acache_orangefs_obj->kobj);
+ kobject_put(&capcache_orangefs_obj->kobj);
+ kobject_put(&ccache_orangefs_obj->kobj);
+ kobject_put(&ncache_orangefs_obj->kobj);
+ kobject_put(&pc_orangefs_obj->kobj);
+ kobject_put(&stats_orangefs_obj->kobj);
+
+ kobject_put(&orangefs_obj->kobj);
+}
diff --git a/fs/orangefs/orangefs-sysfs.h b/fs/orangefs/orangefs-sysfs.h
new file mode 100644
index 000000000000..f0b76382db02
--- /dev/null
+++ b/fs/orangefs/orangefs-sysfs.h
@@ -0,0 +1,2 @@
+extern int orangefs_sysfs_init(void);
+extern void orangefs_sysfs_exit(void);
diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c
new file mode 100644
index 000000000000..40f5163b56aa
--- /dev/null
+++ b/fs/orangefs/orangefs-utils.c
@@ -0,0 +1,1048 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+#include "protocol.h"
+#include "orangefs-kernel.h"
+#include "orangefs-dev-proto.h"
+#include "orangefs-bufmap.h"
+
+__s32 fsid_of_op(struct orangefs_kernel_op_s *op)
+{
+ __s32 fsid = ORANGEFS_FS_ID_NULL;
+
+ if (op) {
+ switch (op->upcall.type) {
+ case ORANGEFS_VFS_OP_FILE_IO:
+ fsid = op->upcall.req.io.refn.fs_id;
+ break;
+ case ORANGEFS_VFS_OP_LOOKUP:
+ fsid = op->upcall.req.lookup.parent_refn.fs_id;
+ break;
+ case ORANGEFS_VFS_OP_CREATE:
+ fsid = op->upcall.req.create.parent_refn.fs_id;
+ break;
+ case ORANGEFS_VFS_OP_GETATTR:
+ fsid = op->upcall.req.getattr.refn.fs_id;
+ break;
+ case ORANGEFS_VFS_OP_REMOVE:
+ fsid = op->upcall.req.remove.parent_refn.fs_id;
+ break;
+ case ORANGEFS_VFS_OP_MKDIR:
+ fsid = op->upcall.req.mkdir.parent_refn.fs_id;
+ break;
+ case ORANGEFS_VFS_OP_READDIR:
+ fsid = op->upcall.req.readdir.refn.fs_id;
+ break;
+ case ORANGEFS_VFS_OP_SETATTR:
+ fsid = op->upcall.req.setattr.refn.fs_id;
+ break;
+ case ORANGEFS_VFS_OP_SYMLINK:
+ fsid = op->upcall.req.sym.parent_refn.fs_id;
+ break;
+ case ORANGEFS_VFS_OP_RENAME:
+ fsid = op->upcall.req.rename.old_parent_refn.fs_id;
+ break;
+ case ORANGEFS_VFS_OP_STATFS:
+ fsid = op->upcall.req.statfs.fs_id;
+ break;
+ case ORANGEFS_VFS_OP_TRUNCATE:
+ fsid = op->upcall.req.truncate.refn.fs_id;
+ break;
+ case ORANGEFS_VFS_OP_MMAP_RA_FLUSH:
+ fsid = op->upcall.req.ra_cache_flush.refn.fs_id;
+ break;
+ case ORANGEFS_VFS_OP_FS_UMOUNT:
+ fsid = op->upcall.req.fs_umount.fs_id;
+ break;
+ case ORANGEFS_VFS_OP_GETXATTR:
+ fsid = op->upcall.req.getxattr.refn.fs_id;
+ break;
+ case ORANGEFS_VFS_OP_SETXATTR:
+ fsid = op->upcall.req.setxattr.refn.fs_id;
+ break;
+ case ORANGEFS_VFS_OP_LISTXATTR:
+ fsid = op->upcall.req.listxattr.refn.fs_id;
+ break;
+ case ORANGEFS_VFS_OP_REMOVEXATTR:
+ fsid = op->upcall.req.removexattr.refn.fs_id;
+ break;
+ case ORANGEFS_VFS_OP_FSYNC:
+ fsid = op->upcall.req.fsync.refn.fs_id;
+ break;
+ default:
+ break;
+ }
+ }
+ return fsid;
+}
+
+static int orangefs_inode_flags(struct ORANGEFS_sys_attr_s *attrs)
+{
+ int flags = 0;
+ if (attrs->flags & ORANGEFS_IMMUTABLE_FL)
+ flags |= S_IMMUTABLE;
+ else
+ flags &= ~S_IMMUTABLE;
+ if (attrs->flags & ORANGEFS_APPEND_FL)
+ flags |= S_APPEND;
+ else
+ flags &= ~S_APPEND;
+ if (attrs->flags & ORANGEFS_NOATIME_FL)
+ flags |= S_NOATIME;
+ else
+ flags &= ~S_NOATIME;
+ return flags;
+}
+
+static int orangefs_inode_perms(struct ORANGEFS_sys_attr_s *attrs)
+{
+ int perm_mode = 0;
+
+ if (attrs->perms & ORANGEFS_O_EXECUTE)
+ perm_mode |= S_IXOTH;
+ if (attrs->perms & ORANGEFS_O_WRITE)
+ perm_mode |= S_IWOTH;
+ if (attrs->perms & ORANGEFS_O_READ)
+ perm_mode |= S_IROTH;
+
+ if (attrs->perms & ORANGEFS_G_EXECUTE)
+ perm_mode |= S_IXGRP;
+ if (attrs->perms & ORANGEFS_G_WRITE)
+ perm_mode |= S_IWGRP;
+ if (attrs->perms & ORANGEFS_G_READ)
+ perm_mode |= S_IRGRP;
+
+ if (attrs->perms & ORANGEFS_U_EXECUTE)
+ perm_mode |= S_IXUSR;
+ if (attrs->perms & ORANGEFS_U_WRITE)
+ perm_mode |= S_IWUSR;
+ if (attrs->perms & ORANGEFS_U_READ)
+ perm_mode |= S_IRUSR;
+
+ if (attrs->perms & ORANGEFS_G_SGID)
+ perm_mode |= S_ISGID;
+ if (attrs->perms & ORANGEFS_U_SUID)
+ perm_mode |= S_ISUID;
+
+ return perm_mode;
+}
+
+/*
+ * NOTE: in kernel land, we never use the sys_attr->link_target for
+ * anything, so don't bother copying it into the sys_attr object here.
+ */
+static inline int copy_attributes_from_inode(struct inode *inode,
+ struct ORANGEFS_sys_attr_s *attrs,
+ struct iattr *iattr)
+{
+ umode_t tmp_mode;
+
+ if (!iattr || !inode || !attrs) {
+ gossip_err("NULL iattr (%p), inode (%p), attrs (%p) "
+ "in copy_attributes_from_inode!\n",
+ iattr,
+ inode,
+ attrs);
+ return -EINVAL;
+ }
+ /*
+ * We need to be careful to only copy the attributes out of the
+ * iattr object that we know are valid.
+ */
+ attrs->mask = 0;
+ if (iattr->ia_valid & ATTR_UID) {
+ attrs->owner = from_kuid(current_user_ns(), iattr->ia_uid);
+ attrs->mask |= ORANGEFS_ATTR_SYS_UID;
+ gossip_debug(GOSSIP_UTILS_DEBUG, "(UID) %d\n", attrs->owner);
+ }
+ if (iattr->ia_valid & ATTR_GID) {
+ attrs->group = from_kgid(current_user_ns(), iattr->ia_gid);
+ attrs->mask |= ORANGEFS_ATTR_SYS_GID;
+ gossip_debug(GOSSIP_UTILS_DEBUG, "(GID) %d\n", attrs->group);
+ }
+
+ if (iattr->ia_valid & ATTR_ATIME) {
+ attrs->mask |= ORANGEFS_ATTR_SYS_ATIME;
+ if (iattr->ia_valid & ATTR_ATIME_SET) {
+ attrs->atime = (time64_t)iattr->ia_atime.tv_sec;
+ attrs->mask |= ORANGEFS_ATTR_SYS_ATIME_SET;
+ }
+ }
+ if (iattr->ia_valid & ATTR_MTIME) {
+ attrs->mask |= ORANGEFS_ATTR_SYS_MTIME;
+ if (iattr->ia_valid & ATTR_MTIME_SET) {
+ attrs->mtime = (time64_t)iattr->ia_mtime.tv_sec;
+ attrs->mask |= ORANGEFS_ATTR_SYS_MTIME_SET;
+ }
+ }
+ if (iattr->ia_valid & ATTR_CTIME)
+ attrs->mask |= ORANGEFS_ATTR_SYS_CTIME;
+
+ /*
+ * ORANGEFS cannot set size with a setattr operation. Probably not likely
+ * to be requested through the VFS, but just in case, don't worry about
+ * ATTR_SIZE
+ */
+
+ if (iattr->ia_valid & ATTR_MODE) {
+ tmp_mode = iattr->ia_mode;
+ if (tmp_mode & (S_ISVTX)) {
+ if (is_root_handle(inode)) {
+ /*
+ * allow sticky bit to be set on root (since
+ * it shows up that way by default anyhow),
+ * but don't show it to the server
+ */
+ tmp_mode -= S_ISVTX;
+ } else {
+ gossip_debug(GOSSIP_UTILS_DEBUG,
+ "User attempted to set sticky bit on non-root directory; returning EINVAL.\n");
+ return -EINVAL;
+ }
+ }
+
+ if (tmp_mode & (S_ISUID)) {
+ gossip_debug(GOSSIP_UTILS_DEBUG,
+ "Attempting to set setuid bit (not supported); returning EINVAL.\n");
+ return -EINVAL;
+ }
+
+ attrs->perms = ORANGEFS_util_translate_mode(tmp_mode);
+ attrs->mask |= ORANGEFS_ATTR_SYS_PERM;
+ }
+
+ return 0;
+}
+
+static int orangefs_inode_type(enum orangefs_ds_type objtype)
+{
+ if (objtype == ORANGEFS_TYPE_METAFILE)
+ return S_IFREG;
+ else if (objtype == ORANGEFS_TYPE_DIRECTORY)
+ return S_IFDIR;
+ else if (objtype == ORANGEFS_TYPE_SYMLINK)
+ return S_IFLNK;
+ else
+ return -1;
+}
+
+static int orangefs_inode_is_stale(struct inode *inode, int new,
+ struct ORANGEFS_sys_attr_s *attrs, char *link_target)
+{
+ struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
+ int type = orangefs_inode_type(attrs->objtype);
+ if (!new) {
+ /*
+ * If the inode type or symlink target have changed then this
+ * inode is stale.
+ */
+ if (type == -1 || !(inode->i_mode & type)) {
+ orangefs_make_bad_inode(inode);
+ return 1;
+ }
+ if (type == S_IFLNK && strncmp(orangefs_inode->link_target,
+ link_target, ORANGEFS_NAME_MAX)) {
+ orangefs_make_bad_inode(inode);
+ return 1;
+ }
+ }
+ return 0;
+}
+
+int orangefs_inode_getattr(struct inode *inode, int new, int size)
+{
+ struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
+ struct orangefs_kernel_op_s *new_op;
+ loff_t inode_size, rounded_up_size;
+ int ret, type;
+
+ gossip_debug(GOSSIP_UTILS_DEBUG, "%s: called on inode %pU\n", __func__,
+ get_khandle_from_ino(inode));
+
+ new_op = op_alloc(ORANGEFS_VFS_OP_GETATTR);
+ if (!new_op)
+ return -ENOMEM;
+ new_op->upcall.req.getattr.refn = orangefs_inode->refn;
+ new_op->upcall.req.getattr.mask = size ?
+ ORANGEFS_ATTR_SYS_ALL_NOHINT : ORANGEFS_ATTR_SYS_ALL_NOHINT_NOSIZE;
+
+ ret = service_operation(new_op, __func__,
+ get_interruptible_flag(inode));
+ if (ret != 0)
+ goto out;
+
+ type = orangefs_inode_type(new_op->
+ downcall.resp.getattr.attributes.objtype);
+ ret = orangefs_inode_is_stale(inode, new,
+ &new_op->downcall.resp.getattr.attributes,
+ new_op->downcall.resp.getattr.link_target);
+ if (ret) {
+ ret = -ESTALE;
+ goto out;
+ }
+
+ switch (type) {
+ case S_IFREG:
+ inode->i_flags = orangefs_inode_flags(&new_op->
+ downcall.resp.getattr.attributes);
+ if (size) {
+ inode_size = (loff_t)new_op->
+ downcall.resp.getattr.attributes.size;
+ rounded_up_size =
+ (inode_size + (4096 - (inode_size % 4096)));
+ inode->i_size = inode_size;
+ orangefs_inode->blksize =
+ new_op->downcall.resp.getattr.attributes.blksize;
+ spin_lock(&inode->i_lock);
+ inode->i_bytes = inode_size;
+ inode->i_blocks =
+ (unsigned long)(rounded_up_size / 512);
+ spin_unlock(&inode->i_lock);
+ }
+ break;
+ case S_IFDIR:
+ inode->i_size = PAGE_CACHE_SIZE;
+ orangefs_inode->blksize = (1 << inode->i_blkbits);
+ spin_lock(&inode->i_lock);
+ inode_set_bytes(inode, inode->i_size);
+ spin_unlock(&inode->i_lock);
+ set_nlink(inode, 1);
+ break;
+ case S_IFLNK:
+ if (new) {
+ inode->i_size = (loff_t)strlen(new_op->
+ downcall.resp.getattr.link_target);
+ orangefs_inode->blksize = (1 << inode->i_blkbits);
+ strlcpy(orangefs_inode->link_target,
+ new_op->downcall.resp.getattr.link_target,
+ ORANGEFS_NAME_MAX);
+ inode->i_link = orangefs_inode->link_target;
+ }
+ break;
+ }
+
+ inode->i_uid = make_kuid(&init_user_ns, new_op->
+ downcall.resp.getattr.attributes.owner);
+ inode->i_gid = make_kgid(&init_user_ns, new_op->
+ downcall.resp.getattr.attributes.group);
+ inode->i_atime.tv_sec = (time64_t)new_op->
+ downcall.resp.getattr.attributes.atime;
+ inode->i_mtime.tv_sec = (time64_t)new_op->
+ downcall.resp.getattr.attributes.mtime;
+ inode->i_ctime.tv_sec = (time64_t)new_op->
+ downcall.resp.getattr.attributes.ctime;
+ inode->i_atime.tv_nsec = 0;
+ inode->i_mtime.tv_nsec = 0;
+ inode->i_ctime.tv_nsec = 0;
+
+ /* special case: mark the root inode as sticky */
+ inode->i_mode = type | (is_root_handle(inode) ? S_ISVTX : 0) |
+ orangefs_inode_perms(&new_op->downcall.resp.getattr.attributes);
+
+ ret = 0;
+out:
+ op_release(new_op);
+ return ret;
+}
+
+int orangefs_inode_check_changed(struct inode *inode)
+{
+ struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
+ struct orangefs_kernel_op_s *new_op;
+ int ret;
+
+ gossip_debug(GOSSIP_UTILS_DEBUG, "%s: called on inode %pU\n", __func__,
+ get_khandle_from_ino(inode));
+
+ new_op = op_alloc(ORANGEFS_VFS_OP_GETATTR);
+ if (!new_op)
+ return -ENOMEM;
+ new_op->upcall.req.getattr.refn = orangefs_inode->refn;
+ new_op->upcall.req.getattr.mask = ORANGEFS_ATTR_SYS_TYPE |
+ ORANGEFS_ATTR_SYS_LNK_TARGET;
+
+ ret = service_operation(new_op, __func__,
+ get_interruptible_flag(inode));
+ if (ret != 0)
+ goto out;
+
+ ret = orangefs_inode_is_stale(inode, 0,
+ &new_op->downcall.resp.getattr.attributes,
+ new_op->downcall.resp.getattr.link_target);
+out:
+ op_release(new_op);
+ return ret;
+}
+
+/*
+ * issues a orangefs setattr request to make sure the new attribute values
+ * take effect if successful. returns 0 on success; -errno otherwise
+ */
+int orangefs_inode_setattr(struct inode *inode, struct iattr *iattr)
+{
+ struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
+ struct orangefs_kernel_op_s *new_op;
+ int ret;
+
+ new_op = op_alloc(ORANGEFS_VFS_OP_SETATTR);
+ if (!new_op)
+ return -ENOMEM;
+
+ new_op->upcall.req.setattr.refn = orangefs_inode->refn;
+ ret = copy_attributes_from_inode(inode,
+ &new_op->upcall.req.setattr.attributes,
+ iattr);
+ if (ret >= 0) {
+ ret = service_operation(new_op, __func__,
+ get_interruptible_flag(inode));
+
+ gossip_debug(GOSSIP_UTILS_DEBUG,
+ "orangefs_inode_setattr: returning %d\n",
+ ret);
+ }
+
+ op_release(new_op);
+
+ /*
+ * successful setattr should clear the atime, mtime and
+ * ctime flags.
+ */
+ if (ret == 0) {
+ ClearAtimeFlag(orangefs_inode);
+ ClearMtimeFlag(orangefs_inode);
+ ClearCtimeFlag(orangefs_inode);
+ ClearModeFlag(orangefs_inode);
+ }
+
+ return ret;
+}
+
+int orangefs_flush_inode(struct inode *inode)
+{
+ /*
+ * If it is a dirty inode, this function gets called.
+ * Gather all the information that needs to be setattr'ed
+ * Right now, this will only be used for mode, atime, mtime
+ * and/or ctime.
+ */
+ struct iattr wbattr;
+ int ret;
+ int mtime_flag;
+ int ctime_flag;
+ int atime_flag;
+ int mode_flag;
+ struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
+
+ memset(&wbattr, 0, sizeof(wbattr));
+
+ /*
+ * check inode flags up front, and clear them if they are set. This
+ * will prevent multiple processes from all trying to flush the same
+ * inode if they call close() simultaneously
+ */
+ mtime_flag = MtimeFlag(orangefs_inode);
+ ClearMtimeFlag(orangefs_inode);
+ ctime_flag = CtimeFlag(orangefs_inode);
+ ClearCtimeFlag(orangefs_inode);
+ atime_flag = AtimeFlag(orangefs_inode);
+ ClearAtimeFlag(orangefs_inode);
+ mode_flag = ModeFlag(orangefs_inode);
+ ClearModeFlag(orangefs_inode);
+
+ /* -- Lazy atime,mtime and ctime update --
+ * Note: all times are dictated by server in the new scheme
+ * and not by the clients
+ *
+ * Also mode updates are being handled now..
+ */
+
+ if (mtime_flag)
+ wbattr.ia_valid |= ATTR_MTIME;
+ if (ctime_flag)
+ wbattr.ia_valid |= ATTR_CTIME;
+ if (atime_flag)
+ wbattr.ia_valid |= ATTR_ATIME;
+
+ if (mode_flag) {
+ wbattr.ia_mode = inode->i_mode;
+ wbattr.ia_valid |= ATTR_MODE;
+ }
+
+ gossip_debug(GOSSIP_UTILS_DEBUG,
+ "*********** orangefs_flush_inode: %pU "
+ "(ia_valid %d)\n",
+ get_khandle_from_ino(inode),
+ wbattr.ia_valid);
+ if (wbattr.ia_valid == 0) {
+ gossip_debug(GOSSIP_UTILS_DEBUG,
+ "orangefs_flush_inode skipping setattr()\n");
+ return 0;
+ }
+
+ gossip_debug(GOSSIP_UTILS_DEBUG,
+ "orangefs_flush_inode (%pU) writing mode %o\n",
+ get_khandle_from_ino(inode),
+ inode->i_mode);
+
+ ret = orangefs_inode_setattr(inode, &wbattr);
+
+ return ret;
+}
+
+int orangefs_unmount_sb(struct super_block *sb)
+{
+ int ret = -EINVAL;
+ struct orangefs_kernel_op_s *new_op = NULL;
+
+ gossip_debug(GOSSIP_UTILS_DEBUG,
+ "orangefs_unmount_sb called on sb %p\n",
+ sb);
+
+ new_op = op_alloc(ORANGEFS_VFS_OP_FS_UMOUNT);
+ if (!new_op)
+ return -ENOMEM;
+ new_op->upcall.req.fs_umount.id = ORANGEFS_SB(sb)->id;
+ new_op->upcall.req.fs_umount.fs_id = ORANGEFS_SB(sb)->fs_id;
+ strncpy(new_op->upcall.req.fs_umount.orangefs_config_server,
+ ORANGEFS_SB(sb)->devname,
+ ORANGEFS_MAX_SERVER_ADDR_LEN);
+
+ gossip_debug(GOSSIP_UTILS_DEBUG,
+ "Attempting ORANGEFS Unmount via host %s\n",
+ new_op->upcall.req.fs_umount.orangefs_config_server);
+
+ ret = service_operation(new_op, "orangefs_fs_umount", 0);
+
+ gossip_debug(GOSSIP_UTILS_DEBUG,
+ "orangefs_unmount: got return value of %d\n", ret);
+ if (ret)
+ sb = ERR_PTR(ret);
+ else
+ ORANGEFS_SB(sb)->mount_pending = 1;
+
+ op_release(new_op);
+ return ret;
+}
+
+void orangefs_make_bad_inode(struct inode *inode)
+{
+ if (is_root_handle(inode)) {
+ /*
+ * if this occurs, the pvfs2-client-core was killed but we
+ * can't afford to lose the inode operations and such
+ * associated with the root handle in any case.
+ */
+ gossip_debug(GOSSIP_UTILS_DEBUG,
+ "*** NOT making bad root inode %pU\n",
+ get_khandle_from_ino(inode));
+ } else {
+ gossip_debug(GOSSIP_UTILS_DEBUG,
+ "*** making bad inode %pU\n",
+ get_khandle_from_ino(inode));
+ make_bad_inode(inode);
+ }
+}
+
+/*
+ * The following is a very dirty hack that is now a permanent part of the
+ * ORANGEFS protocol. See protocol.h for more error definitions.
+ */
+
+/* The order matches include/orangefs-types.h in the OrangeFS source. */
+static int PINT_errno_mapping[] = {
+ 0, EPERM, ENOENT, EINTR, EIO, ENXIO, EBADF, EAGAIN, ENOMEM,
+ EFAULT, EBUSY, EEXIST, ENODEV, ENOTDIR, EISDIR, EINVAL, EMFILE,
+ EFBIG, ENOSPC, EROFS, EMLINK, EPIPE, EDEADLK, ENAMETOOLONG,
+ ENOLCK, ENOSYS, ENOTEMPTY, ELOOP, EWOULDBLOCK, ENOMSG, EUNATCH,
+ EBADR, EDEADLOCK, ENODATA, ETIME, ENONET, EREMOTE, ECOMM,
+ EPROTO, EBADMSG, EOVERFLOW, ERESTART, EMSGSIZE, EPROTOTYPE,
+ ENOPROTOOPT, EPROTONOSUPPORT, EOPNOTSUPP, EADDRINUSE,
+ EADDRNOTAVAIL, ENETDOWN, ENETUNREACH, ENETRESET, ENOBUFS,
+ ETIMEDOUT, ECONNREFUSED, EHOSTDOWN, EHOSTUNREACH, EALREADY,
+ EACCES, ECONNRESET, ERANGE
+};
+
+int orangefs_normalize_to_errno(__s32 error_code)
+{
+ __u32 i;
+
+ /* Success */
+ if (error_code == 0) {
+ return 0;
+ /*
+ * This shouldn't ever happen. If it does it should be fixed on the
+ * server.
+ */
+ } else if (error_code > 0) {
+ gossip_err("orangefs: error status receieved.\n");
+ gossip_err("orangefs: assuming error code is inverted.\n");
+ error_code = -error_code;
+ }
+
+ /*
+ * XXX: This is very bad since error codes from ORANGEFS may not be
+ * suitable for return into userspace.
+ */
+
+ /*
+ * Convert ORANGEFS error values into errno values suitable for return
+ * from the kernel.
+ */
+ if ((-error_code) & ORANGEFS_NON_ERRNO_ERROR_BIT) {
+ if (((-error_code) &
+ (ORANGEFS_ERROR_NUMBER_BITS|ORANGEFS_NON_ERRNO_ERROR_BIT|
+ ORANGEFS_ERROR_BIT)) == ORANGEFS_ECANCEL) {
+ /*
+ * cancellation error codes generally correspond to
+ * a timeout from the client's perspective
+ */
+ error_code = -ETIMEDOUT;
+ } else {
+ /* assume a default error code */
+ gossip_err("orangefs: warning: got error code without errno equivalent: %d.\n", error_code);
+ error_code = -EINVAL;
+ }
+
+ /* Convert ORANGEFS encoded errno values into regular errno values. */
+ } else if ((-error_code) & ORANGEFS_ERROR_BIT) {
+ i = (-error_code) & ~(ORANGEFS_ERROR_BIT|ORANGEFS_ERROR_CLASS_BITS);
+ if (i < sizeof(PINT_errno_mapping)/sizeof(*PINT_errno_mapping))
+ error_code = -PINT_errno_mapping[i];
+ else
+ error_code = -EINVAL;
+
+ /*
+ * Only ORANGEFS protocol error codes should ever come here. Otherwise
+ * there is a bug somewhere.
+ */
+ } else {
+ gossip_err("orangefs: orangefs_normalize_to_errno: got error code which is not from ORANGEFS.\n");
+ }
+ return error_code;
+}
+
+#define NUM_MODES 11
+__s32 ORANGEFS_util_translate_mode(int mode)
+{
+ int ret = 0;
+ int i = 0;
+ static int modes[NUM_MODES] = {
+ S_IXOTH, S_IWOTH, S_IROTH,
+ S_IXGRP, S_IWGRP, S_IRGRP,
+ S_IXUSR, S_IWUSR, S_IRUSR,
+ S_ISGID, S_ISUID
+ };
+ static int orangefs_modes[NUM_MODES] = {
+ ORANGEFS_O_EXECUTE, ORANGEFS_O_WRITE, ORANGEFS_O_READ,
+ ORANGEFS_G_EXECUTE, ORANGEFS_G_WRITE, ORANGEFS_G_READ,
+ ORANGEFS_U_EXECUTE, ORANGEFS_U_WRITE, ORANGEFS_U_READ,
+ ORANGEFS_G_SGID, ORANGEFS_U_SUID
+ };
+
+ for (i = 0; i < NUM_MODES; i++)
+ if (mode & modes[i])
+ ret |= orangefs_modes[i];
+
+ return ret;
+}
+#undef NUM_MODES
+
+/*
+ * After obtaining a string representation of the client's debug
+ * keywords and their associated masks, this function is called to build an
+ * array of these values.
+ */
+int orangefs_prepare_cdm_array(char *debug_array_string)
+{
+ int i;
+ int rc = -EINVAL;
+ char *cds_head = NULL;
+ char *cds_delimiter = NULL;
+ int keyword_len = 0;
+
+ gossip_debug(GOSSIP_UTILS_DEBUG, "%s: start\n", __func__);
+
+ /*
+ * figure out how many elements the cdm_array needs.
+ */
+ for (i = 0; i < strlen(debug_array_string); i++)
+ if (debug_array_string[i] == '\n')
+ cdm_element_count++;
+
+ if (!cdm_element_count) {
+ pr_info("No elements in client debug array string!\n");
+ goto out;
+ }
+
+ cdm_array =
+ kzalloc(cdm_element_count * sizeof(struct client_debug_mask),
+ GFP_KERNEL);
+ if (!cdm_array) {
+ pr_info("malloc failed for cdm_array!\n");
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ cds_head = debug_array_string;
+
+ for (i = 0; i < cdm_element_count; i++) {
+ cds_delimiter = strchr(cds_head, '\n');
+ *cds_delimiter = '\0';
+
+ keyword_len = strcspn(cds_head, " ");
+
+ cdm_array[i].keyword = kzalloc(keyword_len + 1, GFP_KERNEL);
+ if (!cdm_array[i].keyword) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ sscanf(cds_head,
+ "%s %llx %llx",
+ cdm_array[i].keyword,
+ (unsigned long long *)&(cdm_array[i].mask1),
+ (unsigned long long *)&(cdm_array[i].mask2));
+
+ if (!strcmp(cdm_array[i].keyword, ORANGEFS_VERBOSE))
+ client_verbose_index = i;
+
+ if (!strcmp(cdm_array[i].keyword, ORANGEFS_ALL))
+ client_all_index = i;
+
+ cds_head = cds_delimiter + 1;
+ }
+
+ rc = cdm_element_count;
+
+ gossip_debug(GOSSIP_UTILS_DEBUG, "%s: rc:%d:\n", __func__, rc);
+
+out:
+
+ return rc;
+
+}
+
+/*
+ * /sys/kernel/debug/orangefs/debug-help can be catted to
+ * see all the available kernel and client debug keywords.
+ *
+ * When the kernel boots, we have no idea what keywords the
+ * client supports, nor their associated masks.
+ *
+ * We pass through this function once at boot and stamp a
+ * boilerplate "we don't know" message for the client in the
+ * debug-help file. We pass through here again when the client
+ * starts and then we can fill out the debug-help file fully.
+ *
+ * The client might be restarted any number of times between
+ * reboots, we only build the debug-help file the first time.
+ */
+int orangefs_prepare_debugfs_help_string(int at_boot)
+{
+ int rc = -EINVAL;
+ int i;
+ int byte_count = 0;
+ char *client_title = "Client Debug Keywords:\n";
+ char *kernel_title = "Kernel Debug Keywords:\n";
+
+ gossip_debug(GOSSIP_UTILS_DEBUG, "%s: start\n", __func__);
+
+ if (at_boot) {
+ byte_count += strlen(HELP_STRING_UNINITIALIZED);
+ client_title = HELP_STRING_UNINITIALIZED;
+ } else {
+ /*
+ * fill the client keyword/mask array and remember
+ * how many elements there were.
+ */
+ cdm_element_count =
+ orangefs_prepare_cdm_array(client_debug_array_string);
+ if (cdm_element_count <= 0)
+ goto out;
+
+ /* Count the bytes destined for debug_help_string. */
+ byte_count += strlen(client_title);
+
+ for (i = 0; i < cdm_element_count; i++) {
+ byte_count += strlen(cdm_array[i].keyword + 2);
+ if (byte_count >= DEBUG_HELP_STRING_SIZE) {
+ pr_info("%s: overflow 1!\n", __func__);
+ goto out;
+ }
+ }
+
+ gossip_debug(GOSSIP_UTILS_DEBUG,
+ "%s: cdm_element_count:%d:\n",
+ __func__,
+ cdm_element_count);
+ }
+
+ byte_count += strlen(kernel_title);
+ for (i = 0; i < num_kmod_keyword_mask_map; i++) {
+ byte_count +=
+ strlen(s_kmod_keyword_mask_map[i].keyword + 2);
+ if (byte_count >= DEBUG_HELP_STRING_SIZE) {
+ pr_info("%s: overflow 2!\n", __func__);
+ goto out;
+ }
+ }
+
+ /* build debug_help_string. */
+ debug_help_string = kzalloc(DEBUG_HELP_STRING_SIZE, GFP_KERNEL);
+ if (!debug_help_string) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ strcat(debug_help_string, client_title);
+
+ if (!at_boot) {
+ for (i = 0; i < cdm_element_count; i++) {
+ strcat(debug_help_string, "\t");
+ strcat(debug_help_string, cdm_array[i].keyword);
+ strcat(debug_help_string, "\n");
+ }
+ }
+
+ strcat(debug_help_string, "\n");
+ strcat(debug_help_string, kernel_title);
+
+ for (i = 0; i < num_kmod_keyword_mask_map; i++) {
+ strcat(debug_help_string, "\t");
+ strcat(debug_help_string, s_kmod_keyword_mask_map[i].keyword);
+ strcat(debug_help_string, "\n");
+ }
+
+ rc = 0;
+
+out:
+
+ return rc;
+
+}
+
+/*
+ * kernel = type 0
+ * client = type 1
+ */
+void debug_mask_to_string(void *mask, int type)
+{
+ int i;
+ int len = 0;
+ char *debug_string;
+ int element_count = 0;
+
+ gossip_debug(GOSSIP_UTILS_DEBUG, "%s: start\n", __func__);
+
+ if (type) {
+ debug_string = client_debug_string;
+ element_count = cdm_element_count;
+ } else {
+ debug_string = kernel_debug_string;
+ element_count = num_kmod_keyword_mask_map;
+ }
+
+ memset(debug_string, 0, ORANGEFS_MAX_DEBUG_STRING_LEN);
+
+ /*
+ * Some keywords, like "all" or "verbose", are amalgams of
+ * numerous other keywords. Make a special check for those
+ * before grinding through the whole mask only to find out
+ * later...
+ */
+ if (check_amalgam_keyword(mask, type))
+ goto out;
+
+ /* Build the debug string. */
+ for (i = 0; i < element_count; i++)
+ if (type)
+ do_c_string(mask, i);
+ else
+ do_k_string(mask, i);
+
+ len = strlen(debug_string);
+
+ if ((len) && (type))
+ client_debug_string[len - 1] = '\0';
+ else if (len)
+ kernel_debug_string[len - 1] = '\0';
+ else if (type)
+ strcpy(client_debug_string, "none");
+ else
+ strcpy(kernel_debug_string, "none");
+
+out:
+gossip_debug(GOSSIP_UTILS_DEBUG, "%s: string:%s:\n", __func__, debug_string);
+
+ return;
+
+}
+
+void do_k_string(void *k_mask, int index)
+{
+ __u64 *mask = (__u64 *) k_mask;
+
+ if (keyword_is_amalgam((char *) s_kmod_keyword_mask_map[index].keyword))
+ goto out;
+
+ if (*mask & s_kmod_keyword_mask_map[index].mask_val) {
+ if ((strlen(kernel_debug_string) +
+ strlen(s_kmod_keyword_mask_map[index].keyword))
+ < ORANGEFS_MAX_DEBUG_STRING_LEN - 1) {
+ strcat(kernel_debug_string,
+ s_kmod_keyword_mask_map[index].keyword);
+ strcat(kernel_debug_string, ",");
+ } else {
+ gossip_err("%s: overflow!\n", __func__);
+ strcpy(kernel_debug_string, ORANGEFS_ALL);
+ goto out;
+ }
+ }
+
+out:
+
+ return;
+}
+
+void do_c_string(void *c_mask, int index)
+{
+ struct client_debug_mask *mask = (struct client_debug_mask *) c_mask;
+
+ if (keyword_is_amalgam(cdm_array[index].keyword))
+ goto out;
+
+ if ((mask->mask1 & cdm_array[index].mask1) ||
+ (mask->mask2 & cdm_array[index].mask2)) {
+ if ((strlen(client_debug_string) +
+ strlen(cdm_array[index].keyword) + 1)
+ < ORANGEFS_MAX_DEBUG_STRING_LEN - 2) {
+ strcat(client_debug_string,
+ cdm_array[index].keyword);
+ strcat(client_debug_string, ",");
+ } else {
+ gossip_err("%s: overflow!\n", __func__);
+ strcpy(client_debug_string, ORANGEFS_ALL);
+ goto out;
+ }
+ }
+out:
+ return;
+}
+
+int keyword_is_amalgam(char *keyword)
+{
+ int rc = 0;
+
+ if ((!strcmp(keyword, ORANGEFS_ALL)) || (!strcmp(keyword, ORANGEFS_VERBOSE)))
+ rc = 1;
+
+ return rc;
+}
+
+/*
+ * kernel = type 0
+ * client = type 1
+ *
+ * return 1 if we found an amalgam.
+ */
+int check_amalgam_keyword(void *mask, int type)
+{
+ __u64 *k_mask;
+ struct client_debug_mask *c_mask;
+ int k_all_index = num_kmod_keyword_mask_map - 1;
+ int rc = 0;
+
+ if (type) {
+ c_mask = (struct client_debug_mask *) mask;
+
+ if ((c_mask->mask1 == cdm_array[client_all_index].mask1) &&
+ (c_mask->mask2 == cdm_array[client_all_index].mask2)) {
+ strcpy(client_debug_string, ORANGEFS_ALL);
+ rc = 1;
+ goto out;
+ }
+
+ if ((c_mask->mask1 == cdm_array[client_verbose_index].mask1) &&
+ (c_mask->mask2 == cdm_array[client_verbose_index].mask2)) {
+ strcpy(client_debug_string, ORANGEFS_VERBOSE);
+ rc = 1;
+ goto out;
+ }
+
+ } else {
+ k_mask = (__u64 *) mask;
+
+ if (*k_mask >= s_kmod_keyword_mask_map[k_all_index].mask_val) {
+ strcpy(kernel_debug_string, ORANGEFS_ALL);
+ rc = 1;
+ goto out;
+ }
+ }
+
+out:
+
+ return rc;
+}
+
+/*
+ * kernel = type 0
+ * client = type 1
+ */
+void debug_string_to_mask(char *debug_string, void *mask, int type)
+{
+ char *unchecked_keyword;
+ int i;
+ char *strsep_fodder = kstrdup(debug_string, GFP_KERNEL);
+ char *original_pointer;
+ int element_count = 0;
+ struct client_debug_mask *c_mask;
+ __u64 *k_mask;
+
+ gossip_debug(GOSSIP_UTILS_DEBUG, "%s: start\n", __func__);
+
+ if (type) {
+ c_mask = (struct client_debug_mask *)mask;
+ element_count = cdm_element_count;
+ } else {
+ k_mask = (__u64 *)mask;
+ *k_mask = 0;
+ element_count = num_kmod_keyword_mask_map;
+ }
+
+ original_pointer = strsep_fodder;
+ while ((unchecked_keyword = strsep(&strsep_fodder, ",")))
+ if (strlen(unchecked_keyword)) {
+ for (i = 0; i < element_count; i++)
+ if (type)
+ do_c_mask(i,
+ unchecked_keyword,
+ &c_mask);
+ else
+ do_k_mask(i,
+ unchecked_keyword,
+ &k_mask);
+ }
+
+ kfree(original_pointer);
+}
+
+void do_c_mask(int i,
+ char *unchecked_keyword,
+ struct client_debug_mask **sane_mask)
+{
+
+ if (!strcmp(cdm_array[i].keyword, unchecked_keyword)) {
+ (**sane_mask).mask1 = (**sane_mask).mask1 | cdm_array[i].mask1;
+ (**sane_mask).mask2 = (**sane_mask).mask2 | cdm_array[i].mask2;
+ }
+}
+
+void do_k_mask(int i, char *unchecked_keyword, __u64 **sane_mask)
+{
+
+ if (!strcmp(s_kmod_keyword_mask_map[i].keyword, unchecked_keyword))
+ **sane_mask = (**sane_mask) |
+ s_kmod_keyword_mask_map[i].mask_val;
+}
diff --git a/fs/orangefs/protocol.h b/fs/orangefs/protocol.h
new file mode 100644
index 000000000000..50578a28bd9e
--- /dev/null
+++ b/fs/orangefs/protocol.h
@@ -0,0 +1,452 @@
+#include <linux/types.h>
+#include <linux/spinlock_types.h>
+#include <linux/slab.h>
+#include <linux/ioctl.h>
+
+extern struct client_debug_mask *cdm_array;
+extern char *debug_help_string;
+extern int help_string_initialized;
+extern struct dentry *debug_dir;
+extern struct dentry *help_file_dentry;
+extern struct dentry *client_debug_dentry;
+extern const struct file_operations debug_help_fops;
+extern int client_all_index;
+extern int client_verbose_index;
+extern int cdm_element_count;
+#define DEBUG_HELP_STRING_SIZE 4096
+#define HELP_STRING_UNINITIALIZED \
+ "Client Debug Keywords are unknown until the first time\n" \
+ "the client is started after boot.\n"
+#define ORANGEFS_KMOD_DEBUG_HELP_FILE "debug-help"
+#define ORANGEFS_KMOD_DEBUG_FILE "kernel-debug"
+#define ORANGEFS_CLIENT_DEBUG_FILE "client-debug"
+#define ORANGEFS_VERBOSE "verbose"
+#define ORANGEFS_ALL "all"
+
+/* pvfs2-config.h ***********************************************************/
+#define ORANGEFS_VERSION_MAJOR 2
+#define ORANGEFS_VERSION_MINOR 9
+#define ORANGEFS_VERSION_SUB 0
+
+/* khandle stuff ***********************************************************/
+
+/*
+ * The 2.9 core will put 64 bit handles in here like this:
+ * 1234 0000 0000 5678
+ * The 3.0 and beyond cores will put 128 bit handles in here like this:
+ * 1234 5678 90AB CDEF
+ * The kernel module will always use the first four bytes and
+ * the last four bytes as an inum.
+ */
+struct orangefs_khandle {
+ unsigned char u[16];
+} __aligned(8);
+
+/*
+ * kernel version of an object ref.
+ */
+struct orangefs_object_kref {
+ struct orangefs_khandle khandle;
+ __s32 fs_id;
+ __s32 __pad1;
+};
+
+/*
+ * compare 2 khandles assumes little endian thus from large address to
+ * small address
+ */
+static inline int ORANGEFS_khandle_cmp(const struct orangefs_khandle *kh1,
+ const struct orangefs_khandle *kh2)
+{
+ int i;
+
+ for (i = 15; i >= 0; i--) {
+ if (kh1->u[i] > kh2->u[i])
+ return 1;
+ if (kh1->u[i] < kh2->u[i])
+ return -1;
+ }
+
+ return 0;
+}
+
+static inline void ORANGEFS_khandle_to(const struct orangefs_khandle *kh,
+ void *p, int size)
+{
+
+ memset(p, 0, size);
+ memcpy(p, kh->u, 16);
+
+}
+
+static inline void ORANGEFS_khandle_from(struct orangefs_khandle *kh,
+ void *p, int size)
+{
+ memset(kh, 0, 16);
+ memcpy(kh->u, p, 16);
+
+}
+
+/* pvfs2-types.h ************************************************************/
+typedef __u32 ORANGEFS_uid;
+typedef __u32 ORANGEFS_gid;
+typedef __s32 ORANGEFS_fs_id;
+typedef __u32 ORANGEFS_permissions;
+typedef __u64 ORANGEFS_time;
+typedef __s64 ORANGEFS_size;
+typedef __u64 ORANGEFS_flags;
+typedef __u64 ORANGEFS_ds_position;
+typedef __s32 ORANGEFS_error;
+typedef __s64 ORANGEFS_offset;
+
+#define ORANGEFS_SUPER_MAGIC 0x20030528
+
+/*
+ * ORANGEFS error codes are a signed 32-bit integer. Error codes are negative, but
+ * the sign is stripped before decoding.
+ */
+
+/* Bit 31 is not used since it is the sign. */
+
+/*
+ * Bit 30 specifies that this is a ORANGEFS error. A ORANGEFS error is either an
+ * encoded errno value or a ORANGEFS protocol error.
+ */
+#define ORANGEFS_ERROR_BIT (1 << 30)
+
+/*
+ * Bit 29 specifies that this is a ORANGEFS protocol error and not an encoded
+ * errno value.
+ */
+#define ORANGEFS_NON_ERRNO_ERROR_BIT (1 << 29)
+
+/*
+ * Bits 9, 8, and 7 specify the error class, which encodes the section of
+ * server code the error originated in for logging purposes. It is not used
+ * in the kernel except to be masked out.
+ */
+#define ORANGEFS_ERROR_CLASS_BITS 0x380
+
+/* Bits 6 - 0 are reserved for the actual error code. */
+#define ORANGEFS_ERROR_NUMBER_BITS 0x7f
+
+/* Encoded errno values decoded by PINT_errno_mapping in orangefs-utils.c. */
+
+/* Our own ORANGEFS protocol error codes. */
+#define ORANGEFS_ECANCEL (1|ORANGEFS_NON_ERRNO_ERROR_BIT|ORANGEFS_ERROR_BIT)
+#define ORANGEFS_EDEVINIT (2|ORANGEFS_NON_ERRNO_ERROR_BIT|ORANGEFS_ERROR_BIT)
+#define ORANGEFS_EDETAIL (3|ORANGEFS_NON_ERRNO_ERROR_BIT|ORANGEFS_ERROR_BIT)
+#define ORANGEFS_EHOSTNTFD (4|ORANGEFS_NON_ERRNO_ERROR_BIT|ORANGEFS_ERROR_BIT)
+#define ORANGEFS_EADDRNTFD (5|ORANGEFS_NON_ERRNO_ERROR_BIT|ORANGEFS_ERROR_BIT)
+#define ORANGEFS_ENORECVR (6|ORANGEFS_NON_ERRNO_ERROR_BIT|ORANGEFS_ERROR_BIT)
+#define ORANGEFS_ETRYAGAIN (7|ORANGEFS_NON_ERRNO_ERROR_BIT|ORANGEFS_ERROR_BIT)
+#define ORANGEFS_ENOTPVFS (8|ORANGEFS_NON_ERRNO_ERROR_BIT|ORANGEFS_ERROR_BIT)
+#define ORANGEFS_ESECURITY (9|ORANGEFS_NON_ERRNO_ERROR_BIT|ORANGEFS_ERROR_BIT)
+
+/* permission bits */
+#define ORANGEFS_O_EXECUTE (1 << 0)
+#define ORANGEFS_O_WRITE (1 << 1)
+#define ORANGEFS_O_READ (1 << 2)
+#define ORANGEFS_G_EXECUTE (1 << 3)
+#define ORANGEFS_G_WRITE (1 << 4)
+#define ORANGEFS_G_READ (1 << 5)
+#define ORANGEFS_U_EXECUTE (1 << 6)
+#define ORANGEFS_U_WRITE (1 << 7)
+#define ORANGEFS_U_READ (1 << 8)
+/* no ORANGEFS_U_VTX (sticky bit) */
+#define ORANGEFS_G_SGID (1 << 10)
+#define ORANGEFS_U_SUID (1 << 11)
+
+/* definition taken from stdint.h */
+#define INT32_MAX (2147483647)
+#define ORANGEFS_ITERATE_START (INT32_MAX - 1)
+#define ORANGEFS_ITERATE_END (INT32_MAX - 2)
+#define ORANGEFS_ITERATE_NEXT (INT32_MAX - 3)
+#define ORANGEFS_READDIR_START ORANGEFS_ITERATE_START
+#define ORANGEFS_READDIR_END ORANGEFS_ITERATE_END
+#define ORANGEFS_IMMUTABLE_FL FS_IMMUTABLE_FL
+#define ORANGEFS_APPEND_FL FS_APPEND_FL
+#define ORANGEFS_NOATIME_FL FS_NOATIME_FL
+#define ORANGEFS_MIRROR_FL 0x01000000ULL
+#define ORANGEFS_O_EXECUTE (1 << 0)
+#define ORANGEFS_FS_ID_NULL ((__s32)0)
+#define ORANGEFS_ATTR_SYS_UID (1 << 0)
+#define ORANGEFS_ATTR_SYS_GID (1 << 1)
+#define ORANGEFS_ATTR_SYS_PERM (1 << 2)
+#define ORANGEFS_ATTR_SYS_ATIME (1 << 3)
+#define ORANGEFS_ATTR_SYS_CTIME (1 << 4)
+#define ORANGEFS_ATTR_SYS_MTIME (1 << 5)
+#define ORANGEFS_ATTR_SYS_TYPE (1 << 6)
+#define ORANGEFS_ATTR_SYS_ATIME_SET (1 << 7)
+#define ORANGEFS_ATTR_SYS_MTIME_SET (1 << 8)
+#define ORANGEFS_ATTR_SYS_SIZE (1 << 20)
+#define ORANGEFS_ATTR_SYS_LNK_TARGET (1 << 24)
+#define ORANGEFS_ATTR_SYS_DFILE_COUNT (1 << 25)
+#define ORANGEFS_ATTR_SYS_DIRENT_COUNT (1 << 26)
+#define ORANGEFS_ATTR_SYS_BLKSIZE (1 << 28)
+#define ORANGEFS_ATTR_SYS_MIRROR_COPIES_COUNT (1 << 29)
+#define ORANGEFS_ATTR_SYS_COMMON_ALL \
+ (ORANGEFS_ATTR_SYS_UID | \
+ ORANGEFS_ATTR_SYS_GID | \
+ ORANGEFS_ATTR_SYS_PERM | \
+ ORANGEFS_ATTR_SYS_ATIME | \
+ ORANGEFS_ATTR_SYS_CTIME | \
+ ORANGEFS_ATTR_SYS_MTIME | \
+ ORANGEFS_ATTR_SYS_TYPE)
+
+#define ORANGEFS_ATTR_SYS_ALL_SETABLE \
+(ORANGEFS_ATTR_SYS_COMMON_ALL-ORANGEFS_ATTR_SYS_TYPE)
+
+#define ORANGEFS_ATTR_SYS_ALL_NOHINT \
+ (ORANGEFS_ATTR_SYS_COMMON_ALL | \
+ ORANGEFS_ATTR_SYS_SIZE | \
+ ORANGEFS_ATTR_SYS_LNK_TARGET | \
+ ORANGEFS_ATTR_SYS_DFILE_COUNT | \
+ ORANGEFS_ATTR_SYS_MIRROR_COPIES_COUNT | \
+ ORANGEFS_ATTR_SYS_DIRENT_COUNT | \
+ ORANGEFS_ATTR_SYS_BLKSIZE)
+
+#define ORANGEFS_ATTR_SYS_ALL_NOHINT_NOSIZE \
+ (ORANGEFS_ATTR_SYS_COMMON_ALL | \
+ ORANGEFS_ATTR_SYS_LNK_TARGET | \
+ ORANGEFS_ATTR_SYS_DFILE_COUNT | \
+ ORANGEFS_ATTR_SYS_MIRROR_COPIES_COUNT | \
+ ORANGEFS_ATTR_SYS_DIRENT_COUNT | \
+ ORANGEFS_ATTR_SYS_BLKSIZE)
+
+#define ORANGEFS_XATTR_REPLACE 0x2
+#define ORANGEFS_XATTR_CREATE 0x1
+#define ORANGEFS_MAX_SERVER_ADDR_LEN 256
+#define ORANGEFS_NAME_MAX 256
+/*
+ * max extended attribute name len as imposed by the VFS and exploited for the
+ * upcall request types.
+ * NOTE: Please retain them as multiples of 8 even if you wish to change them
+ * This is *NECESSARY* for supporting 32 bit user-space binaries on a 64-bit
+ * kernel. Due to implementation within DBPF, this really needs to be
+ * ORANGEFS_NAME_MAX, which it was the same value as, but no reason to let it
+ * break if that changes in the future.
+ */
+#define ORANGEFS_MAX_XATTR_NAMELEN ORANGEFS_NAME_MAX /* Not the same as
+ * XATTR_NAME_MAX defined
+ * by <linux/xattr.h>
+ */
+#define ORANGEFS_MAX_XATTR_VALUELEN 8192 /* Not the same as XATTR_SIZE_MAX
+ * defined by <linux/xattr.h>
+ */
+#define ORANGEFS_MAX_XATTR_LISTLEN 16 /* Not the same as XATTR_LIST_MAX
+ * defined by <linux/xattr.h>
+ */
+/*
+ * ORANGEFS I/O operation types, used in both system and server interfaces.
+ */
+enum ORANGEFS_io_type {
+ ORANGEFS_IO_READ = 1,
+ ORANGEFS_IO_WRITE = 2
+};
+
+/*
+ * If this enum is modified the server parameters related to the precreate pool
+ * batch and low threshold sizes may need to be modified to reflect this
+ * change.
+ */
+enum orangefs_ds_type {
+ ORANGEFS_TYPE_NONE = 0,
+ ORANGEFS_TYPE_METAFILE = (1 << 0),
+ ORANGEFS_TYPE_DATAFILE = (1 << 1),
+ ORANGEFS_TYPE_DIRECTORY = (1 << 2),
+ ORANGEFS_TYPE_SYMLINK = (1 << 3),
+ ORANGEFS_TYPE_DIRDATA = (1 << 4),
+ ORANGEFS_TYPE_INTERNAL = (1 << 5) /* for the server's private use */
+};
+
+/*
+ * ORANGEFS_certificate simply stores a buffer with the buffer size.
+ * The buffer can be converted to an OpenSSL X509 struct for use.
+ */
+struct ORANGEFS_certificate {
+ __u32 buf_size;
+ unsigned char *buf;
+};
+
+/*
+ * A credential identifies a user and is signed by the client/user
+ * private key.
+ */
+struct ORANGEFS_credential {
+ __u32 userid; /* user id */
+ __u32 num_groups; /* length of group_array */
+ __u32 *group_array; /* groups for which the user is a member */
+ char *issuer; /* alias of the issuing server */
+ __u64 timeout; /* seconds after epoch to time out */
+ __u32 sig_size; /* length of the signature in bytes */
+ unsigned char *signature; /* digital signature */
+ struct ORANGEFS_certificate certificate; /* user certificate buffer */
+};
+#define extra_size_ORANGEFS_credential (ORANGEFS_REQ_LIMIT_GROUPS * \
+ sizeof(__u32) + \
+ ORANGEFS_REQ_LIMIT_ISSUER + \
+ ORANGEFS_REQ_LIMIT_SIGNATURE + \
+ extra_size_ORANGEFS_certificate)
+
+/* This structure is used by the VFS-client interaction alone */
+struct ORANGEFS_keyval_pair {
+ char key[ORANGEFS_MAX_XATTR_NAMELEN];
+ __s32 key_sz; /* __s32 for portable, fixed-size structures */
+ __s32 val_sz;
+ char val[ORANGEFS_MAX_XATTR_VALUELEN];
+};
+
+/* pvfs2-sysint.h ***********************************************************/
+/* Describes attributes for a file, directory, or symlink. */
+struct ORANGEFS_sys_attr_s {
+ __u32 owner;
+ __u32 group;
+ __u32 perms;
+ __u64 atime;
+ __u64 mtime;
+ __u64 ctime;
+ __s64 size;
+
+ /* NOTE: caller must free if valid */
+ char *link_target;
+
+ /* Changed to __s32 so that size of structure does not change */
+ __s32 dfile_count;
+
+ /* Changed to __s32 so that size of structure does not change */
+ __s32 distr_dir_servers_initial;
+
+ /* Changed to __s32 so that size of structure does not change */
+ __s32 distr_dir_servers_max;
+
+ /* Changed to __s32 so that size of structure does not change */
+ __s32 distr_dir_split_size;
+
+ __u32 mirror_copies_count;
+
+ /* NOTE: caller must free if valid */
+ char *dist_name;
+
+ /* NOTE: caller must free if valid */
+ char *dist_params;
+
+ __s64 dirent_count;
+ enum orangefs_ds_type objtype;
+ __u64 flags;
+ __u32 mask;
+ __s64 blksize;
+};
+
+#define ORANGEFS_LOOKUP_LINK_NO_FOLLOW 0
+
+/* pint-dev.h ***************************************************************/
+
+/* parameter structure used in ORANGEFS_DEV_DEBUG ioctl command */
+struct dev_mask_info_s {
+ enum {
+ KERNEL_MASK,
+ CLIENT_MASK,
+ } mask_type;
+ __u64 mask_value;
+};
+
+struct dev_mask2_info_s {
+ __u64 mask1_value;
+ __u64 mask2_value;
+};
+
+/* pvfs2-util.h *************************************************************/
+__s32 ORANGEFS_util_translate_mode(int mode);
+
+/* pvfs2-debug.h ************************************************************/
+#include "orangefs-debug.h"
+
+/* pvfs2-internal.h *********************************************************/
+#define llu(x) (unsigned long long)(x)
+#define lld(x) (long long)(x)
+
+/* pint-dev-shared.h ********************************************************/
+#define ORANGEFS_DEV_MAGIC 'k'
+
+#define ORANGEFS_READDIR_DEFAULT_DESC_COUNT 5
+
+#define DEV_GET_MAGIC 0x1
+#define DEV_GET_MAX_UPSIZE 0x2
+#define DEV_GET_MAX_DOWNSIZE 0x3
+#define DEV_MAP 0x4
+#define DEV_REMOUNT_ALL 0x5
+#define DEV_DEBUG 0x6
+#define DEV_UPSTREAM 0x7
+#define DEV_CLIENT_MASK 0x8
+#define DEV_CLIENT_STRING 0x9
+#define DEV_MAX_NR 0xa
+
+/* supported ioctls, codes are with respect to user-space */
+enum {
+ ORANGEFS_DEV_GET_MAGIC = _IOW(ORANGEFS_DEV_MAGIC, DEV_GET_MAGIC, __s32),
+ ORANGEFS_DEV_GET_MAX_UPSIZE =
+ _IOW(ORANGEFS_DEV_MAGIC, DEV_GET_MAX_UPSIZE, __s32),
+ ORANGEFS_DEV_GET_MAX_DOWNSIZE =
+ _IOW(ORANGEFS_DEV_MAGIC, DEV_GET_MAX_DOWNSIZE, __s32),
+ ORANGEFS_DEV_MAP = _IO(ORANGEFS_DEV_MAGIC, DEV_MAP),
+ ORANGEFS_DEV_REMOUNT_ALL = _IO(ORANGEFS_DEV_MAGIC, DEV_REMOUNT_ALL),
+ ORANGEFS_DEV_DEBUG = _IOR(ORANGEFS_DEV_MAGIC, DEV_DEBUG, __s32),
+ ORANGEFS_DEV_UPSTREAM = _IOW(ORANGEFS_DEV_MAGIC, DEV_UPSTREAM, int),
+ ORANGEFS_DEV_CLIENT_MASK = _IOW(ORANGEFS_DEV_MAGIC,
+ DEV_CLIENT_MASK,
+ struct dev_mask2_info_s),
+ ORANGEFS_DEV_CLIENT_STRING = _IOW(ORANGEFS_DEV_MAGIC,
+ DEV_CLIENT_STRING,
+ char *),
+ ORANGEFS_DEV_MAXNR = DEV_MAX_NR,
+};
+
+/*
+ * version number for use in communicating between kernel space and user
+ * space. Zero signifies the upstream version of the kernel module.
+ */
+#define ORANGEFS_KERNEL_PROTO_VERSION 0
+#define ORANGEFS_MINIMUM_USERSPACE_VERSION 20903
+
+/*
+ * describes memory regions to map in the ORANGEFS_DEV_MAP ioctl.
+ * NOTE: See devorangefs-req.c for 32 bit compat structure.
+ * Since this structure has a variable-sized layout that is different
+ * on 32 and 64 bit platforms, we need to normalize to a 64 bit layout
+ * on such systems before servicing ioctl calls from user-space binaries
+ * that may be 32 bit!
+ */
+struct ORANGEFS_dev_map_desc {
+ void *ptr;
+ __s32 total_size;
+ __s32 size;
+ __s32 count;
+};
+
+/* gossip.h *****************************************************************/
+
+#ifdef GOSSIP_DISABLE_DEBUG
+#define gossip_debug(mask, format, f...) do {} while (0)
+#else
+extern __u64 gossip_debug_mask;
+extern struct client_debug_mask client_debug_mask;
+
+/* try to avoid function call overhead by checking masks in macro */
+#define gossip_debug(mask, format, f...) \
+do { \
+ if (gossip_debug_mask & mask) \
+ printk(format, ##f); \
+} while (0)
+#endif /* GOSSIP_DISABLE_DEBUG */
+
+/* do file and line number printouts w/ the GNU preprocessor */
+#define gossip_ldebug(mask, format, f...) \
+ gossip_debug(mask, "%s: " format, __func__, ##f)
+
+#define gossip_err printk
+#define gossip_lerr(format, f...) \
+ gossip_err("%s line %d: " format, \
+ __FILE__, \
+ __LINE__, \
+ ##f)
diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c
new file mode 100644
index 000000000000..b9da9a0281c9
--- /dev/null
+++ b/fs/orangefs/super.c
@@ -0,0 +1,559 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+
+#include "protocol.h"
+#include "orangefs-kernel.h"
+#include "orangefs-bufmap.h"
+
+#include <linux/parser.h>
+
+/* a cache for orangefs-inode objects (i.e. orangefs inode private data) */
+static struct kmem_cache *orangefs_inode_cache;
+
+/* list for storing orangefs specific superblocks in use */
+LIST_HEAD(orangefs_superblocks);
+
+DEFINE_SPINLOCK(orangefs_superblocks_lock);
+
+enum {
+ Opt_intr,
+ Opt_acl,
+ Opt_local_lock,
+
+ Opt_err
+};
+
+static const match_table_t tokens = {
+ { Opt_acl, "acl" },
+ { Opt_intr, "intr" },
+ { Opt_local_lock, "local_lock" },
+ { Opt_err, NULL }
+};
+
+
+static int parse_mount_options(struct super_block *sb, char *options,
+ int silent)
+{
+ struct orangefs_sb_info_s *orangefs_sb = ORANGEFS_SB(sb);
+ substring_t args[MAX_OPT_ARGS];
+ char *p;
+
+ /*
+ * Force any potential flags that might be set from the mount
+ * to zero, ie, initialize to unset.
+ */
+ sb->s_flags &= ~MS_POSIXACL;
+ orangefs_sb->flags &= ~ORANGEFS_OPT_INTR;
+ orangefs_sb->flags &= ~ORANGEFS_OPT_LOCAL_LOCK;
+
+ while ((p = strsep(&options, ",")) != NULL) {
+ int token;
+
+ if (!*p)
+ continue;
+
+ token = match_token(p, tokens, args);
+ switch (token) {
+ case Opt_acl:
+ sb->s_flags |= MS_POSIXACL;
+ break;
+ case Opt_intr:
+ orangefs_sb->flags |= ORANGEFS_OPT_INTR;
+ break;
+ case Opt_local_lock:
+ orangefs_sb->flags |= ORANGEFS_OPT_LOCAL_LOCK;
+ break;
+ default:
+ goto fail;
+ }
+ }
+
+ return 0;
+fail:
+ if (!silent)
+ gossip_err("Error: mount option [%s] is not supported.\n", p);
+ return -EINVAL;
+}
+
+static void orangefs_inode_cache_ctor(void *req)
+{
+ struct orangefs_inode_s *orangefs_inode = req;
+
+ inode_init_once(&orangefs_inode->vfs_inode);
+ init_rwsem(&orangefs_inode->xattr_sem);
+
+ orangefs_inode->vfs_inode.i_version = 1;
+}
+
+static struct inode *orangefs_alloc_inode(struct super_block *sb)
+{
+ struct orangefs_inode_s *orangefs_inode;
+
+ orangefs_inode = kmem_cache_alloc(orangefs_inode_cache, GFP_KERNEL);
+ if (orangefs_inode == NULL) {
+ gossip_err("Failed to allocate orangefs_inode\n");
+ return NULL;
+ }
+
+ /*
+ * We want to clear everything except for rw_semaphore and the
+ * vfs_inode.
+ */
+ memset(&orangefs_inode->refn.khandle, 0, 16);
+ orangefs_inode->refn.fs_id = ORANGEFS_FS_ID_NULL;
+ orangefs_inode->last_failed_block_index_read = 0;
+ memset(orangefs_inode->link_target, 0, sizeof(orangefs_inode->link_target));
+ orangefs_inode->pinode_flags = 0;
+
+ gossip_debug(GOSSIP_SUPER_DEBUG,
+ "orangefs_alloc_inode: allocated %p\n",
+ &orangefs_inode->vfs_inode);
+ return &orangefs_inode->vfs_inode;
+}
+
+static void orangefs_destroy_inode(struct inode *inode)
+{
+ struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
+
+ gossip_debug(GOSSIP_SUPER_DEBUG,
+ "%s: deallocated %p destroying inode %pU\n",
+ __func__, orangefs_inode, get_khandle_from_ino(inode));
+
+ kmem_cache_free(orangefs_inode_cache, orangefs_inode);
+}
+
+/*
+ * NOTE: information filled in here is typically reflected in the
+ * output of the system command 'df'
+*/
+static int orangefs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+ int ret = -ENOMEM;
+ struct orangefs_kernel_op_s *new_op = NULL;
+ int flags = 0;
+ struct super_block *sb = NULL;
+
+ sb = dentry->d_sb;
+
+ gossip_debug(GOSSIP_SUPER_DEBUG,
+ "orangefs_statfs: called on sb %p (fs_id is %d)\n",
+ sb,
+ (int)(ORANGEFS_SB(sb)->fs_id));
+
+ new_op = op_alloc(ORANGEFS_VFS_OP_STATFS);
+ if (!new_op)
+ return ret;
+ new_op->upcall.req.statfs.fs_id = ORANGEFS_SB(sb)->fs_id;
+
+ if (ORANGEFS_SB(sb)->flags & ORANGEFS_OPT_INTR)
+ flags = ORANGEFS_OP_INTERRUPTIBLE;
+
+ ret = service_operation(new_op, "orangefs_statfs", flags);
+
+ if (new_op->downcall.status < 0)
+ goto out_op_release;
+
+ gossip_debug(GOSSIP_SUPER_DEBUG,
+ "%s: got %ld blocks available | "
+ "%ld blocks total | %ld block size | "
+ "%ld files total | %ld files avail\n",
+ __func__,
+ (long)new_op->downcall.resp.statfs.blocks_avail,
+ (long)new_op->downcall.resp.statfs.blocks_total,
+ (long)new_op->downcall.resp.statfs.block_size,
+ (long)new_op->downcall.resp.statfs.files_total,
+ (long)new_op->downcall.resp.statfs.files_avail);
+
+ buf->f_type = sb->s_magic;
+ memcpy(&buf->f_fsid, &ORANGEFS_SB(sb)->fs_id, sizeof(buf->f_fsid));
+ buf->f_bsize = new_op->downcall.resp.statfs.block_size;
+ buf->f_namelen = ORANGEFS_NAME_MAX;
+
+ buf->f_blocks = (sector_t) new_op->downcall.resp.statfs.blocks_total;
+ buf->f_bfree = (sector_t) new_op->downcall.resp.statfs.blocks_avail;
+ buf->f_bavail = (sector_t) new_op->downcall.resp.statfs.blocks_avail;
+ buf->f_files = (sector_t) new_op->downcall.resp.statfs.files_total;
+ buf->f_ffree = (sector_t) new_op->downcall.resp.statfs.files_avail;
+ buf->f_frsize = sb->s_blocksize;
+
+out_op_release:
+ op_release(new_op);
+ gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_statfs: returning %d\n", ret);
+ return ret;
+}
+
+/*
+ * Remount as initiated by VFS layer. We just need to reparse the mount
+ * options, no need to signal pvfs2-client-core about it.
+ */
+static int orangefs_remount_fs(struct super_block *sb, int *flags, char *data)
+{
+ gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_remount_fs: called\n");
+ return parse_mount_options(sb, data, 1);
+}
+
+/*
+ * Remount as initiated by pvfs2-client-core on restart. This is used to
+ * repopulate mount information left from previous pvfs2-client-core.
+ *
+ * the idea here is that given a valid superblock, we're
+ * re-initializing the user space client with the initial mount
+ * information specified when the super block was first initialized.
+ * this is very different than the first initialization/creation of a
+ * superblock. we use the special service_priority_operation to make
+ * sure that the mount gets ahead of any other pending operation that
+ * is waiting for servicing. this means that the pvfs2-client won't
+ * fail to start several times for all other pending operations before
+ * the client regains all of the mount information from us.
+ * NOTE: this function assumes that the request_mutex is already acquired!
+ */
+int orangefs_remount(struct orangefs_sb_info_s *orangefs_sb)
+{
+ struct orangefs_kernel_op_s *new_op;
+ int ret = -EINVAL;
+
+ gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_remount: called\n");
+
+ new_op = op_alloc(ORANGEFS_VFS_OP_FS_MOUNT);
+ if (!new_op)
+ return -ENOMEM;
+ strncpy(new_op->upcall.req.fs_mount.orangefs_config_server,
+ orangefs_sb->devname,
+ ORANGEFS_MAX_SERVER_ADDR_LEN);
+
+ gossip_debug(GOSSIP_SUPER_DEBUG,
+ "Attempting ORANGEFS Remount via host %s\n",
+ new_op->upcall.req.fs_mount.orangefs_config_server);
+
+ /*
+ * we assume that the calling function has already acquired the
+ * request_mutex to prevent other operations from bypassing
+ * this one
+ */
+ ret = service_operation(new_op, "orangefs_remount",
+ ORANGEFS_OP_PRIORITY | ORANGEFS_OP_NO_MUTEX);
+ gossip_debug(GOSSIP_SUPER_DEBUG,
+ "orangefs_remount: mount got return value of %d\n",
+ ret);
+ if (ret == 0) {
+ /*
+ * store the id assigned to this sb -- it's just a
+ * short-lived mapping that the system interface uses
+ * to map this superblock to a particular mount entry
+ */
+ orangefs_sb->id = new_op->downcall.resp.fs_mount.id;
+ orangefs_sb->mount_pending = 0;
+ }
+
+ op_release(new_op);
+ return ret;
+}
+
+int fsid_key_table_initialize(void)
+{
+ return 0;
+}
+
+void fsid_key_table_finalize(void)
+{
+}
+
+/* Called whenever the VFS dirties the inode in response to atime updates */
+static void orangefs_dirty_inode(struct inode *inode, int flags)
+{
+ struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
+
+ gossip_debug(GOSSIP_SUPER_DEBUG,
+ "orangefs_dirty_inode: %pU\n",
+ get_khandle_from_ino(inode));
+ SetAtimeFlag(orangefs_inode);
+}
+
+static const struct super_operations orangefs_s_ops = {
+ .alloc_inode = orangefs_alloc_inode,
+ .destroy_inode = orangefs_destroy_inode,
+ .dirty_inode = orangefs_dirty_inode,
+ .drop_inode = generic_delete_inode,
+ .statfs = orangefs_statfs,
+ .remount_fs = orangefs_remount_fs,
+ .show_options = generic_show_options,
+};
+
+static struct dentry *orangefs_fh_to_dentry(struct super_block *sb,
+ struct fid *fid,
+ int fh_len,
+ int fh_type)
+{
+ struct orangefs_object_kref refn;
+
+ if (fh_len < 5 || fh_type > 2)
+ return NULL;
+
+ ORANGEFS_khandle_from(&(refn.khandle), fid->raw, 16);
+ refn.fs_id = (u32) fid->raw[4];
+ gossip_debug(GOSSIP_SUPER_DEBUG,
+ "fh_to_dentry: handle %pU, fs_id %d\n",
+ &refn.khandle,
+ refn.fs_id);
+
+ return d_obtain_alias(orangefs_iget(sb, &refn));
+}
+
+static int orangefs_encode_fh(struct inode *inode,
+ __u32 *fh,
+ int *max_len,
+ struct inode *parent)
+{
+ int len = parent ? 10 : 5;
+ int type = 1;
+ struct orangefs_object_kref refn;
+
+ if (*max_len < len) {
+ gossip_lerr("fh buffer is too small for encoding\n");
+ *max_len = len;
+ type = 255;
+ goto out;
+ }
+
+ refn = ORANGEFS_I(inode)->refn;
+ ORANGEFS_khandle_to(&refn.khandle, fh, 16);
+ fh[4] = refn.fs_id;
+
+ gossip_debug(GOSSIP_SUPER_DEBUG,
+ "Encoding fh: handle %pU, fsid %u\n",
+ &refn.khandle,
+ refn.fs_id);
+
+
+ if (parent) {
+ refn = ORANGEFS_I(parent)->refn;
+ ORANGEFS_khandle_to(&refn.khandle, (char *) fh + 20, 16);
+ fh[9] = refn.fs_id;
+
+ type = 2;
+ gossip_debug(GOSSIP_SUPER_DEBUG,
+ "Encoding parent: handle %pU, fsid %u\n",
+ &refn.khandle,
+ refn.fs_id);
+ }
+ *max_len = len;
+
+out:
+ return type;
+}
+
+static const struct export_operations orangefs_export_ops = {
+ .encode_fh = orangefs_encode_fh,
+ .fh_to_dentry = orangefs_fh_to_dentry,
+};
+
+static int orangefs_fill_sb(struct super_block *sb,
+ struct orangefs_fs_mount_response *fs_mount,
+ void *data, int silent)
+{
+ int ret = -EINVAL;
+ struct inode *root = NULL;
+ struct dentry *root_dentry = NULL;
+ struct orangefs_object_kref root_object;
+
+ /* alloc and init our private orangefs sb info */
+ sb->s_fs_info = kzalloc(sizeof(struct orangefs_sb_info_s), GFP_KERNEL);
+ if (!ORANGEFS_SB(sb))
+ return -ENOMEM;
+ ORANGEFS_SB(sb)->sb = sb;
+
+ ORANGEFS_SB(sb)->root_khandle = fs_mount->root_khandle;
+ ORANGEFS_SB(sb)->fs_id = fs_mount->fs_id;
+ ORANGEFS_SB(sb)->id = fs_mount->id;
+
+ if (data) {
+ ret = parse_mount_options(sb, data, silent);
+ if (ret)
+ return ret;
+ }
+
+ /* Hang the xattr handlers off the superblock */
+ sb->s_xattr = orangefs_xattr_handlers;
+ sb->s_magic = ORANGEFS_SUPER_MAGIC;
+ sb->s_op = &orangefs_s_ops;
+ sb->s_d_op = &orangefs_dentry_operations;
+
+ sb->s_blocksize = orangefs_bufmap_size_query();
+ sb->s_blocksize_bits = orangefs_bufmap_shift_query();
+ sb->s_maxbytes = MAX_LFS_FILESIZE;
+
+ root_object.khandle = ORANGEFS_SB(sb)->root_khandle;
+ root_object.fs_id = ORANGEFS_SB(sb)->fs_id;
+ gossip_debug(GOSSIP_SUPER_DEBUG,
+ "get inode %pU, fsid %d\n",
+ &root_object.khandle,
+ root_object.fs_id);
+
+ root = orangefs_iget(sb, &root_object);
+ if (IS_ERR(root))
+ return PTR_ERR(root);
+
+ gossip_debug(GOSSIP_SUPER_DEBUG,
+ "Allocated root inode [%p] with mode %x\n",
+ root,
+ root->i_mode);
+
+ /* allocates and places root dentry in dcache */
+ root_dentry = d_make_root(root);
+ if (!root_dentry)
+ return -ENOMEM;
+
+ sb->s_export_op = &orangefs_export_ops;
+ sb->s_root = root_dentry;
+ return 0;
+}
+
+struct dentry *orangefs_mount(struct file_system_type *fst,
+ int flags,
+ const char *devname,
+ void *data)
+{
+ int ret = -EINVAL;
+ struct super_block *sb = ERR_PTR(-EINVAL);
+ struct orangefs_kernel_op_s *new_op;
+ struct dentry *d = ERR_PTR(-EINVAL);
+
+ gossip_debug(GOSSIP_SUPER_DEBUG,
+ "orangefs_mount: called with devname %s\n",
+ devname);
+
+ if (!devname) {
+ gossip_err("ERROR: device name not specified.\n");
+ return ERR_PTR(-EINVAL);
+ }
+
+ new_op = op_alloc(ORANGEFS_VFS_OP_FS_MOUNT);
+ if (!new_op)
+ return ERR_PTR(-ENOMEM);
+
+ strncpy(new_op->upcall.req.fs_mount.orangefs_config_server,
+ devname,
+ ORANGEFS_MAX_SERVER_ADDR_LEN);
+
+ gossip_debug(GOSSIP_SUPER_DEBUG,
+ "Attempting ORANGEFS Mount via host %s\n",
+ new_op->upcall.req.fs_mount.orangefs_config_server);
+
+ ret = service_operation(new_op, "orangefs_mount", 0);
+ gossip_debug(GOSSIP_SUPER_DEBUG,
+ "orangefs_mount: mount got return value of %d\n", ret);
+ if (ret)
+ goto free_op;
+
+ if (new_op->downcall.resp.fs_mount.fs_id == ORANGEFS_FS_ID_NULL) {
+ gossip_err("ERROR: Retrieved null fs_id\n");
+ ret = -EINVAL;
+ goto free_op;
+ }
+
+ sb = sget(fst, NULL, set_anon_super, flags, NULL);
+
+ if (IS_ERR(sb)) {
+ d = ERR_CAST(sb);
+ goto free_op;
+ }
+
+ ret = orangefs_fill_sb(sb,
+ &new_op->downcall.resp.fs_mount, data,
+ flags & MS_SILENT ? 1 : 0);
+
+ if (ret) {
+ d = ERR_PTR(ret);
+ goto free_op;
+ }
+
+ /*
+ * on successful mount, store the devname and data
+ * used
+ */
+ strncpy(ORANGEFS_SB(sb)->devname,
+ devname,
+ ORANGEFS_MAX_SERVER_ADDR_LEN);
+
+ /* mount_pending must be cleared */
+ ORANGEFS_SB(sb)->mount_pending = 0;
+
+ /*
+ * finally, add this sb to our list of known orangefs
+ * sb's
+ */
+ gossip_debug(GOSSIP_SUPER_DEBUG,
+ "Adding SB %p to orangefs superblocks\n",
+ ORANGEFS_SB(sb));
+ spin_lock(&orangefs_superblocks_lock);
+ list_add_tail(&ORANGEFS_SB(sb)->list, &orangefs_superblocks);
+ spin_unlock(&orangefs_superblocks_lock);
+ op_release(new_op);
+ return dget(sb->s_root);
+
+free_op:
+ gossip_err("orangefs_mount: mount request failed with %d\n", ret);
+ if (ret == -EINVAL) {
+ gossip_err("Ensure that all orangefs-servers have the same FS configuration files\n");
+ gossip_err("Look at pvfs2-client-core log file (typically /tmp/pvfs2-client.log) for more details\n");
+ }
+
+ op_release(new_op);
+
+ return d;
+}
+
+void orangefs_kill_sb(struct super_block *sb)
+{
+ gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_kill_sb: called\n");
+
+ /* provided sb cleanup */
+ kill_anon_super(sb);
+
+ /*
+ * issue the unmount to userspace to tell it to remove the
+ * dynamic mount info it has for this superblock
+ */
+ orangefs_unmount_sb(sb);
+
+ /* remove the sb from our list of orangefs specific sb's */
+
+ spin_lock(&orangefs_superblocks_lock);
+ __list_del_entry(&ORANGEFS_SB(sb)->list); /* not list_del_init */
+ ORANGEFS_SB(sb)->list.prev = NULL;
+ spin_unlock(&orangefs_superblocks_lock);
+
+ /*
+ * make sure that ORANGEFS_DEV_REMOUNT_ALL loop that might've seen us
+ * gets completed before we free the dang thing.
+ */
+ mutex_lock(&request_mutex);
+ mutex_unlock(&request_mutex);
+
+ /* free the orangefs superblock private data */
+ kfree(ORANGEFS_SB(sb));
+}
+
+int orangefs_inode_cache_initialize(void)
+{
+ orangefs_inode_cache = kmem_cache_create("orangefs_inode_cache",
+ sizeof(struct orangefs_inode_s),
+ 0,
+ ORANGEFS_CACHE_CREATE_FLAGS,
+ orangefs_inode_cache_ctor);
+
+ if (!orangefs_inode_cache) {
+ gossip_err("Cannot create orangefs_inode_cache\n");
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+int orangefs_inode_cache_finalize(void)
+{
+ kmem_cache_destroy(orangefs_inode_cache);
+ return 0;
+}
diff --git a/fs/orangefs/symlink.c b/fs/orangefs/symlink.c
new file mode 100644
index 000000000000..6418dd638680
--- /dev/null
+++ b/fs/orangefs/symlink.c
@@ -0,0 +1,19 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+
+#include "protocol.h"
+#include "orangefs-kernel.h"
+#include "orangefs-bufmap.h"
+
+struct inode_operations orangefs_symlink_inode_operations = {
+ .readlink = generic_readlink,
+ .get_link = simple_get_link,
+ .setattr = orangefs_setattr,
+ .getattr = orangefs_getattr,
+ .listxattr = orangefs_listxattr,
+ .setxattr = generic_setxattr,
+ .permission = orangefs_permission,
+};
diff --git a/fs/orangefs/upcall.h b/fs/orangefs/upcall.h
new file mode 100644
index 000000000000..001b20239407
--- /dev/null
+++ b/fs/orangefs/upcall.h
@@ -0,0 +1,246 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+
+#ifndef __UPCALL_H
+#define __UPCALL_H
+
+/*
+ * Sanitized this header file to fix
+ * 32-64 bit interaction issues between
+ * client-core and device
+ */
+struct orangefs_io_request_s {
+ __s32 __pad1;
+ __s32 buf_index;
+ __s32 count;
+ __s32 __pad2;
+ __s64 offset;
+ struct orangefs_object_kref refn;
+ enum ORANGEFS_io_type io_type;
+ __s32 readahead_size;
+};
+
+struct orangefs_lookup_request_s {
+ __s32 sym_follow;
+ __s32 __pad1;
+ struct orangefs_object_kref parent_refn;
+ char d_name[ORANGEFS_NAME_MAX];
+};
+
+struct orangefs_create_request_s {
+ struct orangefs_object_kref parent_refn;
+ struct ORANGEFS_sys_attr_s attributes;
+ char d_name[ORANGEFS_NAME_MAX];
+};
+
+struct orangefs_symlink_request_s {
+ struct orangefs_object_kref parent_refn;
+ struct ORANGEFS_sys_attr_s attributes;
+ char entry_name[ORANGEFS_NAME_MAX];
+ char target[ORANGEFS_NAME_MAX];
+};
+
+struct orangefs_getattr_request_s {
+ struct orangefs_object_kref refn;
+ __u32 mask;
+ __u32 __pad1;
+};
+
+struct orangefs_setattr_request_s {
+ struct orangefs_object_kref refn;
+ struct ORANGEFS_sys_attr_s attributes;
+};
+
+struct orangefs_remove_request_s {
+ struct orangefs_object_kref parent_refn;
+ char d_name[ORANGEFS_NAME_MAX];
+};
+
+struct orangefs_mkdir_request_s {
+ struct orangefs_object_kref parent_refn;
+ struct ORANGEFS_sys_attr_s attributes;
+ char d_name[ORANGEFS_NAME_MAX];
+};
+
+struct orangefs_readdir_request_s {
+ struct orangefs_object_kref refn;
+ __u64 token;
+ __s32 max_dirent_count;
+ __s32 buf_index;
+};
+
+struct orangefs_readdirplus_request_s {
+ struct orangefs_object_kref refn;
+ __u64 token;
+ __s32 max_dirent_count;
+ __u32 mask;
+ __s32 buf_index;
+ __s32 __pad1;
+};
+
+struct orangefs_rename_request_s {
+ struct orangefs_object_kref old_parent_refn;
+ struct orangefs_object_kref new_parent_refn;
+ char d_old_name[ORANGEFS_NAME_MAX];
+ char d_new_name[ORANGEFS_NAME_MAX];
+};
+
+struct orangefs_statfs_request_s {
+ __s32 fs_id;
+ __s32 __pad1;
+};
+
+struct orangefs_truncate_request_s {
+ struct orangefs_object_kref refn;
+ __s64 size;
+};
+
+struct orangefs_mmap_ra_cache_flush_request_s {
+ struct orangefs_object_kref refn;
+};
+
+struct orangefs_fs_mount_request_s {
+ char orangefs_config_server[ORANGEFS_MAX_SERVER_ADDR_LEN];
+};
+
+struct orangefs_fs_umount_request_s {
+ __s32 id;
+ __s32 fs_id;
+ char orangefs_config_server[ORANGEFS_MAX_SERVER_ADDR_LEN];
+};
+
+struct orangefs_getxattr_request_s {
+ struct orangefs_object_kref refn;
+ __s32 key_sz;
+ __s32 __pad1;
+ char key[ORANGEFS_MAX_XATTR_NAMELEN];
+};
+
+struct orangefs_setxattr_request_s {
+ struct orangefs_object_kref refn;
+ struct ORANGEFS_keyval_pair keyval;
+ __s32 flags;
+ __s32 __pad1;
+};
+
+struct orangefs_listxattr_request_s {
+ struct orangefs_object_kref refn;
+ __s32 requested_count;
+ __s32 __pad1;
+ __u64 token;
+};
+
+struct orangefs_removexattr_request_s {
+ struct orangefs_object_kref refn;
+ __s32 key_sz;
+ __s32 __pad1;
+ char key[ORANGEFS_MAX_XATTR_NAMELEN];
+};
+
+struct orangefs_op_cancel_s {
+ __u64 op_tag;
+};
+
+struct orangefs_fsync_request_s {
+ struct orangefs_object_kref refn;
+};
+
+enum orangefs_param_request_type {
+ ORANGEFS_PARAM_REQUEST_SET = 1,
+ ORANGEFS_PARAM_REQUEST_GET = 2
+};
+
+enum orangefs_param_request_op {
+ ORANGEFS_PARAM_REQUEST_OP_ACACHE_TIMEOUT_MSECS = 1,
+ ORANGEFS_PARAM_REQUEST_OP_ACACHE_HARD_LIMIT = 2,
+ ORANGEFS_PARAM_REQUEST_OP_ACACHE_SOFT_LIMIT = 3,
+ ORANGEFS_PARAM_REQUEST_OP_ACACHE_RECLAIM_PERCENTAGE = 4,
+ ORANGEFS_PARAM_REQUEST_OP_PERF_TIME_INTERVAL_SECS = 5,
+ ORANGEFS_PARAM_REQUEST_OP_PERF_HISTORY_SIZE = 6,
+ ORANGEFS_PARAM_REQUEST_OP_PERF_RESET = 7,
+ ORANGEFS_PARAM_REQUEST_OP_NCACHE_TIMEOUT_MSECS = 8,
+ ORANGEFS_PARAM_REQUEST_OP_NCACHE_HARD_LIMIT = 9,
+ ORANGEFS_PARAM_REQUEST_OP_NCACHE_SOFT_LIMIT = 10,
+ ORANGEFS_PARAM_REQUEST_OP_NCACHE_RECLAIM_PERCENTAGE = 11,
+ ORANGEFS_PARAM_REQUEST_OP_STATIC_ACACHE_TIMEOUT_MSECS = 12,
+ ORANGEFS_PARAM_REQUEST_OP_STATIC_ACACHE_HARD_LIMIT = 13,
+ ORANGEFS_PARAM_REQUEST_OP_STATIC_ACACHE_SOFT_LIMIT = 14,
+ ORANGEFS_PARAM_REQUEST_OP_STATIC_ACACHE_RECLAIM_PERCENTAGE = 15,
+ ORANGEFS_PARAM_REQUEST_OP_CLIENT_DEBUG = 16,
+ ORANGEFS_PARAM_REQUEST_OP_CCACHE_TIMEOUT_SECS = 17,
+ ORANGEFS_PARAM_REQUEST_OP_CCACHE_HARD_LIMIT = 18,
+ ORANGEFS_PARAM_REQUEST_OP_CCACHE_SOFT_LIMIT = 19,
+ ORANGEFS_PARAM_REQUEST_OP_CCACHE_RECLAIM_PERCENTAGE = 20,
+ ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_TIMEOUT_SECS = 21,
+ ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_HARD_LIMIT = 22,
+ ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_SOFT_LIMIT = 23,
+ ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_RECLAIM_PERCENTAGE = 24,
+ ORANGEFS_PARAM_REQUEST_OP_TWO_MASK_VALUES = 25,
+};
+
+struct orangefs_param_request_s {
+ enum orangefs_param_request_type type;
+ enum orangefs_param_request_op op;
+ __s64 value;
+ char s_value[ORANGEFS_MAX_DEBUG_STRING_LEN];
+};
+
+enum orangefs_perf_count_request_type {
+ ORANGEFS_PERF_COUNT_REQUEST_ACACHE = 1,
+ ORANGEFS_PERF_COUNT_REQUEST_NCACHE = 2,
+ ORANGEFS_PERF_COUNT_REQUEST_CAPCACHE = 3,
+};
+
+struct orangefs_perf_count_request_s {
+ enum orangefs_perf_count_request_type type;
+ __s32 __pad1;
+};
+
+struct orangefs_fs_key_request_s {
+ __s32 fsid;
+ __s32 __pad1;
+};
+
+struct orangefs_upcall_s {
+ __s32 type;
+ __u32 uid;
+ __u32 gid;
+ int pid;
+ int tgid;
+ /* Trailers unused but must be retained for protocol compatibility. */
+ __s64 trailer_size;
+ char *trailer_buf;
+
+ union {
+ struct orangefs_io_request_s io;
+ struct orangefs_lookup_request_s lookup;
+ struct orangefs_create_request_s create;
+ struct orangefs_symlink_request_s sym;
+ struct orangefs_getattr_request_s getattr;
+ struct orangefs_setattr_request_s setattr;
+ struct orangefs_remove_request_s remove;
+ struct orangefs_mkdir_request_s mkdir;
+ struct orangefs_readdir_request_s readdir;
+ struct orangefs_readdirplus_request_s readdirplus;
+ struct orangefs_rename_request_s rename;
+ struct orangefs_statfs_request_s statfs;
+ struct orangefs_truncate_request_s truncate;
+ struct orangefs_mmap_ra_cache_flush_request_s ra_cache_flush;
+ struct orangefs_fs_mount_request_s fs_mount;
+ struct orangefs_fs_umount_request_s fs_umount;
+ struct orangefs_getxattr_request_s getxattr;
+ struct orangefs_setxattr_request_s setxattr;
+ struct orangefs_listxattr_request_s listxattr;
+ struct orangefs_removexattr_request_s removexattr;
+ struct orangefs_op_cancel_s cancel;
+ struct orangefs_fsync_request_s fsync;
+ struct orangefs_param_request_s param;
+ struct orangefs_perf_count_request_s perf_count;
+ struct orangefs_fs_key_request_s fs_key;
+ } req;
+};
+
+#endif /* __UPCALL_H */
diff --git a/fs/orangefs/waitqueue.c b/fs/orangefs/waitqueue.c
new file mode 100644
index 000000000000..31635bc303fe
--- /dev/null
+++ b/fs/orangefs/waitqueue.c
@@ -0,0 +1,357 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ * (C) 2011 Omnibond Systems
+ *
+ * Changes by Acxiom Corporation to implement generic service_operation()
+ * function, Copyright Acxiom Corporation, 2005.
+ *
+ * See COPYING in top-level directory.
+ */
+
+/*
+ * In-kernel waitqueue operations.
+ */
+
+#include "protocol.h"
+#include "orangefs-kernel.h"
+#include "orangefs-bufmap.h"
+
+static int wait_for_matching_downcall(struct orangefs_kernel_op_s *, long, bool);
+static void orangefs_clean_up_interrupted_operation(struct orangefs_kernel_op_s *);
+
+/*
+ * What we do in this function is to walk the list of operations that are
+ * present in the request queue and mark them as purged.
+ * NOTE: This is called from the device close after client-core has
+ * guaranteed that no new operations could appear on the list since the
+ * client-core is anyway going to exit.
+ */
+void purge_waiting_ops(void)
+{
+ struct orangefs_kernel_op_s *op;
+
+ spin_lock(&orangefs_request_list_lock);
+ list_for_each_entry(op, &orangefs_request_list, list) {
+ gossip_debug(GOSSIP_WAIT_DEBUG,
+ "pvfs2-client-core: purging op tag %llu %s\n",
+ llu(op->tag),
+ get_opname_string(op));
+ set_op_state_purged(op);
+ gossip_debug(GOSSIP_DEV_DEBUG,
+ "%s: op:%s: op_state:%d: process:%s:\n",
+ __func__,
+ get_opname_string(op),
+ op->op_state,
+ current->comm);
+ }
+ spin_unlock(&orangefs_request_list_lock);
+}
+
+/*
+ * submits a ORANGEFS operation and waits for it to complete
+ *
+ * Note op->downcall.status will contain the status of the operation (in
+ * errno format), whether provided by pvfs2-client or a result of failure to
+ * service the operation. If the caller wishes to distinguish, then
+ * op->state can be checked to see if it was serviced or not.
+ *
+ * Returns contents of op->downcall.status for convenience
+ */
+int service_operation(struct orangefs_kernel_op_s *op,
+ const char *op_name,
+ int flags)
+{
+ long timeout = MAX_SCHEDULE_TIMEOUT;
+ int ret = 0;
+
+ DEFINE_WAIT(wait_entry);
+
+ op->upcall.tgid = current->tgid;
+ op->upcall.pid = current->pid;
+
+retry_servicing:
+ op->downcall.status = 0;
+ gossip_debug(GOSSIP_WAIT_DEBUG,
+ "%s: %s op:%p: process:%s: pid:%d:\n",
+ __func__,
+ op_name,
+ op,
+ current->comm,
+ current->pid);
+
+ /*
+ * If ORANGEFS_OP_NO_MUTEX was set in flags, we need to avoid
+ * acquiring the request_mutex because we're servicing a
+ * high priority remount operation and the request_mutex is
+ * already taken.
+ */
+ if (!(flags & ORANGEFS_OP_NO_MUTEX)) {
+ if (flags & ORANGEFS_OP_INTERRUPTIBLE)
+ ret = mutex_lock_interruptible(&request_mutex);
+ else
+ ret = mutex_lock_killable(&request_mutex);
+ /*
+ * check to see if we were interrupted while waiting for
+ * mutex
+ */
+ if (ret < 0) {
+ op->downcall.status = ret;
+ gossip_debug(GOSSIP_WAIT_DEBUG,
+ "%s: service_operation interrupted.\n",
+ __func__);
+ return ret;
+ }
+ }
+
+ /* queue up the operation */
+ spin_lock(&orangefs_request_list_lock);
+ spin_lock(&op->lock);
+ set_op_state_waiting(op);
+ gossip_debug(GOSSIP_DEV_DEBUG,
+ "%s: op:%s: op_state:%d: process:%s:\n",
+ __func__,
+ get_opname_string(op),
+ op->op_state,
+ current->comm);
+ /* add high priority remount op to the front of the line. */
+ if (flags & ORANGEFS_OP_PRIORITY)
+ list_add(&op->list, &orangefs_request_list);
+ else
+ list_add_tail(&op->list, &orangefs_request_list);
+ spin_unlock(&op->lock);
+ wake_up_interruptible(&orangefs_request_list_waitq);
+ if (!__is_daemon_in_service()) {
+ gossip_debug(GOSSIP_WAIT_DEBUG,
+ "%s:client core is NOT in service.\n",
+ __func__);
+ timeout = op_timeout_secs * HZ;
+ }
+ spin_unlock(&orangefs_request_list_lock);
+
+ if (!(flags & ORANGEFS_OP_NO_MUTEX))
+ mutex_unlock(&request_mutex);
+
+ ret = wait_for_matching_downcall(op, timeout,
+ flags & ORANGEFS_OP_INTERRUPTIBLE);
+
+ gossip_debug(GOSSIP_WAIT_DEBUG,
+ "%s: wait_for_matching_downcall returned %d for %p\n",
+ __func__,
+ ret,
+ op);
+
+ /* got matching downcall; make sure status is in errno format */
+ if (!ret) {
+ spin_unlock(&op->lock);
+ op->downcall.status =
+ orangefs_normalize_to_errno(op->downcall.status);
+ ret = op->downcall.status;
+ goto out;
+ }
+
+ /* failed to get matching downcall */
+ if (ret == -ETIMEDOUT) {
+ gossip_err("%s: %s -- wait timed out; aborting attempt.\n",
+ __func__,
+ op_name);
+ }
+
+ /*
+ * remove a waiting op from the request list or
+ * remove an in-progress op from the in-progress list.
+ */
+ orangefs_clean_up_interrupted_operation(op);
+
+ op->downcall.status = ret;
+ /* retry if operation has not been serviced and if requested */
+ if (ret == -EAGAIN) {
+ op->attempts++;
+ timeout = op_timeout_secs * HZ;
+ gossip_debug(GOSSIP_WAIT_DEBUG,
+ "orangefs: tag %llu (%s)"
+ " -- operation to be retried (%d attempt)\n",
+ llu(op->tag),
+ op_name,
+ op->attempts);
+
+ /*
+ * io ops (ops that use the shared memory buffer) have
+ * to be returned to their caller for a retry. Other ops
+ * can just be recycled here.
+ */
+ if (!op->uses_shared_memory)
+ goto retry_servicing;
+ }
+
+out:
+ gossip_debug(GOSSIP_WAIT_DEBUG,
+ "%s: %s returning: %d for %p.\n",
+ __func__,
+ op_name,
+ ret,
+ op);
+ return ret;
+}
+
+/* This can get called on an I/O op if it had a bad service_operation. */
+bool orangefs_cancel_op_in_progress(struct orangefs_kernel_op_s *op)
+{
+ u64 tag = op->tag;
+ if (!op_state_in_progress(op))
+ return false;
+
+ op->slot_to_free = op->upcall.req.io.buf_index;
+ memset(&op->upcall, 0, sizeof(op->upcall));
+ memset(&op->downcall, 0, sizeof(op->downcall));
+ op->upcall.type = ORANGEFS_VFS_OP_CANCEL;
+ op->upcall.req.cancel.op_tag = tag;
+ op->downcall.type = ORANGEFS_VFS_OP_INVALID;
+ op->downcall.status = -1;
+ orangefs_new_tag(op);
+
+ spin_lock(&orangefs_request_list_lock);
+ /* orangefs_request_list_lock is enough of a barrier here */
+ if (!__is_daemon_in_service()) {
+ spin_unlock(&orangefs_request_list_lock);
+ return false;
+ }
+ spin_lock(&op->lock);
+ set_op_state_waiting(op);
+ gossip_debug(GOSSIP_DEV_DEBUG,
+ "%s: op:%s: op_state:%d: process:%s:\n",
+ __func__,
+ get_opname_string(op),
+ op->op_state,
+ current->comm);
+ list_add(&op->list, &orangefs_request_list);
+ spin_unlock(&op->lock);
+ spin_unlock(&orangefs_request_list_lock);
+
+ gossip_debug(GOSSIP_WAIT_DEBUG,
+ "Attempting ORANGEFS operation cancellation of tag %llu\n",
+ llu(tag));
+ return true;
+}
+
+/*
+ * Change an op to the "given up" state and remove it from its list.
+ */
+static void
+ orangefs_clean_up_interrupted_operation(struct orangefs_kernel_op_s *op)
+{
+ /*
+ * handle interrupted cases depending on what state we were in when
+ * the interruption is detected.
+ *
+ * Called with op->lock held.
+ */
+
+ /*
+ * List manipulation code elsewhere will ignore ops that
+ * have been given up upon.
+ */
+ op->op_state |= OP_VFS_STATE_GIVEN_UP;
+
+ if (list_empty(&op->list)) {
+ /* caught copying to/from daemon */
+ BUG_ON(op_state_serviced(op));
+ spin_unlock(&op->lock);
+ wait_for_completion(&op->waitq);
+ } else if (op_state_waiting(op)) {
+ /*
+ * upcall hasn't been read; remove op from upcall request
+ * list.
+ */
+ spin_unlock(&op->lock);
+ spin_lock(&orangefs_request_list_lock);
+ list_del_init(&op->list);
+ spin_unlock(&orangefs_request_list_lock);
+ gossip_debug(GOSSIP_WAIT_DEBUG,
+ "Interrupted: Removed op %p from request_list\n",
+ op);
+ } else if (op_state_in_progress(op)) {
+ /* op must be removed from the in progress htable */
+ spin_unlock(&op->lock);
+ spin_lock(&htable_ops_in_progress_lock);
+ list_del_init(&op->list);
+ spin_unlock(&htable_ops_in_progress_lock);
+ gossip_debug(GOSSIP_WAIT_DEBUG,
+ "Interrupted: Removed op %p"
+ " from htable_ops_in_progress\n",
+ op);
+ } else {
+ spin_unlock(&op->lock);
+ gossip_err("interrupted operation is in a weird state 0x%x\n",
+ op->op_state);
+ }
+ reinit_completion(&op->waitq);
+}
+
+/*
+ * Sleeps on waitqueue waiting for matching downcall.
+ * If client-core finishes servicing, then we are good to go.
+ * else if client-core exits, we get woken up here, and retry with a timeout
+ *
+ * When this call returns to the caller, the specified op will no
+ * longer be in either the in_progress hash table or on the request list.
+ *
+ * Returns 0 on success and -errno on failure
+ * Errors are:
+ * EAGAIN in case we want the caller to requeue and try again..
+ * EINTR/EIO/ETIMEDOUT indicating we are done trying to service this
+ * operation since client-core seems to be exiting too often
+ * or if we were interrupted.
+ *
+ * Returns with op->lock taken.
+ */
+static int wait_for_matching_downcall(struct orangefs_kernel_op_s *op,
+ long timeout,
+ bool interruptible)
+{
+ long n;
+
+ /*
+ * There's a "schedule_timeout" inside of these wait
+ * primitives, during which the op is out of the hands of the
+ * user process that needs something done and is being
+ * manipulated by the client-core process.
+ */
+ if (interruptible)
+ n = wait_for_completion_interruptible_timeout(&op->waitq,
+ timeout);
+ else
+ n = wait_for_completion_killable_timeout(&op->waitq, timeout);
+
+ spin_lock(&op->lock);
+
+ if (op_state_serviced(op))
+ return 0;
+
+ if (unlikely(n < 0)) {
+ gossip_debug(GOSSIP_WAIT_DEBUG,
+ "%s: operation interrupted, tag %llu, %p\n",
+ __func__,
+ llu(op->tag),
+ op);
+ return -EINTR;
+ }
+ if (op_state_purged(op)) {
+ gossip_debug(GOSSIP_WAIT_DEBUG,
+ "%s: operation purged, tag %llu, %p, %d\n",
+ __func__,
+ llu(op->tag),
+ op,
+ op->attempts);
+ return (op->attempts < ORANGEFS_PURGE_RETRY_COUNT) ?
+ -EAGAIN :
+ -EIO;
+ }
+ /* must have timed out, then... */
+ gossip_debug(GOSSIP_WAIT_DEBUG,
+ "%s: operation timed out, tag %llu, %p, %d)\n",
+ __func__,
+ llu(op->tag),
+ op,
+ op->attempts);
+ return -ETIMEDOUT;
+}
diff --git a/fs/orangefs/xattr.c b/fs/orangefs/xattr.c
new file mode 100644
index 000000000000..ef5da7538cd5
--- /dev/null
+++ b/fs/orangefs/xattr.c
@@ -0,0 +1,545 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+
+/*
+ * Linux VFS extended attribute operations.
+ */
+
+#include "protocol.h"
+#include "orangefs-kernel.h"
+#include "orangefs-bufmap.h"
+#include <linux/posix_acl_xattr.h>
+#include <linux/xattr.h>
+
+
+#define SYSTEM_ORANGEFS_KEY "system.pvfs2."
+#define SYSTEM_ORANGEFS_KEY_LEN 13
+
+/*
+ * this function returns
+ * 0 if the key corresponding to name is not meant to be printed as part
+ * of a listxattr.
+ * 1 if the key corresponding to name is meant to be returned as part of
+ * a listxattr.
+ * The ones that start SYSTEM_ORANGEFS_KEY are the ones to avoid printing.
+ */
+static int is_reserved_key(const char *key, size_t size)
+{
+
+ if (size < SYSTEM_ORANGEFS_KEY_LEN)
+ return 1;
+
+ return strncmp(key, SYSTEM_ORANGEFS_KEY, SYSTEM_ORANGEFS_KEY_LEN) ? 1 : 0;
+}
+
+static inline int convert_to_internal_xattr_flags(int setxattr_flags)
+{
+ int internal_flag = 0;
+
+ if (setxattr_flags & XATTR_REPLACE) {
+ /* Attribute must exist! */
+ internal_flag = ORANGEFS_XATTR_REPLACE;
+ } else if (setxattr_flags & XATTR_CREATE) {
+ /* Attribute must not exist */
+ internal_flag = ORANGEFS_XATTR_CREATE;
+ }
+ return internal_flag;
+}
+
+
+/*
+ * Tries to get a specified key's attributes of a given
+ * file into a user-specified buffer. Note that the getxattr
+ * interface allows for the users to probe the size of an
+ * extended attribute by passing in a value of 0 to size.
+ * Thus our return value is always the size of the attribute
+ * unless the key does not exist for the file and/or if
+ * there were errors in fetching the attribute value.
+ */
+ssize_t orangefs_inode_getxattr(struct inode *inode, const char *prefix,
+ const char *name, void *buffer, size_t size)
+{
+ struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
+ struct orangefs_kernel_op_s *new_op = NULL;
+ ssize_t ret = -ENOMEM;
+ ssize_t length = 0;
+ int fsuid;
+ int fsgid;
+
+ gossip_debug(GOSSIP_XATTR_DEBUG,
+ "%s: prefix %s name %s, buffer_size %zd\n",
+ __func__, prefix, name, size);
+
+ if (name == NULL || (size > 0 && buffer == NULL)) {
+ gossip_err("orangefs_inode_getxattr: bogus NULL pointers\n");
+ return -EINVAL;
+ }
+ if ((strlen(name) + strlen(prefix)) >= ORANGEFS_MAX_XATTR_NAMELEN) {
+ gossip_err("Invalid key length (%d)\n",
+ (int)(strlen(name) + strlen(prefix)));
+ return -EINVAL;
+ }
+
+ fsuid = from_kuid(current_user_ns(), current_fsuid());
+ fsgid = from_kgid(current_user_ns(), current_fsgid());
+
+ gossip_debug(GOSSIP_XATTR_DEBUG,
+ "getxattr on inode %pU, name %s "
+ "(uid %o, gid %o)\n",
+ get_khandle_from_ino(inode),
+ name,
+ fsuid,
+ fsgid);
+
+ down_read(&orangefs_inode->xattr_sem);
+
+ new_op = op_alloc(ORANGEFS_VFS_OP_GETXATTR);
+ if (!new_op)
+ goto out_unlock;
+
+ new_op->upcall.req.getxattr.refn = orangefs_inode->refn;
+ ret = snprintf((char *)new_op->upcall.req.getxattr.key,
+ ORANGEFS_MAX_XATTR_NAMELEN, "%s%s", prefix, name);
+
+ /*
+ * NOTE: Although keys are meant to be NULL terminated textual
+ * strings, I am going to explicitly pass the length just in case
+ * we change this later on...
+ */
+ new_op->upcall.req.getxattr.key_sz = ret + 1;
+
+ ret = service_operation(new_op, "orangefs_inode_getxattr",
+ get_interruptible_flag(inode));
+ if (ret != 0) {
+ if (ret == -ENOENT) {
+ ret = -ENODATA;
+ gossip_debug(GOSSIP_XATTR_DEBUG,
+ "orangefs_inode_getxattr: inode %pU key %s"
+ " does not exist!\n",
+ get_khandle_from_ino(inode),
+ (char *)new_op->upcall.req.getxattr.key);
+ }
+ goto out_release_op;
+ }
+
+ /*
+ * Length returned includes null terminator.
+ */
+ length = new_op->downcall.resp.getxattr.val_sz;
+
+ /*
+ * Just return the length of the queried attribute.
+ */
+ if (size == 0) {
+ ret = length;
+ goto out_release_op;
+ }
+
+ /*
+ * Check to see if key length is > provided buffer size.
+ */
+ if (length > size) {
+ ret = -ERANGE;
+ goto out_release_op;
+ }
+
+ memset(buffer, 0, size);
+ memcpy(buffer, new_op->downcall.resp.getxattr.val, length);
+ gossip_debug(GOSSIP_XATTR_DEBUG,
+ "orangefs_inode_getxattr: inode %pU "
+ "key %s key_sz %d, val_len %d\n",
+ get_khandle_from_ino(inode),
+ (char *)new_op->
+ upcall.req.getxattr.key,
+ (int)new_op->
+ upcall.req.getxattr.key_sz,
+ (int)ret);
+
+ ret = length;
+
+out_release_op:
+ op_release(new_op);
+out_unlock:
+ up_read(&orangefs_inode->xattr_sem);
+ return ret;
+}
+
+static int orangefs_inode_removexattr(struct inode *inode,
+ const char *prefix,
+ const char *name,
+ int flags)
+{
+ struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
+ struct orangefs_kernel_op_s *new_op = NULL;
+ int ret = -ENOMEM;
+
+ down_write(&orangefs_inode->xattr_sem);
+ new_op = op_alloc(ORANGEFS_VFS_OP_REMOVEXATTR);
+ if (!new_op)
+ goto out_unlock;
+
+ new_op->upcall.req.removexattr.refn = orangefs_inode->refn;
+ /*
+ * NOTE: Although keys are meant to be NULL terminated
+ * textual strings, I am going to explicitly pass the
+ * length just in case we change this later on...
+ */
+ ret = snprintf((char *)new_op->upcall.req.removexattr.key,
+ ORANGEFS_MAX_XATTR_NAMELEN,
+ "%s%s",
+ (prefix ? prefix : ""),
+ name);
+ new_op->upcall.req.removexattr.key_sz = ret + 1;
+
+ gossip_debug(GOSSIP_XATTR_DEBUG,
+ "orangefs_inode_removexattr: key %s, key_sz %d\n",
+ (char *)new_op->upcall.req.removexattr.key,
+ (int)new_op->upcall.req.removexattr.key_sz);
+
+ ret = service_operation(new_op,
+ "orangefs_inode_removexattr",
+ get_interruptible_flag(inode));
+ if (ret == -ENOENT) {
+ /*
+ * Request to replace a non-existent attribute is an error.
+ */
+ if (flags & XATTR_REPLACE)
+ ret = -ENODATA;
+ else
+ ret = 0;
+ }
+
+ gossip_debug(GOSSIP_XATTR_DEBUG,
+ "orangefs_inode_removexattr: returning %d\n", ret);
+
+ op_release(new_op);
+out_unlock:
+ up_write(&orangefs_inode->xattr_sem);
+ return ret;
+}
+
+/*
+ * Tries to set an attribute for a given key on a file.
+ *
+ * Returns a -ve number on error and 0 on success. Key is text, but value
+ * can be binary!
+ */
+int orangefs_inode_setxattr(struct inode *inode, const char *prefix,
+ const char *name, const void *value, size_t size, int flags)
+{
+ struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
+ struct orangefs_kernel_op_s *new_op;
+ int internal_flag = 0;
+ int ret = -ENOMEM;
+
+ gossip_debug(GOSSIP_XATTR_DEBUG,
+ "%s: prefix %s, name %s, buffer_size %zd\n",
+ __func__, prefix, name, size);
+
+ if (size < 0 ||
+ size >= ORANGEFS_MAX_XATTR_VALUELEN ||
+ flags < 0) {
+ gossip_err("orangefs_inode_setxattr: bogus values of size(%d), flags(%d)\n",
+ (int)size,
+ flags);
+ return -EINVAL;
+ }
+
+ if (name == NULL ||
+ (size > 0 && value == NULL)) {
+ gossip_err("orangefs_inode_setxattr: bogus NULL pointers!\n");
+ return -EINVAL;
+ }
+
+ internal_flag = convert_to_internal_xattr_flags(flags);
+
+ if (prefix) {
+ if (strlen(name) + strlen(prefix) >= ORANGEFS_MAX_XATTR_NAMELEN) {
+ gossip_err
+ ("orangefs_inode_setxattr: bogus key size (%d)\n",
+ (int)(strlen(name) + strlen(prefix)));
+ return -EINVAL;
+ }
+ } else {
+ if (strlen(name) >= ORANGEFS_MAX_XATTR_NAMELEN) {
+ gossip_err
+ ("orangefs_inode_setxattr: bogus key size (%d)\n",
+ (int)(strlen(name)));
+ return -EINVAL;
+ }
+ }
+
+ /* This is equivalent to a removexattr */
+ if (size == 0 && value == NULL) {
+ gossip_debug(GOSSIP_XATTR_DEBUG,
+ "removing xattr (%s%s)\n",
+ prefix,
+ name);
+ return orangefs_inode_removexattr(inode, prefix, name, flags);
+ }
+
+ gossip_debug(GOSSIP_XATTR_DEBUG,
+ "setxattr on inode %pU, name %s\n",
+ get_khandle_from_ino(inode),
+ name);
+
+ down_write(&orangefs_inode->xattr_sem);
+ new_op = op_alloc(ORANGEFS_VFS_OP_SETXATTR);
+ if (!new_op)
+ goto out_unlock;
+
+
+ new_op->upcall.req.setxattr.refn = orangefs_inode->refn;
+ new_op->upcall.req.setxattr.flags = internal_flag;
+ /*
+ * NOTE: Although keys are meant to be NULL terminated textual
+ * strings, I am going to explicitly pass the length just in
+ * case we change this later on...
+ */
+ ret = snprintf((char *)new_op->upcall.req.setxattr.keyval.key,
+ ORANGEFS_MAX_XATTR_NAMELEN,
+ "%s%s",
+ prefix, name);
+ new_op->upcall.req.setxattr.keyval.key_sz = ret + 1;
+ memcpy(new_op->upcall.req.setxattr.keyval.val, value, size);
+ new_op->upcall.req.setxattr.keyval.val_sz = size;
+
+ gossip_debug(GOSSIP_XATTR_DEBUG,
+ "orangefs_inode_setxattr: key %s, key_sz %d "
+ " value size %zd\n",
+ (char *)new_op->upcall.req.setxattr.keyval.key,
+ (int)new_op->upcall.req.setxattr.keyval.key_sz,
+ size);
+
+ ret = service_operation(new_op,
+ "orangefs_inode_setxattr",
+ get_interruptible_flag(inode));
+
+ gossip_debug(GOSSIP_XATTR_DEBUG,
+ "orangefs_inode_setxattr: returning %d\n",
+ ret);
+
+ /* when request is serviced properly, free req op struct */
+ op_release(new_op);
+out_unlock:
+ up_write(&orangefs_inode->xattr_sem);
+ return ret;
+}
+
+/*
+ * Tries to get a specified object's keys into a user-specified buffer of a
+ * given size. Note that like the previous instances of xattr routines, this
+ * also allows you to pass in a NULL pointer and 0 size to probe the size for
+ * subsequent memory allocations. Thus our return value is always the size of
+ * all the keys unless there were errors in fetching the keys!
+ */
+ssize_t orangefs_listxattr(struct dentry *dentry, char *buffer, size_t size)
+{
+ struct inode *inode = dentry->d_inode;
+ struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
+ struct orangefs_kernel_op_s *new_op;
+ __u64 token = ORANGEFS_ITERATE_START;
+ ssize_t ret = -ENOMEM;
+ ssize_t total = 0;
+ int count_keys = 0;
+ int key_size;
+ int i = 0;
+ int returned_count = 0;
+
+ if (size > 0 && buffer == NULL) {
+ gossip_err("%s: bogus NULL pointers\n", __func__);
+ return -EINVAL;
+ }
+ if (size < 0) {
+ gossip_err("Invalid size (%d)\n", (int)size);
+ return -EINVAL;
+ }
+
+ down_read(&orangefs_inode->xattr_sem);
+ new_op = op_alloc(ORANGEFS_VFS_OP_LISTXATTR);
+ if (!new_op)
+ goto out_unlock;
+
+ if (buffer && size > 0)
+ memset(buffer, 0, size);
+
+try_again:
+ key_size = 0;
+ new_op->upcall.req.listxattr.refn = orangefs_inode->refn;
+ new_op->upcall.req.listxattr.token = token;
+ new_op->upcall.req.listxattr.requested_count =
+ (size == 0) ? 0 : ORANGEFS_MAX_XATTR_LISTLEN;
+ ret = service_operation(new_op, __func__,
+ get_interruptible_flag(inode));
+ if (ret != 0)
+ goto done;
+
+ if (size == 0) {
+ /*
+ * This is a bit of a big upper limit, but I did not want to
+ * spend too much time getting this correct, since users end
+ * up allocating memory rather than us...
+ */
+ total = new_op->downcall.resp.listxattr.returned_count *
+ ORANGEFS_MAX_XATTR_NAMELEN;
+ goto done;
+ }
+
+ returned_count = new_op->downcall.resp.listxattr.returned_count;
+ if (returned_count < 0 ||
+ returned_count >= ORANGEFS_MAX_XATTR_LISTLEN) {
+ gossip_err("%s: impossible value for returned_count:%d:\n",
+ __func__,
+ returned_count);
+ ret = -EIO;
+ goto done;
+ }
+
+ /*
+ * Check to see how much can be fit in the buffer. Fit only whole keys.
+ */
+ for (i = 0; i < returned_count; i++) {
+ if (new_op->downcall.resp.listxattr.lengths[i] < 0 ||
+ new_op->downcall.resp.listxattr.lengths[i] >
+ ORANGEFS_MAX_XATTR_NAMELEN) {
+ gossip_err("%s: impossible value for lengths[%d]\n",
+ __func__,
+ new_op->downcall.resp.listxattr.lengths[i]);
+ ret = -EIO;
+ goto done;
+ }
+ if (total + new_op->downcall.resp.listxattr.lengths[i] > size)
+ goto done;
+
+ /*
+ * Since many dumb programs try to setxattr() on our reserved
+ * xattrs this is a feeble attempt at defeating those by not
+ * listing them in the output of listxattr.. sigh
+ */
+ if (is_reserved_key(new_op->downcall.resp.listxattr.key +
+ key_size,
+ new_op->downcall.resp.
+ listxattr.lengths[i])) {
+ gossip_debug(GOSSIP_XATTR_DEBUG, "Copying key %d -> %s\n",
+ i, new_op->downcall.resp.listxattr.key +
+ key_size);
+ memcpy(buffer + total,
+ new_op->downcall.resp.listxattr.key + key_size,
+ new_op->downcall.resp.listxattr.lengths[i]);
+ total += new_op->downcall.resp.listxattr.lengths[i];
+ count_keys++;
+ } else {
+ gossip_debug(GOSSIP_XATTR_DEBUG, "[RESERVED] key %d -> %s\n",
+ i, new_op->downcall.resp.listxattr.key +
+ key_size);
+ }
+ key_size += new_op->downcall.resp.listxattr.lengths[i];
+ }
+
+ /*
+ * Since the buffer was large enough, we might have to continue
+ * fetching more keys!
+ */
+ token = new_op->downcall.resp.listxattr.token;
+ if (token != ORANGEFS_ITERATE_END)
+ goto try_again;
+
+done:
+ gossip_debug(GOSSIP_XATTR_DEBUG, "%s: returning %d"
+ " [size of buffer %ld] (filled in %d keys)\n",
+ __func__,
+ ret ? (int)ret : (int)total,
+ (long)size,
+ count_keys);
+ op_release(new_op);
+ if (ret == 0)
+ ret = total;
+out_unlock:
+ up_read(&orangefs_inode->xattr_sem);
+ return ret;
+}
+
+static int orangefs_xattr_set_default(const struct xattr_handler *handler,
+ struct dentry *dentry,
+ const char *name,
+ const void *buffer,
+ size_t size,
+ int flags)
+{
+ return orangefs_inode_setxattr(dentry->d_inode,
+ ORANGEFS_XATTR_NAME_DEFAULT_PREFIX,
+ name,
+ buffer,
+ size,
+ flags);
+}
+
+static int orangefs_xattr_get_default(const struct xattr_handler *handler,
+ struct dentry *dentry,
+ const char *name,
+ void *buffer,
+ size_t size)
+{
+ return orangefs_inode_getxattr(dentry->d_inode,
+ ORANGEFS_XATTR_NAME_DEFAULT_PREFIX,
+ name,
+ buffer,
+ size);
+
+}
+
+static int orangefs_xattr_set_trusted(const struct xattr_handler *handler,
+ struct dentry *dentry,
+ const char *name,
+ const void *buffer,
+ size_t size,
+ int flags)
+{
+ return orangefs_inode_setxattr(dentry->d_inode,
+ ORANGEFS_XATTR_NAME_TRUSTED_PREFIX,
+ name,
+ buffer,
+ size,
+ flags);
+}
+
+static int orangefs_xattr_get_trusted(const struct xattr_handler *handler,
+ struct dentry *dentry,
+ const char *name,
+ void *buffer,
+ size_t size)
+{
+ return orangefs_inode_getxattr(dentry->d_inode,
+ ORANGEFS_XATTR_NAME_TRUSTED_PREFIX,
+ name,
+ buffer,
+ size);
+}
+
+static struct xattr_handler orangefs_xattr_trusted_handler = {
+ .prefix = ORANGEFS_XATTR_NAME_TRUSTED_PREFIX,
+ .get = orangefs_xattr_get_trusted,
+ .set = orangefs_xattr_set_trusted,
+};
+
+static struct xattr_handler orangefs_xattr_default_handler = {
+ /*
+ * NOTE: this is set to be the empty string.
+ * so that all un-prefixed xattrs keys get caught
+ * here!
+ */
+ .prefix = ORANGEFS_XATTR_NAME_DEFAULT_PREFIX,
+ .get = orangefs_xattr_get_default,
+ .set = orangefs_xattr_set_default,
+};
+
+const struct xattr_handler *orangefs_xattr_handlers[] = {
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
+ &orangefs_xattr_trusted_handler,
+ &orangefs_xattr_default_handler,
+ NULL
+};
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index d894e7cd9a86..cc514da6f3e7 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -7,6 +7,7 @@
* the Free Software Foundation.
*/
+#include <linux/module.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/file.h>
@@ -16,10 +17,41 @@
#include <linux/uaccess.h>
#include <linux/sched.h>
#include <linux/namei.h>
+#include <linux/fdtable.h>
+#include <linux/ratelimit.h>
#include "overlayfs.h"
#define OVL_COPY_UP_CHUNK_SIZE (1 << 20)
+static bool __read_mostly ovl_check_copy_up;
+module_param_named(check_copy_up, ovl_check_copy_up, bool,
+ S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(ovl_check_copy_up,
+ "Warn on copy-up when causing process also has a R/O fd open");
+
+static int ovl_check_fd(const void *data, struct file *f, unsigned int fd)
+{
+ const struct dentry *dentry = data;
+
+ if (f->f_inode == d_inode(dentry))
+ pr_warn_ratelimited("overlayfs: Warning: Copying up %pD, but open R/O on fd %u which will cease to be coherent [pid=%d %s]\n",
+ f, fd, current->pid, current->comm);
+ return 0;
+}
+
+/*
+ * Check the fds open by this process and warn if something like the following
+ * scenario is about to occur:
+ *
+ * fd1 = open("foo", O_RDONLY);
+ * fd2 = open("foo", O_RDWR);
+ */
+static void ovl_do_check_copy_up(struct dentry *dentry)
+{
+ if (ovl_check_copy_up)
+ iterate_fd(current->files, 0, ovl_check_fd, dentry);
+}
+
int ovl_copy_xattr(struct dentry *old, struct dentry *new)
{
ssize_t list_size, size, value_size = 0;
@@ -235,6 +267,7 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
if (S_ISREG(stat->mode)) {
struct path upperpath;
+
ovl_path_upper(dentry, &upperpath);
BUG_ON(upperpath.dentry != NULL);
upperpath.dentry = newdentry;
@@ -309,6 +342,8 @@ int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
if (WARN_ON(!workdir))
return -EROFS;
+ ovl_do_check_copy_up(lowerpath->dentry);
+
ovl_path_upper(parent, &parentpath);
upperdir = parentpath.dentry;
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index ed95272d57a6..b3fc0a35bf62 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -596,21 +596,25 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir)
{
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *dir = upperdir->d_inode;
- struct dentry *upper = ovl_dentry_upper(dentry);
+ struct dentry *upper;
int err;
inode_lock_nested(dir, I_MUTEX_PARENT);
+ upper = lookup_one_len(dentry->d_name.name, upperdir,
+ dentry->d_name.len);
+ err = PTR_ERR(upper);
+ if (IS_ERR(upper))
+ goto out_unlock;
+
err = -ESTALE;
- if (upper->d_parent == upperdir) {
- /* Don't let d_delete() think it can reset d_inode */
- dget(upper);
+ if (upper == ovl_dentry_upper(dentry)) {
if (is_dir)
err = vfs_rmdir(dir, upper);
else
err = vfs_unlink(dir, upper, NULL);
- dput(upper);
ovl_dentry_version_inc(dentry->d_parent);
}
+ dput(upper);
/*
* Keeping this dentry hashed would mean having to release
@@ -618,7 +622,9 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir)
* sole user of this dentry. Too tricky... Just unhash for
* now.
*/
- d_drop(dentry);
+ if (!err)
+ d_drop(dentry);
+out_unlock:
inode_unlock(dir);
return err;
@@ -713,7 +719,6 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old,
struct dentry *trap;
bool old_opaque;
bool new_opaque;
- bool new_create = false;
bool cleanup_whiteout = false;
bool overwrite = !(flags & RENAME_EXCHANGE);
bool is_dir = d_is_dir(old);
@@ -839,29 +844,38 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old,
trap = lock_rename(new_upperdir, old_upperdir);
- olddentry = ovl_dentry_upper(old);
- newdentry = ovl_dentry_upper(new);
- if (newdentry) {
+
+ olddentry = lookup_one_len(old->d_name.name, old_upperdir,
+ old->d_name.len);
+ err = PTR_ERR(olddentry);
+ if (IS_ERR(olddentry))
+ goto out_unlock;
+
+ err = -ESTALE;
+ if (olddentry != ovl_dentry_upper(old))
+ goto out_dput_old;
+
+ newdentry = lookup_one_len(new->d_name.name, new_upperdir,
+ new->d_name.len);
+ err = PTR_ERR(newdentry);
+ if (IS_ERR(newdentry))
+ goto out_dput_old;
+
+ err = -ESTALE;
+ if (ovl_dentry_upper(new)) {
if (opaquedir) {
- newdentry = opaquedir;
- opaquedir = NULL;
+ if (newdentry != opaquedir)
+ goto out_dput;
} else {
- dget(newdentry);
+ if (newdentry != ovl_dentry_upper(new))
+ goto out_dput;
}
} else {
- new_create = true;
- newdentry = lookup_one_len(new->d_name.name, new_upperdir,
- new->d_name.len);
- err = PTR_ERR(newdentry);
- if (IS_ERR(newdentry))
- goto out_unlock;
+ if (!d_is_negative(newdentry) &&
+ (!new_opaque || !ovl_is_whiteout(newdentry)))
+ goto out_dput;
}
- err = -ESTALE;
- if (olddentry->d_parent != old_upperdir)
- goto out_dput;
- if (newdentry->d_parent != new_upperdir)
- goto out_dput;
if (olddentry == trap)
goto out_dput;
if (newdentry == trap)
@@ -903,6 +917,13 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old,
if (!overwrite && new_is_dir && !old_opaque && new_opaque)
ovl_remove_opaque(newdentry);
+ /*
+ * Old dentry now lives in different location. Dentries in
+ * lowerstack are stale. We cannot drop them here because
+ * access to them is lockless. This could be only pure upper
+ * or opaque directory - numlower is zero. Or upper non-dir
+ * entry - its pureness is tracked by flag opaque.
+ */
if (old_opaque != new_opaque) {
ovl_dentry_set_opaque(old, new_opaque);
if (!overwrite)
@@ -917,6 +938,8 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old,
out_dput:
dput(newdentry);
+out_dput_old:
+ dput(olddentry);
out_unlock:
unlock_rename(new_upperdir, old_upperdir);
out_revert_creds:
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 49e204560655..a4ff5d0d7db9 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -65,6 +65,8 @@ int ovl_setattr(struct dentry *dentry, struct iattr *attr)
inode_lock(upperdentry->d_inode);
err = notify_change(upperdentry, attr, NULL);
+ if (!err)
+ ovl_copyattr(upperdentry->d_inode, dentry->d_inode);
inode_unlock(upperdentry->d_inode);
}
ovl_drop_write(dentry);
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 99b4168c36ff..6a7090f4a441 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -166,6 +166,7 @@ extern const struct file_operations ovl_dir_operations;
int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list);
void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list);
void ovl_cache_free(struct list_head *list);
+int ovl_check_d_type_supported(struct path *realpath);
/* inode.c */
int ovl_setattr(struct dentry *dentry, struct iattr *attr);
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index fdaf28f75e12..6ec1e43a9a54 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -36,13 +36,14 @@ struct ovl_dir_cache {
struct ovl_readdir_data {
struct dir_context ctx;
- bool is_merge;
+ bool is_lowest;
struct rb_root root;
struct list_head *list;
struct list_head middle;
struct ovl_cache_entry *first_maybe_whiteout;
int count;
int err;
+ bool d_type_supported;
};
struct ovl_dir_file {
@@ -139,9 +140,9 @@ static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
return 0;
}
-static int ovl_fill_lower(struct ovl_readdir_data *rdd,
- const char *name, int namelen,
- loff_t offset, u64 ino, unsigned int d_type)
+static int ovl_fill_lowest(struct ovl_readdir_data *rdd,
+ const char *name, int namelen,
+ loff_t offset, u64 ino, unsigned int d_type)
{
struct ovl_cache_entry *p;
@@ -193,10 +194,10 @@ static int ovl_fill_merge(struct dir_context *ctx, const char *name,
container_of(ctx, struct ovl_readdir_data, ctx);
rdd->count++;
- if (!rdd->is_merge)
+ if (!rdd->is_lowest)
return ovl_cache_entry_add_rb(rdd, name, namelen, ino, d_type);
else
- return ovl_fill_lower(rdd, name, namelen, offset, ino, d_type);
+ return ovl_fill_lowest(rdd, name, namelen, offset, ino, d_type);
}
static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd)
@@ -289,7 +290,7 @@ static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list)
.ctx.actor = ovl_fill_merge,
.list = list,
.root = RB_ROOT,
- .is_merge = false,
+ .is_lowest = false,
};
int idx, next;
@@ -306,7 +307,7 @@ static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list)
* allows offsets to be reasonably constant
*/
list_add(&rdd.middle, rdd.list);
- rdd.is_merge = true;
+ rdd.is_lowest = true;
err = ovl_dir_read(&realpath, &rdd);
list_del(&rdd.middle);
}
@@ -577,3 +578,39 @@ void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list)
}
inode_unlock(upper->d_inode);
}
+
+static int ovl_check_d_type(struct dir_context *ctx, const char *name,
+ int namelen, loff_t offset, u64 ino,
+ unsigned int d_type)
+{
+ struct ovl_readdir_data *rdd =
+ container_of(ctx, struct ovl_readdir_data, ctx);
+
+ /* Even if d_type is not supported, DT_DIR is returned for . and .. */
+ if (!strncmp(name, ".", namelen) || !strncmp(name, "..", namelen))
+ return 0;
+
+ if (d_type != DT_UNKNOWN)
+ rdd->d_type_supported = true;
+
+ return 0;
+}
+
+/*
+ * Returns 1 if d_type is supported, 0 not supported/unknown. Negative values
+ * if error is encountered.
+ */
+int ovl_check_d_type_supported(struct path *realpath)
+{
+ int err;
+ struct ovl_readdir_data rdd = {
+ .ctx.actor = ovl_check_d_type,
+ .d_type_supported = false,
+ };
+
+ err = ovl_dir_read(realpath, &rdd);
+ if (err)
+ return err;
+
+ return rdd.d_type_supported;
+}
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 8d826bd56b26..ef64984c9bbc 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -76,12 +76,14 @@ enum ovl_path_type ovl_path_type(struct dentry *dentry)
if (oe->__upperdentry) {
type = __OVL_PATH_UPPER;
- if (oe->numlower) {
- if (S_ISDIR(dentry->d_inode->i_mode))
- type |= __OVL_PATH_MERGE;
- } else if (!oe->opaque) {
+ /*
+ * Non-dir dentry can hold lower dentry from previous
+ * location. Its purity depends only on opaque flag.
+ */
+ if (oe->numlower && S_ISDIR(dentry->d_inode->i_mode))
+ type |= __OVL_PATH_MERGE;
+ else if (!oe->opaque)
type |= __OVL_PATH_PURE;
- }
} else {
if (oe->numlower > 1)
type |= __OVL_PATH_MERGE;
@@ -341,6 +343,7 @@ static const struct dentry_operations ovl_dentry_operations = {
static const struct dentry_operations ovl_reval_dentry_operations = {
.d_release = ovl_dentry_release,
+ .d_select_inode = ovl_d_select_inode,
.d_revalidate = ovl_dentry_revalidate,
.d_weak_revalidate = ovl_dentry_weak_revalidate,
};
@@ -933,7 +936,8 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
err = -EINVAL;
if (!ufs->config.lowerdir) {
- pr_err("overlayfs: missing 'lowerdir'\n");
+ if (!silent)
+ pr_err("overlayfs: missing 'lowerdir'\n");
goto out_free_config;
}
@@ -1025,6 +1029,21 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
sb->s_flags |= MS_RDONLY;
ufs->workdir = NULL;
}
+
+ /*
+ * Upper should support d_type, else whiteouts are visible.
+ * Given workdir and upper are on same fs, we can do
+ * iterate_dir() on workdir.
+ */
+ err = ovl_check_d_type_supported(&workpath);
+ if (err < 0)
+ goto out_put_workdir;
+
+ if (!err) {
+ pr_err("overlayfs: upper fs needs to support d_type.\n");
+ err = -EINVAL;
+ goto out_put_workdir;
+ }
}
err = -ENOMEM;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 4f764c2ac1a5..b1755b23893e 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -434,7 +434,7 @@ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns,
&& !lookup_symbol_name(wchan, symname))
seq_printf(m, "%s", symname);
else
- seq_putc(m, '0');
+ seq_puts(m, "0\n");
return 0;
}
@@ -2158,6 +2158,7 @@ static const struct file_operations proc_map_files_operations = {
.llseek = default_llseek,
};
+#ifdef CONFIG_CHECKPOINT_RESTORE
struct timers_private {
struct pid *pid;
struct task_struct *task;
@@ -2256,6 +2257,73 @@ static const struct file_operations proc_timers_operations = {
.llseek = seq_lseek,
.release = seq_release_private,
};
+#endif
+
+static ssize_t timerslack_ns_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *offset)
+{
+ struct inode *inode = file_inode(file);
+ struct task_struct *p;
+ u64 slack_ns;
+ int err;
+
+ err = kstrtoull_from_user(buf, count, 10, &slack_ns);
+ if (err < 0)
+ return err;
+
+ p = get_proc_task(inode);
+ if (!p)
+ return -ESRCH;
+
+ if (ptrace_may_access(p, PTRACE_MODE_ATTACH_FSCREDS)) {
+ task_lock(p);
+ if (slack_ns == 0)
+ p->timer_slack_ns = p->default_timer_slack_ns;
+ else
+ p->timer_slack_ns = slack_ns;
+ task_unlock(p);
+ } else
+ count = -EPERM;
+
+ put_task_struct(p);
+
+ return count;
+}
+
+static int timerslack_ns_show(struct seq_file *m, void *v)
+{
+ struct inode *inode = m->private;
+ struct task_struct *p;
+ int err = 0;
+
+ p = get_proc_task(inode);
+ if (!p)
+ return -ESRCH;
+
+ if (ptrace_may_access(p, PTRACE_MODE_ATTACH_FSCREDS)) {
+ task_lock(p);
+ seq_printf(m, "%llu\n", p->timer_slack_ns);
+ task_unlock(p);
+ } else
+ err = -EPERM;
+
+ put_task_struct(p);
+
+ return err;
+}
+
+static int timerslack_ns_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, timerslack_ns_show, inode);
+}
+
+static const struct file_operations proc_pid_set_timerslack_ns_operations = {
+ .open = timerslack_ns_open,
+ .read = seq_read,
+ .write = timerslack_ns_write,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
static int proc_pident_instantiate(struct inode *dir,
struct dentry *dentry, struct task_struct *task, const void *ptr)
@@ -2831,6 +2899,7 @@ static const struct pid_entry tgid_base_stuff[] = {
#ifdef CONFIG_CHECKPOINT_RESTORE
REG("timers", S_IRUGO, proc_timers_operations),
#endif
+ REG("timerslack_ns", S_IRUGO|S_IWUGO, proc_pid_set_timerslack_ns_operations),
};
static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index df4661abadc4..83720460c5bc 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -29,10 +29,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
unsigned long committed;
long cached;
long available;
- unsigned long pagecache;
- unsigned long wmark_low = 0;
unsigned long pages[NR_LRU_LISTS];
- struct zone *zone;
int lru;
/*
@@ -51,33 +48,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
pages[lru] = global_page_state(NR_LRU_BASE + lru);
- for_each_zone(zone)
- wmark_low += zone->watermark[WMARK_LOW];
-
- /*
- * Estimate the amount of memory available for userspace allocations,
- * without causing swapping.
- */
- available = i.freeram - totalreserve_pages;
-
- /*
- * Not all the page cache can be freed, otherwise the system will
- * start swapping. Assume at least half of the page cache, or the
- * low watermark worth of cache, needs to stay.
- */
- pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE];
- pagecache -= min(pagecache / 2, wmark_low);
- available += pagecache;
-
- /*
- * Part of the reclaimable slab consists of items that are in use,
- * and cannot be freed. Cap this estimate at the low watermark.
- */
- available += global_page_state(NR_SLAB_RECLAIMABLE) -
- min(global_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low);
-
- if (available < 0)
- available = 0;
+ available = si_mem_available();
/*
* Tagged format, for easy grepping and expansion.
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index 276f12431dbf..72cb26f85d58 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -28,6 +28,9 @@ static const struct proc_ns_operations *ns_entries[] = {
&userns_operations,
#endif
&mntns_operations,
+#ifdef CONFIG_CGROUPS
+ &cgroupns_operations,
+#endif
};
static const char *proc_ns_get_link(struct dentry *dentry,
diff --git a/fs/proc/page.c b/fs/proc/page.c
index b2855eea5405..712f1b9992cc 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -103,9 +103,9 @@ u64 stable_page_flags(struct page *page)
* pseudo flags for the well known (anonymous) memory mapped pages
*
* Note that page->_mapcount is overloaded in SLOB/SLUB/SLQB, so the
- * simple test in page_mapcount() is not enough.
+ * simple test in page_mapped() is not enough.
*/
- if (!PageSlab(page) && page_mapcount(page))
+ if (!PageSlab(page) && page_mapped(page))
u |= 1 << KPF_MMAP;
if (PageAnon(page))
u |= 1 << KPF_ANON;
@@ -148,6 +148,8 @@ u64 stable_page_flags(struct page *page)
*/
if (PageBuddy(page))
u |= 1 << KPF_BUDDY;
+ else if (page_count(page) == 0 && is_free_buddy_page(page))
+ u |= 1 << KPF_BUDDY;
if (PageBalloon(page))
u |= 1 << KPF_BALLOON;
@@ -158,6 +160,8 @@ u64 stable_page_flags(struct page *page)
u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked);
u |= kpf_copy_bit(k, KPF_SLAB, PG_slab);
+ if (PageTail(page) && PageSlab(compound_head(page)))
+ u |= 1 << KPF_SLAB;
u |= kpf_copy_bit(k, KPF_ERROR, PG_error);
u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index fa95ab2d3674..9df431642042 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -660,11 +660,20 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
[ilog2(VM_MERGEABLE)] = "mg",
[ilog2(VM_UFFD_MISSING)]= "um",
[ilog2(VM_UFFD_WP)] = "uw",
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+ /* These come out via ProtectionKey: */
+ [ilog2(VM_PKEY_BIT0)] = "",
+ [ilog2(VM_PKEY_BIT1)] = "",
+ [ilog2(VM_PKEY_BIT2)] = "",
+ [ilog2(VM_PKEY_BIT3)] = "",
+#endif
};
size_t i;
seq_puts(m, "VmFlags: ");
for (i = 0; i < BITS_PER_LONG; i++) {
+ if (!mnemonics[i][0])
+ continue;
if (vma->vm_flags & (1UL << i)) {
seq_printf(m, "%c%c ",
mnemonics[i][0], mnemonics[i][1]);
@@ -702,6 +711,10 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
}
#endif /* HUGETLB_PAGE */
+void __weak arch_show_smap(struct seq_file *m, struct vm_area_struct *vma)
+{
+}
+
static int show_smap(struct seq_file *m, void *v, int is_pid)
{
struct vm_area_struct *vma = v;
@@ -783,6 +796,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
(vma->vm_flags & VM_LOCKED) ?
(unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0);
+ arch_show_smap(m, vma);
show_smap_vma_flags(m, vma);
m_cache_vma(m, vma);
return 0;
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 4e61388ec03d..55bb57e6a30d 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -231,7 +231,9 @@ static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos,
list_for_each_entry(m, &vmcore_list, list) {
if (*fpos < m->offset + m->size) {
- tsz = min_t(size_t, m->offset + m->size - *fpos, buflen);
+ tsz = (size_t)min_t(unsigned long long,
+ m->offset + m->size - *fpos,
+ buflen);
start = m->paddr + *fpos - m->offset;
tmp = read_from_oldmem(buffer, tsz, &start, userbuf);
if (tmp < 0)
@@ -461,7 +463,8 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
if (start < m->offset + m->size) {
u64 paddr = 0;
- tsz = min_t(size_t, m->offset + m->size - start, size);
+ tsz = (size_t)min_t(unsigned long long,
+ m->offset + m->size - start, size);
paddr = m->paddr + start - m->offset;
if (vmcore_remap_oldmem_pfn(vma, vma->vm_start + len,
paddr >> PAGE_SHIFT, tsz,
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 2256e7e23e67..3f1190d18991 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -199,6 +199,8 @@ static int show_vfsstat(struct seq_file *m, struct vfsmount *mnt)
if (sb->s_op->show_devname) {
seq_puts(m, "device ");
err = sb->s_op->show_devname(m, mnt_path.dentry);
+ if (err)
+ goto out;
} else {
if (r->mnt_devname) {
seq_puts(m, "device ");
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 319c3a60cfa5..bd9812e83461 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -55,8 +55,8 @@ static ulong ramoops_pmsg_size = MIN_MEM_SIZE;
module_param_named(pmsg_size, ramoops_pmsg_size, ulong, 0400);
MODULE_PARM_DESC(pmsg_size, "size of user space message log");
-static ulong mem_address;
-module_param(mem_address, ulong, 0400);
+static unsigned long long mem_address;
+module_param(mem_address, ullong, 0400);
MODULE_PARM_DESC(mem_address,
"start of reserved RAM used to store oops/panic logs");
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 3c3b81bb6dfe..ba827daea5a0 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -411,6 +411,8 @@ int dquot_acquire(struct dquot *dquot)
ret = dqopt->ops[dquot->dq_id.type]->read_dqblk(dquot);
if (ret < 0)
goto out_iolock;
+ /* Make sure flags update is visible after dquot has been filled */
+ smp_mb__before_atomic();
set_bit(DQ_READ_B, &dquot->dq_flags);
/* Instantiate dquot if needed */
if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags) && !dquot->dq_off) {
@@ -427,6 +429,11 @@ int dquot_acquire(struct dquot *dquot)
goto out_iolock;
}
}
+ /*
+ * Make sure flags update is visible after on-disk struct has been
+ * allocated. Paired with smp_rmb() in dqget().
+ */
+ smp_mb__before_atomic();
set_bit(DQ_ACTIVE_B, &dquot->dq_flags);
out_iolock:
mutex_unlock(&dqopt->dqio_mutex);
@@ -887,6 +894,11 @@ we_slept:
goto out;
}
}
+ /*
+ * Make sure following reads see filled structure - paired with
+ * smp_mb__before_atomic() in dquot_acquire().
+ */
+ smp_rmb();
#ifdef CONFIG_QUOTA_DEBUG
BUG_ON(!dquot->dq_sb); /* Has somebody invalidated entry under us? */
#endif
@@ -1398,7 +1410,7 @@ static int dquot_active(const struct inode *inode)
static int __dquot_initialize(struct inode *inode, int type)
{
int cnt, init_needed = 0;
- struct dquot **dquots, *got[MAXQUOTAS];
+ struct dquot **dquots, *got[MAXQUOTAS] = {};
struct super_block *sb = inode->i_sb;
qsize_t rsv;
int ret = 0;
@@ -1415,7 +1427,6 @@ static int __dquot_initialize(struct inode *inode, int type)
int rc;
struct dquot *dquot;
- got[cnt] = NULL;
if (type != -1 && cnt != type)
continue;
/*
@@ -2031,6 +2042,21 @@ int dquot_commit_info(struct super_block *sb, int type)
}
EXPORT_SYMBOL(dquot_commit_info);
+int dquot_get_next_id(struct super_block *sb, struct kqid *qid)
+{
+ struct quota_info *dqopt = sb_dqopt(sb);
+ int err;
+
+ if (!dqopt->ops[qid->type]->get_next_id)
+ return -ENOSYS;
+ mutex_lock(&dqopt->dqio_mutex);
+ err = dqopt->ops[qid->type]->get_next_id(sb, qid);
+ mutex_unlock(&dqopt->dqio_mutex);
+
+ return err;
+}
+EXPORT_SYMBOL(dquot_get_next_id);
+
/*
* Definitions of diskquota operations.
*/
@@ -2042,6 +2068,7 @@ const struct dquot_operations dquot_operations = {
.write_info = dquot_commit_info,
.alloc_dquot = dquot_alloc,
.destroy_dquot = dquot_destroy,
+ .get_next_id = dquot_get_next_id,
};
EXPORT_SYMBOL(dquot_operations);
@@ -2430,9 +2457,7 @@ int dquot_quota_on_mount(struct super_block *sb, char *qf_name,
struct dentry *dentry;
int error;
- inode_lock(d_inode(sb->s_root));
- dentry = lookup_one_len(qf_name, sb->s_root, strlen(qf_name));
- inode_unlock(d_inode(sb->s_root));
+ dentry = lookup_one_len_unlocked(qf_name, sb->s_root, strlen(qf_name));
if (IS_ERR(dentry))
return PTR_ERR(dentry);
@@ -2565,6 +2590,27 @@ int dquot_get_dqblk(struct super_block *sb, struct kqid qid,
}
EXPORT_SYMBOL(dquot_get_dqblk);
+int dquot_get_next_dqblk(struct super_block *sb, struct kqid *qid,
+ struct qc_dqblk *di)
+{
+ struct dquot *dquot;
+ int err;
+
+ if (!sb->dq_op->get_next_id)
+ return -ENOSYS;
+ err = sb->dq_op->get_next_id(sb, qid);
+ if (err < 0)
+ return err;
+ dquot = dqget(sb, *qid);
+ if (IS_ERR(dquot))
+ return PTR_ERR(dquot);
+ do_get_dqblk(dquot, di);
+ dqput(dquot);
+
+ return 0;
+}
+EXPORT_SYMBOL(dquot_get_next_dqblk);
+
#define VFS_QC_MASK \
(QC_SPACE | QC_SPC_SOFT | QC_SPC_HARD | \
QC_INO_COUNT | QC_INO_SOFT | QC_INO_HARD | \
@@ -2765,6 +2811,7 @@ const struct quotactl_ops dquot_quotactl_ops = {
.get_state = dquot_get_state,
.set_info = dquot_set_dqinfo,
.get_dqblk = dquot_get_dqblk,
+ .get_nextdqblk = dquot_get_next_dqblk,
.set_dqblk = dquot_set_dqblk
};
EXPORT_SYMBOL(dquot_quotactl_ops);
@@ -2776,6 +2823,7 @@ const struct quotactl_ops dquot_quotactl_sysfile_ops = {
.get_state = dquot_get_state,
.set_info = dquot_set_dqinfo,
.get_dqblk = dquot_get_dqblk,
+ .get_nextdqblk = dquot_get_next_dqblk,
.set_dqblk = dquot_set_dqblk
};
EXPORT_SYMBOL(dquot_quotactl_sysfile_ops);
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 3746367098fd..0f10ee9892ce 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -79,7 +79,7 @@ unsigned int qtype_enforce_flag(int type)
return 0;
}
-static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id,
+static int quota_quotaon(struct super_block *sb, int type, qid_t id,
struct path *path)
{
if (!sb->s_qcop->quota_on && !sb->s_qcop->quota_enable)
@@ -222,6 +222,34 @@ static int quota_getquota(struct super_block *sb, int type, qid_t id,
return 0;
}
+/*
+ * Return quota for next active quota >= this id, if any exists,
+ * otherwise return -ENOENT via ->get_nextdqblk
+ */
+static int quota_getnextquota(struct super_block *sb, int type, qid_t id,
+ void __user *addr)
+{
+ struct kqid qid;
+ struct qc_dqblk fdq;
+ struct if_nextdqblk idq;
+ int ret;
+
+ if (!sb->s_qcop->get_nextdqblk)
+ return -ENOSYS;
+ qid = make_kqid(current_user_ns(), type, id);
+ if (!qid_valid(qid))
+ return -EINVAL;
+ ret = sb->s_qcop->get_nextdqblk(sb, &qid, &fdq);
+ if (ret)
+ return ret;
+ /* struct if_nextdqblk is a superset of struct if_dqblk */
+ copy_to_if_dqblk((struct if_dqblk *)&idq, &fdq);
+ idq.dqb_id = from_kqid(current_user_ns(), qid);
+ if (copy_to_user(addr, &idq, sizeof(idq)))
+ return -EFAULT;
+ return 0;
+}
+
static void copy_from_if_dqblk(struct qc_dqblk *dst, struct if_dqblk *src)
{
dst->d_spc_hardlimit = qbtos(src->dqb_bhardlimit);
@@ -625,6 +653,34 @@ static int quota_getxquota(struct super_block *sb, int type, qid_t id,
return ret;
}
+/*
+ * Return quota for next active quota >= this id, if any exists,
+ * otherwise return -ENOENT via ->get_nextdqblk.
+ */
+static int quota_getnextxquota(struct super_block *sb, int type, qid_t id,
+ void __user *addr)
+{
+ struct fs_disk_quota fdq;
+ struct qc_dqblk qdq;
+ struct kqid qid;
+ qid_t id_out;
+ int ret;
+
+ if (!sb->s_qcop->get_nextdqblk)
+ return -ENOSYS;
+ qid = make_kqid(current_user_ns(), type, id);
+ if (!qid_valid(qid))
+ return -EINVAL;
+ ret = sb->s_qcop->get_nextdqblk(sb, &qid, &qdq);
+ if (ret)
+ return ret;
+ id_out = from_kqid(current_user_ns(), qid);
+ copy_to_xfs_dqblk(&fdq, &qdq, type, id_out);
+ if (copy_to_user(addr, &fdq, sizeof(fdq)))
+ return -EFAULT;
+ return ret;
+}
+
static int quota_rmxquota(struct super_block *sb, void __user *addr)
{
__u32 flags;
@@ -659,7 +715,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
switch (cmd) {
case Q_QUOTAON:
- return quota_quotaon(sb, type, cmd, id, path);
+ return quota_quotaon(sb, type, id, path);
case Q_QUOTAOFF:
return quota_quotaoff(sb, type);
case Q_GETFMT:
@@ -670,6 +726,8 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
return quota_setinfo(sb, type, addr);
case Q_GETQUOTA:
return quota_getquota(sb, type, id, addr);
+ case Q_GETNEXTQUOTA:
+ return quota_getnextquota(sb, type, id, addr);
case Q_SETQUOTA:
return quota_setquota(sb, type, id, addr);
case Q_SYNC:
@@ -690,6 +748,8 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
return quota_setxquota(sb, type, id, addr);
case Q_XGETQUOTA:
return quota_getxquota(sb, type, id, addr);
+ case Q_XGETNEXTQUOTA:
+ return quota_getnextxquota(sb, type, id, addr);
case Q_XQUOTASYNC:
if (sb->s_flags & MS_RDONLY)
return -EROFS;
@@ -705,6 +765,11 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
/* Return 1 if 'cmd' will block on frozen filesystem */
static int quotactl_cmd_write(int cmd)
{
+ /*
+ * We cannot allow Q_GETQUOTA and Q_GETNEXTQUOTA without write access
+ * as dquot_acquire() may allocate space for new structure and OCFS2
+ * needs to increment on-disk use count.
+ */
switch (cmd) {
case Q_GETFMT:
case Q_GETINFO:
@@ -712,6 +777,7 @@ static int quotactl_cmd_write(int cmd)
case Q_XGETQSTAT:
case Q_XGETQSTATV:
case Q_XGETQUOTA:
+ case Q_XGETNEXTQUOTA:
case Q_XQUOTASYNC:
return 0;
}
diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c
index 58efb83dec1c..0738972e8d3f 100644
--- a/fs/quota/quota_tree.c
+++ b/fs/quota/quota_tree.c
@@ -22,10 +22,9 @@ MODULE_LICENSE("GPL");
#define __QUOTA_QT_PARANOIA
-static int get_index(struct qtree_mem_dqinfo *info, struct kqid qid, int depth)
+static int __get_index(struct qtree_mem_dqinfo *info, qid_t id, int depth)
{
unsigned int epb = info->dqi_usable_bs >> 2;
- qid_t id = from_kqid(&init_user_ns, qid);
depth = info->dqi_qtree_depth - depth - 1;
while (depth--)
@@ -33,6 +32,13 @@ static int get_index(struct qtree_mem_dqinfo *info, struct kqid qid, int depth)
return id % epb;
}
+static int get_index(struct qtree_mem_dqinfo *info, struct kqid qid, int depth)
+{
+ qid_t id = from_kqid(&init_user_ns, qid);
+
+ return __get_index(info, id, depth);
+}
+
/* Number of entries in one blocks */
static int qtree_dqstr_in_blk(struct qtree_mem_dqinfo *info)
{
@@ -668,3 +674,60 @@ int qtree_release_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
return 0;
}
EXPORT_SYMBOL(qtree_release_dquot);
+
+static int find_next_id(struct qtree_mem_dqinfo *info, qid_t *id,
+ unsigned int blk, int depth)
+{
+ char *buf = getdqbuf(info->dqi_usable_bs);
+ __le32 *ref = (__le32 *)buf;
+ ssize_t ret;
+ unsigned int epb = info->dqi_usable_bs >> 2;
+ unsigned int level_inc = 1;
+ int i;
+
+ if (!buf)
+ return -ENOMEM;
+
+ for (i = depth; i < info->dqi_qtree_depth - 1; i++)
+ level_inc *= epb;
+
+ ret = read_blk(info, blk, buf);
+ if (ret < 0) {
+ quota_error(info->dqi_sb,
+ "Can't read quota tree block %u", blk);
+ goto out_buf;
+ }
+ for (i = __get_index(info, *id, depth); i < epb; i++) {
+ if (ref[i] == cpu_to_le32(0)) {
+ *id += level_inc;
+ continue;
+ }
+ if (depth == info->dqi_qtree_depth - 1) {
+ ret = 0;
+ goto out_buf;
+ }
+ ret = find_next_id(info, id, le32_to_cpu(ref[i]), depth + 1);
+ if (ret != -ENOENT)
+ break;
+ }
+ if (i == epb) {
+ ret = -ENOENT;
+ goto out_buf;
+ }
+out_buf:
+ kfree(buf);
+ return ret;
+}
+
+int qtree_get_next_id(struct qtree_mem_dqinfo *info, struct kqid *qid)
+{
+ qid_t id = from_kqid(&init_user_ns, *qid);
+ int ret;
+
+ ret = find_next_id(info, &id, QT_TREEOFF, 0);
+ if (ret < 0)
+ return ret;
+ *qid = make_kqid(&init_user_ns, qid->type, id);
+ return 0;
+}
+EXPORT_SYMBOL(qtree_get_next_id);
diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c
index ed85d4f35c04..ca71bf881ad1 100644
--- a/fs/quota/quota_v2.c
+++ b/fs/quota/quota_v2.c
@@ -304,6 +304,11 @@ static int v2_free_file_info(struct super_block *sb, int type)
return 0;
}
+static int v2_get_next_id(struct super_block *sb, struct kqid *qid)
+{
+ return qtree_get_next_id(sb_dqinfo(sb, qid->type)->dqi_priv, qid);
+}
+
static const struct quota_format_ops v2_format_ops = {
.check_quota_file = v2_check_quota_file,
.read_file_info = v2_read_file_info,
@@ -312,6 +317,7 @@ static const struct quota_format_ops v2_format_ops = {
.read_dqblk = v2_read_dquot,
.commit_dqblk = v2_write_dquot,
.release_dqblk = v2_release_dquot,
+ .get_next_id = v2_get_next_id,
};
static struct quota_format_type v2r0_quota_format = {
diff --git a/fs/read_write.c b/fs/read_write.c
index dadf24e5c95b..cf377cf9dfe3 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -693,12 +693,17 @@ unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
EXPORT_SYMBOL(iov_shorten);
static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
- loff_t *ppos, iter_fn_t fn)
+ loff_t *ppos, iter_fn_t fn, int flags)
{
struct kiocb kiocb;
ssize_t ret;
+ if (flags & ~RWF_HIPRI)
+ return -EOPNOTSUPP;
+
init_sync_kiocb(&kiocb, filp);
+ if (flags & RWF_HIPRI)
+ kiocb.ki_flags |= IOCB_HIPRI;
kiocb.ki_pos = *ppos;
ret = fn(&kiocb, iter);
@@ -709,10 +714,13 @@ static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
/* Do it by hand, with file-ops */
static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
- loff_t *ppos, io_fn_t fn)
+ loff_t *ppos, io_fn_t fn, int flags)
{
ssize_t ret = 0;
+ if (flags & ~RWF_HIPRI)
+ return -EOPNOTSUPP;
+
while (iov_iter_count(iter)) {
struct iovec iovec = iov_iter_iovec(iter);
ssize_t nr;
@@ -813,7 +821,8 @@ out:
static ssize_t do_readv_writev(int type, struct file *file,
const struct iovec __user * uvector,
- unsigned long nr_segs, loff_t *pos)
+ unsigned long nr_segs, loff_t *pos,
+ int flags)
{
size_t tot_len;
struct iovec iovstack[UIO_FASTIOV];
@@ -845,9 +854,9 @@ static ssize_t do_readv_writev(int type, struct file *file,
}
if (iter_fn)
- ret = do_iter_readv_writev(file, &iter, pos, iter_fn);
+ ret = do_iter_readv_writev(file, &iter, pos, iter_fn, flags);
else
- ret = do_loop_readv_writev(file, &iter, pos, fn);
+ ret = do_loop_readv_writev(file, &iter, pos, fn, flags);
if (type != READ)
file_end_write(file);
@@ -864,40 +873,40 @@ out:
}
ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
- unsigned long vlen, loff_t *pos)
+ unsigned long vlen, loff_t *pos, int flags)
{
if (!(file->f_mode & FMODE_READ))
return -EBADF;
if (!(file->f_mode & FMODE_CAN_READ))
return -EINVAL;
- return do_readv_writev(READ, file, vec, vlen, pos);
+ return do_readv_writev(READ, file, vec, vlen, pos, flags);
}
EXPORT_SYMBOL(vfs_readv);
ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
- unsigned long vlen, loff_t *pos)
+ unsigned long vlen, loff_t *pos, int flags)
{
if (!(file->f_mode & FMODE_WRITE))
return -EBADF;
if (!(file->f_mode & FMODE_CAN_WRITE))
return -EINVAL;
- return do_readv_writev(WRITE, file, vec, vlen, pos);
+ return do_readv_writev(WRITE, file, vec, vlen, pos, flags);
}
EXPORT_SYMBOL(vfs_writev);
-SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
- unsigned long, vlen)
+static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec,
+ unsigned long vlen, int flags)
{
struct fd f = fdget_pos(fd);
ssize_t ret = -EBADF;
if (f.file) {
loff_t pos = file_pos_read(f.file);
- ret = vfs_readv(f.file, vec, vlen, &pos);
+ ret = vfs_readv(f.file, vec, vlen, &pos, flags);
if (ret >= 0)
file_pos_write(f.file, pos);
fdput_pos(f);
@@ -909,15 +918,15 @@ SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
return ret;
}
-SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
- unsigned long, vlen)
+static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec,
+ unsigned long vlen, int flags)
{
struct fd f = fdget_pos(fd);
ssize_t ret = -EBADF;
if (f.file) {
loff_t pos = file_pos_read(f.file);
- ret = vfs_writev(f.file, vec, vlen, &pos);
+ ret = vfs_writev(f.file, vec, vlen, &pos, flags);
if (ret >= 0)
file_pos_write(f.file, pos);
fdput_pos(f);
@@ -935,10 +944,9 @@ static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
}
-SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
- unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
+static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec,
+ unsigned long vlen, loff_t pos, int flags)
{
- loff_t pos = pos_from_hilo(pos_h, pos_l);
struct fd f;
ssize_t ret = -EBADF;
@@ -949,7 +957,7 @@ SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
if (f.file) {
ret = -ESPIPE;
if (f.file->f_mode & FMODE_PREAD)
- ret = vfs_readv(f.file, vec, vlen, &pos);
+ ret = vfs_readv(f.file, vec, vlen, &pos, flags);
fdput(f);
}
@@ -959,10 +967,9 @@ SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
return ret;
}
-SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
- unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
+static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec,
+ unsigned long vlen, loff_t pos, int flags)
{
- loff_t pos = pos_from_hilo(pos_h, pos_l);
struct fd f;
ssize_t ret = -EBADF;
@@ -973,7 +980,7 @@ SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
if (f.file) {
ret = -ESPIPE;
if (f.file->f_mode & FMODE_PWRITE)
- ret = vfs_writev(f.file, vec, vlen, &pos);
+ ret = vfs_writev(f.file, vec, vlen, &pos, flags);
fdput(f);
}
@@ -983,11 +990,64 @@ SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
return ret;
}
+SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
+ unsigned long, vlen)
+{
+ return do_readv(fd, vec, vlen, 0);
+}
+
+SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
+ unsigned long, vlen)
+{
+ return do_writev(fd, vec, vlen, 0);
+}
+
+SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
+ unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
+{
+ loff_t pos = pos_from_hilo(pos_h, pos_l);
+
+ return do_preadv(fd, vec, vlen, pos, 0);
+}
+
+SYSCALL_DEFINE6(preadv2, unsigned long, fd, const struct iovec __user *, vec,
+ unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
+ int, flags)
+{
+ loff_t pos = pos_from_hilo(pos_h, pos_l);
+
+ if (pos == -1)
+ return do_readv(fd, vec, vlen, flags);
+
+ return do_preadv(fd, vec, vlen, pos, flags);
+}
+
+SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
+ unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
+{
+ loff_t pos = pos_from_hilo(pos_h, pos_l);
+
+ return do_pwritev(fd, vec, vlen, pos, 0);
+}
+
+SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec,
+ unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
+ int, flags)
+{
+ loff_t pos = pos_from_hilo(pos_h, pos_l);
+
+ if (pos == -1)
+ return do_writev(fd, vec, vlen, flags);
+
+ return do_pwritev(fd, vec, vlen, pos, flags);
+}
+
#ifdef CONFIG_COMPAT
static ssize_t compat_do_readv_writev(int type, struct file *file,
const struct compat_iovec __user *uvector,
- unsigned long nr_segs, loff_t *pos)
+ unsigned long nr_segs, loff_t *pos,
+ int flags)
{
compat_ssize_t tot_len;
struct iovec iovstack[UIO_FASTIOV];
@@ -1019,9 +1079,9 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
}
if (iter_fn)
- ret = do_iter_readv_writev(file, &iter, pos, iter_fn);
+ ret = do_iter_readv_writev(file, &iter, pos, iter_fn, flags);
else
- ret = do_loop_readv_writev(file, &iter, pos, fn);
+ ret = do_loop_readv_writev(file, &iter, pos, fn, flags);
if (type != READ)
file_end_write(file);
@@ -1039,7 +1099,7 @@ out:
static size_t compat_readv(struct file *file,
const struct compat_iovec __user *vec,
- unsigned long vlen, loff_t *pos)
+ unsigned long vlen, loff_t *pos, int flags)
{
ssize_t ret = -EBADF;
@@ -1050,7 +1110,7 @@ static size_t compat_readv(struct file *file,
if (!(file->f_mode & FMODE_CAN_READ))
goto out;
- ret = compat_do_readv_writev(READ, file, vec, vlen, pos);
+ ret = compat_do_readv_writev(READ, file, vec, vlen, pos, flags);
out:
if (ret > 0)
@@ -1059,9 +1119,9 @@ out:
return ret;
}
-COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
- const struct compat_iovec __user *,vec,
- compat_ulong_t, vlen)
+static size_t do_compat_readv(compat_ulong_t fd,
+ const struct compat_iovec __user *vec,
+ compat_ulong_t vlen, int flags)
{
struct fd f = fdget_pos(fd);
ssize_t ret;
@@ -1070,16 +1130,24 @@ COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
if (!f.file)
return -EBADF;
pos = f.file->f_pos;
- ret = compat_readv(f.file, vec, vlen, &pos);
+ ret = compat_readv(f.file, vec, vlen, &pos, flags);
if (ret >= 0)
f.file->f_pos = pos;
fdput_pos(f);
return ret;
+
+}
+
+COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
+ const struct compat_iovec __user *,vec,
+ compat_ulong_t, vlen)
+{
+ return do_compat_readv(fd, vec, vlen, 0);
}
-static long __compat_sys_preadv64(unsigned long fd,
+static long do_compat_preadv64(unsigned long fd,
const struct compat_iovec __user *vec,
- unsigned long vlen, loff_t pos)
+ unsigned long vlen, loff_t pos, int flags)
{
struct fd f;
ssize_t ret;
@@ -1091,7 +1159,7 @@ static long __compat_sys_preadv64(unsigned long fd,
return -EBADF;
ret = -ESPIPE;
if (f.file->f_mode & FMODE_PREAD)
- ret = compat_readv(f.file, vec, vlen, &pos);
+ ret = compat_readv(f.file, vec, vlen, &pos, flags);
fdput(f);
return ret;
}
@@ -1101,7 +1169,7 @@ COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
const struct compat_iovec __user *,vec,
unsigned long, vlen, loff_t, pos)
{
- return __compat_sys_preadv64(fd, vec, vlen, pos);
+ return do_compat_preadv64(fd, vec, vlen, pos, 0);
}
#endif
@@ -1111,12 +1179,25 @@ COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
{
loff_t pos = ((loff_t)pos_high << 32) | pos_low;
- return __compat_sys_preadv64(fd, vec, vlen, pos);
+ return do_compat_preadv64(fd, vec, vlen, pos, 0);
+}
+
+COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd,
+ const struct compat_iovec __user *,vec,
+ compat_ulong_t, vlen, u32, pos_low, u32, pos_high,
+ int, flags)
+{
+ loff_t pos = ((loff_t)pos_high << 32) | pos_low;
+
+ if (pos == -1)
+ return do_compat_readv(fd, vec, vlen, flags);
+
+ return do_compat_preadv64(fd, vec, vlen, pos, flags);
}
static size_t compat_writev(struct file *file,
const struct compat_iovec __user *vec,
- unsigned long vlen, loff_t *pos)
+ unsigned long vlen, loff_t *pos, int flags)
{
ssize_t ret = -EBADF;
@@ -1127,7 +1208,7 @@ static size_t compat_writev(struct file *file,
if (!(file->f_mode & FMODE_CAN_WRITE))
goto out;
- ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos);
+ ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos, 0);
out:
if (ret > 0)
@@ -1136,9 +1217,9 @@ out:
return ret;
}
-COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
- const struct compat_iovec __user *, vec,
- compat_ulong_t, vlen)
+static size_t do_compat_writev(compat_ulong_t fd,
+ const struct compat_iovec __user* vec,
+ compat_ulong_t vlen, int flags)
{
struct fd f = fdget_pos(fd);
ssize_t ret;
@@ -1147,16 +1228,23 @@ COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
if (!f.file)
return -EBADF;
pos = f.file->f_pos;
- ret = compat_writev(f.file, vec, vlen, &pos);
+ ret = compat_writev(f.file, vec, vlen, &pos, flags);
if (ret >= 0)
f.file->f_pos = pos;
fdput_pos(f);
return ret;
}
-static long __compat_sys_pwritev64(unsigned long fd,
+COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
+ const struct compat_iovec __user *, vec,
+ compat_ulong_t, vlen)
+{
+ return do_compat_writev(fd, vec, vlen, 0);
+}
+
+static long do_compat_pwritev64(unsigned long fd,
const struct compat_iovec __user *vec,
- unsigned long vlen, loff_t pos)
+ unsigned long vlen, loff_t pos, int flags)
{
struct fd f;
ssize_t ret;
@@ -1168,7 +1256,7 @@ static long __compat_sys_pwritev64(unsigned long fd,
return -EBADF;
ret = -ESPIPE;
if (f.file->f_mode & FMODE_PWRITE)
- ret = compat_writev(f.file, vec, vlen, &pos);
+ ret = compat_writev(f.file, vec, vlen, &pos, flags);
fdput(f);
return ret;
}
@@ -1178,7 +1266,7 @@ COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
const struct compat_iovec __user *,vec,
unsigned long, vlen, loff_t, pos)
{
- return __compat_sys_pwritev64(fd, vec, vlen, pos);
+ return do_compat_pwritev64(fd, vec, vlen, pos, 0);
}
#endif
@@ -1188,8 +1276,21 @@ COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
{
loff_t pos = ((loff_t)pos_high << 32) | pos_low;
- return __compat_sys_pwritev64(fd, vec, vlen, pos);
+ return do_compat_pwritev64(fd, vec, vlen, pos, 0);
+}
+
+COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd,
+ const struct compat_iovec __user *,vec,
+ compat_ulong_t, vlen, u32, pos_low, u32, pos_high, int, flags)
+{
+ loff_t pos = ((loff_t)pos_high << 32) | pos_low;
+
+ if (pos == -1)
+ return do_compat_writev(fd, vec, vlen, flags);
+
+ return do_compat_pwritev64(fd, vec, vlen, pos, flags);
}
+
#endif
static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index c0306ec8ed7b..b8f2d1e8c645 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -802,6 +802,7 @@ static const struct dquot_operations reiserfs_quota_operations = {
.write_info = reiserfs_write_info,
.alloc_dquot = dquot_alloc,
.destroy_dquot = dquot_destroy,
+ .get_next_id = dquot_get_next_id,
};
static const struct quotactl_ops reiserfs_qctl_operations = {
diff --git a/fs/select.c b/fs/select.c
index 79d0d4953cad..869293988c2a 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -70,9 +70,9 @@ static long __estimate_accuracy(struct timespec *tv)
return slack;
}
-long select_estimate_accuracy(struct timespec *tv)
+u64 select_estimate_accuracy(struct timespec *tv)
{
- unsigned long ret;
+ u64 ret;
struct timespec now;
/*
@@ -402,7 +402,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
struct poll_wqueues table;
poll_table *wait;
int retval, i, timed_out = 0;
- unsigned long slack = 0;
+ u64 slack = 0;
unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
unsigned long busy_end = 0;
@@ -784,7 +784,7 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
poll_table* pt = &wait->pt;
ktime_t expire, *to = NULL;
int timed_out = 0, count = 0;
- unsigned long slack = 0;
+ u64 slack = 0;
unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
unsigned long busy_end = 0;
diff --git a/fs/splice.c b/fs/splice.c
index 82bc0d64fc38..9947b5c69664 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -185,6 +185,9 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
unsigned int spd_pages = spd->nr_pages;
int ret, do_wakeup, page_nr;
+ if (!spd_pages)
+ return 0;
+
ret = 0;
do_wakeup = 0;
page_nr = 0;
@@ -577,7 +580,7 @@ static ssize_t kernel_readv(struct file *file, const struct iovec *vec,
old_fs = get_fs();
set_fs(get_ds());
/* The cast to a user pointer is valid due to the set_fs() */
- res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos);
+ res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos, 0);
set_fs(old_fs);
return res;
diff --git a/fs/super.c b/fs/super.c
index 1182af8fd5ff..74914b1bae70 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -415,6 +415,7 @@ void generic_shutdown_super(struct super_block *sb)
sb->s_flags &= ~MS_ACTIVE;
fsnotify_unmount_inodes(sb);
+ cgroup_writeback_umount();
evict_inodes(sb);
diff --git a/fs/ubifs/Makefile b/fs/ubifs/Makefile
index 2c6f0cb816b4..c54a24360f85 100644
--- a/fs/ubifs/Makefile
+++ b/fs/ubifs/Makefile
@@ -4,3 +4,4 @@ ubifs-y += shrinker.o journal.o file.o dir.o super.o sb.o io.o
ubifs-y += tnc.o master.o scan.o replay.o log.o commit.o gc.o orphan.o
ubifs-y += budget.o find.o tnc_commit.o compress.o lpt.o lprops.o
ubifs-y += recovery.o ioctl.o lpt_commit.o tnc_misc.o xattr.o debug.o
+ubifs-y += misc.o
diff --git a/fs/ubifs/misc.c b/fs/ubifs/misc.c
new file mode 100644
index 000000000000..486a2844949f
--- /dev/null
+++ b/fs/ubifs/misc.c
@@ -0,0 +1,57 @@
+#include <linux/kernel.h>
+#include "ubifs.h"
+
+/* Normal UBIFS messages */
+void ubifs_msg(const struct ubifs_info *c, const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, fmt);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ pr_notice("UBIFS (ubi%d:%d): %pV\n",
+ c->vi.ubi_num, c->vi.vol_id, &vaf);
+
+ va_end(args);
+} \
+
+/* UBIFS error messages */
+void ubifs_err(const struct ubifs_info *c, const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, fmt);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ pr_err("UBIFS error (ubi%d:%d pid %d): %ps: %pV\n",
+ c->vi.ubi_num, c->vi.vol_id, current->pid,
+ __builtin_return_address(0),
+ &vaf);
+
+ va_end(args);
+} \
+
+/* UBIFS warning messages */
+void ubifs_warn(const struct ubifs_info *c, const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, fmt);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ pr_warn("UBIFS warning (ubi%d:%d pid %d): %ps: %pV\n",
+ c->vi.ubi_num, c->vi.vol_id, current->pid,
+ __builtin_return_address(0),
+ &vaf);
+
+ va_end(args);
+}
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index a5697de763f5..c2a57e193a81 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -42,30 +42,6 @@
/* Version of this UBIFS implementation */
#define UBIFS_VERSION 1
-/* Normal UBIFS messages */
-#define ubifs_msg(c, fmt, ...) \
- pr_notice("UBIFS (ubi%d:%d): " fmt "\n", \
- (c)->vi.ubi_num, (c)->vi.vol_id, ##__VA_ARGS__)
-/* UBIFS error messages */
-#define ubifs_err(c, fmt, ...) \
- pr_err("UBIFS error (ubi%d:%d pid %d): %s: " fmt "\n", \
- (c)->vi.ubi_num, (c)->vi.vol_id, current->pid, \
- __func__, ##__VA_ARGS__)
-/* UBIFS warning messages */
-#define ubifs_warn(c, fmt, ...) \
- pr_warn("UBIFS warning (ubi%d:%d pid %d): %s: " fmt "\n", \
- (c)->vi.ubi_num, (c)->vi.vol_id, current->pid, \
- __func__, ##__VA_ARGS__)
-/*
- * A variant of 'ubifs_err()' which takes the UBIFS file-sytem description
- * object as an argument.
- */
-#define ubifs_errc(c, fmt, ...) \
- do { \
- if (!(c)->probing) \
- ubifs_err(c, fmt, ##__VA_ARGS__); \
- } while (0)
-
/* UBIFS file system VFS magic number */
#define UBIFS_SUPER_MAGIC 0x24051905
@@ -1802,4 +1778,21 @@ int ubifs_decompress(const struct ubifs_info *c, const void *buf, int len,
#include "misc.h"
#include "key.h"
+/* Normal UBIFS messages */
+__printf(2, 3)
+void ubifs_msg(const struct ubifs_info *c, const char *fmt, ...);
+__printf(2, 3)
+void ubifs_err(const struct ubifs_info *c, const char *fmt, ...);
+__printf(2, 3)
+void ubifs_warn(const struct ubifs_info *c, const char *fmt, ...);
+/*
+ * A variant of 'ubifs_err()' which takes the UBIFS file-sytem description
+ * object as an argument.
+ */
+#define ubifs_errc(c, fmt, ...) \
+do { \
+ if (!(c)->probing) \
+ ubifs_err(c, fmt, ##__VA_ARGS__); \
+} while (0)
+
#endif /* !__UBIFS_H__ */
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index c7f4d434d098..b043e044121d 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -59,7 +59,6 @@
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/xattr.h>
-#include <linux/posix_acl_xattr.h>
/*
* Limit the number of extended attributes per inode so that the total size
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index 541d9c65014d..b51b371b874a 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -45,7 +45,7 @@ static int udf_readdir(struct file *file, struct dir_context *ctx)
int block, iblock;
loff_t nf_pos;
int flen;
- unsigned char *fname = NULL;
+ unsigned char *fname = NULL, *copy_name = NULL;
unsigned char *nameptr;
uint16_t liu;
uint8_t lfi;
@@ -143,7 +143,15 @@ static int udf_readdir(struct file *file, struct dir_context *ctx)
if (poffset >= lfi) {
nameptr = (char *)(fibh.ebh->b_data + poffset - lfi);
} else {
- nameptr = fname;
+ if (!copy_name) {
+ copy_name = kmalloc(UDF_NAME_LEN,
+ GFP_NOFS);
+ if (!copy_name) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+ nameptr = copy_name;
memcpy(nameptr, fi->fileIdent + liu,
lfi - poffset);
memcpy(nameptr + lfi - poffset,
@@ -185,6 +193,7 @@ out:
brelse(fibh.sbh);
brelse(epos.bh);
kfree(fname);
+ kfree(copy_name);
return ret;
}
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 42eafb91f7ff..a2ba11eca995 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -165,7 +165,7 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
struct fileIdentDesc *fi = NULL;
loff_t f_pos;
int block, flen;
- unsigned char *fname = NULL;
+ unsigned char *fname = NULL, *copy_name = NULL;
unsigned char *nameptr;
uint8_t lfi;
uint16_t liu;
@@ -236,7 +236,15 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
nameptr = (uint8_t *)(fibh->ebh->b_data +
poffset - lfi);
else {
- nameptr = fname;
+ if (!copy_name) {
+ copy_name = kmalloc(UDF_NAME_LEN,
+ GFP_NOFS);
+ if (!copy_name) {
+ fi = ERR_PTR(-ENOMEM);
+ goto out_err;
+ }
+ }
+ nameptr = copy_name;
memcpy(nameptr, fi->fileIdent + liu,
lfi - poffset);
memcpy(nameptr + lfi - poffset,
@@ -279,6 +287,7 @@ out_err:
out_ok:
brelse(epos.bh);
kfree(fname);
+ kfree(copy_name);
return fi;
}
@@ -291,7 +300,7 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
struct udf_fileident_bh fibh;
struct fileIdentDesc *fi;
- if (dentry->d_name.len > UDF_NAME_LEN - 2)
+ if (dentry->d_name.len > UDF_NAME_LEN)
return ERR_PTR(-ENAMETOOLONG);
#ifdef UDF_RECOVERY
@@ -351,7 +360,7 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir,
struct udf_inode_info *dinfo;
fibh->sbh = fibh->ebh = NULL;
- name = kmalloc(UDF_NAME_LEN, GFP_NOFS);
+ name = kmalloc(UDF_NAME_LEN_CS0, GFP_NOFS);
if (!name) {
*err = -ENOMEM;
goto out_err;
@@ -362,8 +371,9 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir,
*err = -EINVAL;
goto out_err;
}
- namelen = udf_put_filename(sb, dentry->d_name.name, name,
- dentry->d_name.len);
+ namelen = udf_put_filename(sb, dentry->d_name.name,
+ dentry->d_name.len,
+ name, UDF_NAME_LEN_CS0);
if (!namelen) {
*err = -ENAMETOOLONG;
goto out_err;
@@ -914,7 +924,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
iinfo = UDF_I(inode);
down_write(&iinfo->i_data_sem);
- name = kmalloc(UDF_NAME_LEN, GFP_NOFS);
+ name = kmalloc(UDF_NAME_LEN_CS0, GFP_NOFS);
if (!name) {
err = -ENOMEM;
goto out_no_entry;
@@ -997,8 +1007,9 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
}
if (pc->componentType == 5) {
- namelen = udf_put_filename(sb, compstart, name,
- symname - compstart);
+ namelen = udf_put_filename(sb, compstart,
+ symname - compstart,
+ name, UDF_NAME_LEN_CS0);
if (!namelen)
goto out_no_entry;
diff --git a/fs/udf/super.c b/fs/udf/super.c
index a522c15a0bfd..fa92fe839fda 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -887,18 +887,14 @@ static int udf_find_fileset(struct super_block *sb,
static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
{
struct primaryVolDesc *pvoldesc;
- struct ustr *instr, *outstr;
+ uint8_t *outstr;
struct buffer_head *bh;
uint16_t ident;
int ret = -ENOMEM;
- instr = kmalloc(sizeof(struct ustr), GFP_NOFS);
- if (!instr)
- return -ENOMEM;
-
- outstr = kmalloc(sizeof(struct ustr), GFP_NOFS);
+ outstr = kmalloc(128, GFP_NOFS);
if (!outstr)
- goto out1;
+ return -ENOMEM;
bh = udf_read_tagged(sb, block, block, &ident);
if (!bh) {
@@ -923,31 +919,25 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
#endif
}
- if (!udf_build_ustr(instr, pvoldesc->volIdent, 32)) {
- ret = udf_CS0toUTF8(outstr, instr);
- if (ret < 0)
- goto out_bh;
+ ret = udf_CS0toUTF8(outstr, 31, pvoldesc->volIdent, 32);
+ if (ret < 0)
+ goto out_bh;
- strncpy(UDF_SB(sb)->s_volume_ident, outstr->u_name,
- outstr->u_len > 31 ? 31 : outstr->u_len);
- udf_debug("volIdent[] = '%s'\n", UDF_SB(sb)->s_volume_ident);
- }
+ strncpy(UDF_SB(sb)->s_volume_ident, outstr, ret);
+ udf_debug("volIdent[] = '%s'\n", UDF_SB(sb)->s_volume_ident);
- if (!udf_build_ustr(instr, pvoldesc->volSetIdent, 128)) {
- ret = udf_CS0toUTF8(outstr, instr);
- if (ret < 0)
- goto out_bh;
+ ret = udf_CS0toUTF8(outstr, 127, pvoldesc->volSetIdent, 128);
+ if (ret < 0)
+ goto out_bh;
- udf_debug("volSetIdent[] = '%s'\n", outstr->u_name);
- }
+ outstr[ret] = 0;
+ udf_debug("volSetIdent[] = '%s'\n", outstr);
ret = 0;
out_bh:
brelse(bh);
out2:
kfree(outstr);
-out1:
- kfree(instr);
return ret;
}
@@ -2358,7 +2348,7 @@ static int udf_statfs(struct dentry *dentry, struct kstatfs *buf)
le32_to_cpu(lvidiu->numDirs)) : 0)
+ buf->f_bfree;
buf->f_ffree = buf->f_bfree;
- buf->f_namelen = UDF_NAME_LEN - 2;
+ buf->f_namelen = UDF_NAME_LEN;
buf->f_fsid.val[0] = (u32)id;
buf->f_fsid.val[1] = (u32)(id >> 32);
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index fa0044b6b81d..972b70625614 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -49,8 +49,8 @@ extern __printf(3, 4) void _udf_warn(struct super_block *sb,
#define UDF_EXTENT_FLAG_MASK 0xC0000000
#define UDF_NAME_PAD 4
-#define UDF_NAME_LEN 256
-#define UDF_PATH_LEN 1023
+#define UDF_NAME_LEN 254
+#define UDF_NAME_LEN_CS0 255
static inline size_t udf_file_entry_alloc_offset(struct inode *inode)
{
@@ -106,12 +106,6 @@ struct generic_desc {
__le32 volDescSeqNum;
};
-struct ustr {
- uint8_t u_cmpID;
- uint8_t u_name[UDF_NAME_LEN - 2];
- uint8_t u_len;
-};
-
/* super.c */
@@ -214,12 +208,11 @@ udf_get_lb_pblock(struct super_block *sb, struct kernel_lb_addr *loc,
}
/* unicode.c */
-extern int udf_get_filename(struct super_block *, uint8_t *, int, uint8_t *,
- int);
-extern int udf_put_filename(struct super_block *, const uint8_t *, uint8_t *,
- int);
-extern int udf_build_ustr(struct ustr *, dstring *, int);
-extern int udf_CS0toUTF8(struct ustr *, const struct ustr *);
+extern int udf_get_filename(struct super_block *, const uint8_t *, int,
+ uint8_t *, int);
+extern int udf_put_filename(struct super_block *, const uint8_t *, int,
+ uint8_t *, int);
+extern int udf_CS0toUTF8(uint8_t *, int, const uint8_t *, int);
/* ialloc.c */
extern void udf_free_inode(struct inode *);
diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index e788a05aab83..3ff42f4437f3 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -28,199 +28,72 @@
#include "udf_sb.h"
-static int udf_translate_to_linux(uint8_t *, int, uint8_t *, int, uint8_t *,
- int);
-
-static int udf_char_to_ustr(struct ustr *dest, const uint8_t *src, int strlen)
-{
- if ((!dest) || (!src) || (!strlen) || (strlen > UDF_NAME_LEN - 2))
- return 0;
-
- memset(dest, 0, sizeof(struct ustr));
- memcpy(dest->u_name, src, strlen);
- dest->u_cmpID = 0x08;
- dest->u_len = strlen;
-
- return strlen;
-}
-
-/*
- * udf_build_ustr
- */
-int udf_build_ustr(struct ustr *dest, dstring *ptr, int size)
-{
- int usesize;
-
- if (!dest || !ptr || !size)
- return -1;
- BUG_ON(size < 2);
-
- usesize = min_t(size_t, ptr[size - 1], sizeof(dest->u_name));
- usesize = min(usesize, size - 2);
- dest->u_cmpID = ptr[0];
- dest->u_len = usesize;
- memcpy(dest->u_name, ptr + 1, usesize);
- memset(dest->u_name + usesize, 0, sizeof(dest->u_name) - usesize);
-
- return 0;
-}
-
-/*
- * udf_build_ustr_exact
- */
-static void udf_build_ustr_exact(struct ustr *dest, dstring *ptr, int exactsize)
-{
- memset(dest, 0, sizeof(struct ustr));
- dest->u_cmpID = ptr[0];
- dest->u_len = exactsize - 1;
- memcpy(dest->u_name, ptr + 1, exactsize - 1);
-}
-
-/*
- * udf_CS0toUTF8
- *
- * PURPOSE
- * Convert OSTA Compressed Unicode to the UTF-8 equivalent.
- *
- * PRE-CONDITIONS
- * utf Pointer to UTF-8 output buffer.
- * ocu Pointer to OSTA Compressed Unicode input buffer
- * of size UDF_NAME_LEN bytes.
- * both of type "struct ustr *"
- *
- * POST-CONDITIONS
- * <return> >= 0 on success.
- *
- * HISTORY
- * November 12, 1997 - Andrew E. Mileski
- * Written, tested, and released.
- */
-int udf_CS0toUTF8(struct ustr *utf_o, const struct ustr *ocu_i)
+static int udf_uni2char_utf8(wchar_t uni,
+ unsigned char *out,
+ int boundlen)
{
- const uint8_t *ocu;
- uint8_t cmp_id, ocu_len;
- int i;
-
- ocu_len = ocu_i->u_len;
- if (ocu_len == 0) {
- memset(utf_o, 0, sizeof(struct ustr));
- return 0;
- }
-
- cmp_id = ocu_i->u_cmpID;
- if (cmp_id != 8 && cmp_id != 16) {
- memset(utf_o, 0, sizeof(struct ustr));
- pr_err("unknown compression code (%d) stri=%s\n",
- cmp_id, ocu_i->u_name);
- return -EINVAL;
- }
-
- ocu = ocu_i->u_name;
- utf_o->u_len = 0;
- for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN - 3));) {
-
- /* Expand OSTA compressed Unicode to Unicode */
- uint32_t c = ocu[i++];
- if (cmp_id == 16)
- c = (c << 8) | ocu[i++];
-
- /* Compress Unicode to UTF-8 */
- if (c < 0x80U)
- utf_o->u_name[utf_o->u_len++] = (uint8_t)c;
- else if (c < 0x800U) {
- if (utf_o->u_len > (UDF_NAME_LEN - 4))
- break;
- utf_o->u_name[utf_o->u_len++] =
- (uint8_t)(0xc0 | (c >> 6));
- utf_o->u_name[utf_o->u_len++] =
- (uint8_t)(0x80 | (c & 0x3f));
- } else {
- if (utf_o->u_len > (UDF_NAME_LEN - 5))
- break;
- utf_o->u_name[utf_o->u_len++] =
- (uint8_t)(0xe0 | (c >> 12));
- utf_o->u_name[utf_o->u_len++] =
- (uint8_t)(0x80 |
- ((c >> 6) & 0x3f));
- utf_o->u_name[utf_o->u_len++] =
- (uint8_t)(0x80 | (c & 0x3f));
- }
+ int u_len = 0;
+
+ if (boundlen <= 0)
+ return -ENAMETOOLONG;
+
+ if (uni < 0x80) {
+ out[u_len++] = (unsigned char)uni;
+ } else if (uni < 0x800) {
+ if (boundlen < 2)
+ return -ENAMETOOLONG;
+ out[u_len++] = (unsigned char)(0xc0 | (uni >> 6));
+ out[u_len++] = (unsigned char)(0x80 | (uni & 0x3f));
+ } else {
+ if (boundlen < 3)
+ return -ENAMETOOLONG;
+ out[u_len++] = (unsigned char)(0xe0 | (uni >> 12));
+ out[u_len++] = (unsigned char)(0x80 | ((uni >> 6) & 0x3f));
+ out[u_len++] = (unsigned char)(0x80 | (uni & 0x3f));
}
- utf_o->u_cmpID = 8;
-
- return utf_o->u_len;
+ return u_len;
}
-/*
- *
- * udf_UTF8toCS0
- *
- * PURPOSE
- * Convert UTF-8 to the OSTA Compressed Unicode equivalent.
- *
- * DESCRIPTION
- * This routine is only called by udf_lookup().
- *
- * PRE-CONDITIONS
- * ocu Pointer to OSTA Compressed Unicode output
- * buffer of size UDF_NAME_LEN bytes.
- * utf Pointer to UTF-8 input buffer.
- * utf_len Length of UTF-8 input buffer in bytes.
- *
- * POST-CONDITIONS
- * <return> Zero on success.
- *
- * HISTORY
- * November 12, 1997 - Andrew E. Mileski
- * Written, tested, and released.
- */
-static int udf_UTF8toCS0(dstring *ocu, struct ustr *utf, int length)
+static int udf_char2uni_utf8(const unsigned char *in,
+ int boundlen,
+ wchar_t *uni)
{
- unsigned c, i, max_val, utf_char;
- int utf_cnt, u_len, u_ch;
-
- memset(ocu, 0, sizeof(dstring) * length);
- ocu[0] = 8;
- max_val = 0xffU;
- u_ch = 1;
+ unsigned int utf_char;
+ unsigned char c;
+ int utf_cnt, u_len;
-try_again:
- u_len = 0U;
- utf_char = 0U;
- utf_cnt = 0U;
- for (i = 0U; i < utf->u_len; i++) {
- /* Name didn't fit? */
- if (u_len + 1 + u_ch >= length)
- return 0;
-
- c = (uint8_t)utf->u_name[i];
+ utf_char = 0;
+ utf_cnt = 0;
+ for (u_len = 0; u_len < boundlen;) {
+ c = in[u_len++];
/* Complete a multi-byte UTF-8 character */
if (utf_cnt) {
- utf_char = (utf_char << 6) | (c & 0x3fU);
+ utf_char = (utf_char << 6) | (c & 0x3f);
if (--utf_cnt)
continue;
} else {
/* Check for a multi-byte UTF-8 character */
- if (c & 0x80U) {
+ if (c & 0x80) {
/* Start a multi-byte UTF-8 character */
- if ((c & 0xe0U) == 0xc0U) {
- utf_char = c & 0x1fU;
+ if ((c & 0xe0) == 0xc0) {
+ utf_char = c & 0x1f;
utf_cnt = 1;
- } else if ((c & 0xf0U) == 0xe0U) {
- utf_char = c & 0x0fU;
+ } else if ((c & 0xf0) == 0xe0) {
+ utf_char = c & 0x0f;
utf_cnt = 2;
- } else if ((c & 0xf8U) == 0xf0U) {
- utf_char = c & 0x07U;
+ } else if ((c & 0xf8) == 0xf0) {
+ utf_char = c & 0x07;
utf_cnt = 3;
- } else if ((c & 0xfcU) == 0xf8U) {
- utf_char = c & 0x03U;
+ } else if ((c & 0xfc) == 0xf8) {
+ utf_char = c & 0x03;
utf_cnt = 4;
- } else if ((c & 0xfeU) == 0xfcU) {
- utf_char = c & 0x01U;
+ } else if ((c & 0xfe) == 0xfc) {
+ utf_char = c & 0x01;
utf_cnt = 5;
} else {
- goto error_out;
+ utf_cnt = -1;
+ break;
}
continue;
} else {
@@ -228,97 +101,216 @@ try_again:
utf_char = c;
}
}
-
- /* Choose no compression if necessary */
- if (utf_char > max_val) {
- if (max_val == 0xffU) {
- max_val = 0xffffU;
- ocu[0] = (uint8_t)0x10U;
- u_ch = 2;
- goto try_again;
- }
- goto error_out;
- }
-
- if (max_val == 0xffffU)
- ocu[++u_len] = (uint8_t)(utf_char >> 8);
- ocu[++u_len] = (uint8_t)(utf_char & 0xffU);
+ *uni = utf_char;
+ break;
}
-
if (utf_cnt) {
-error_out:
- ocu[++u_len] = '?';
- printk(KERN_DEBUG pr_fmt("bad UTF-8 character\n"));
+ *uni = '?';
+ return -EINVAL;
}
+ return u_len;
+}
- ocu[length - 1] = (uint8_t)u_len + 1;
+#define ILLEGAL_CHAR_MARK '_'
+#define EXT_MARK '.'
+#define CRC_MARK '#'
+#define EXT_SIZE 5
+/* Number of chars we need to store generated CRC to make filename unique */
+#define CRC_LEN 5
+
+static int udf_name_conv_char(uint8_t *str_o, int str_o_max_len,
+ int *str_o_idx,
+ const uint8_t *str_i, int str_i_max_len,
+ int *str_i_idx,
+ int u_ch, int *needsCRC,
+ int (*conv_f)(wchar_t, unsigned char *, int),
+ int translate)
+{
+ uint32_t c;
+ int illChar = 0;
+ int len, gotch = 0;
+
+ for (; (!gotch) && (*str_i_idx < str_i_max_len); *str_i_idx += u_ch) {
+ if (*str_o_idx >= str_o_max_len) {
+ *needsCRC = 1;
+ return gotch;
+ }
- return u_len + 1;
+ /* Expand OSTA compressed Unicode to Unicode */
+ c = str_i[*str_i_idx];
+ if (u_ch > 1)
+ c = (c << 8) | str_i[*str_i_idx + 1];
+
+ if (translate && (c == '/' || c == 0))
+ illChar = 1;
+ else if (illChar)
+ break;
+ else
+ gotch = 1;
+ }
+ if (illChar) {
+ *needsCRC = 1;
+ c = ILLEGAL_CHAR_MARK;
+ gotch = 1;
+ }
+ if (gotch) {
+ len = conv_f(c, &str_o[*str_o_idx], str_o_max_len - *str_o_idx);
+ /* Valid character? */
+ if (len >= 0)
+ *str_o_idx += len;
+ else if (len == -ENAMETOOLONG) {
+ *needsCRC = 1;
+ gotch = 0;
+ } else {
+ str_o[(*str_o_idx)++] = '?';
+ *needsCRC = 1;
+ }
+ }
+ return gotch;
}
-static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o,
- const struct ustr *ocu_i)
+static int udf_name_from_CS0(uint8_t *str_o, int str_max_len,
+ const uint8_t *ocu, int ocu_len,
+ int (*conv_f)(wchar_t, unsigned char *, int),
+ int translate)
{
- const uint8_t *ocu;
- uint8_t cmp_id, ocu_len;
- int i, len;
+ uint32_t c;
+ uint8_t cmp_id;
+ int idx, len;
+ int u_ch;
+ int needsCRC = 0;
+ int ext_i_len, ext_max_len;
+ int str_o_len = 0; /* Length of resulting output */
+ int ext_o_len = 0; /* Extension output length */
+ int ext_crc_len = 0; /* Extension output length if used with CRC */
+ int i_ext = -1; /* Extension position in input buffer */
+ int o_crc = 0; /* Rightmost possible output pos for CRC+ext */
+ unsigned short valueCRC;
+ uint8_t ext[EXT_SIZE * NLS_MAX_CHARSET_SIZE + 1];
+ uint8_t crc[CRC_LEN];
+ if (str_max_len <= 0)
+ return 0;
- ocu_len = ocu_i->u_len;
if (ocu_len == 0) {
- memset(utf_o, 0, sizeof(struct ustr));
+ memset(str_o, 0, str_max_len);
return 0;
}
- cmp_id = ocu_i->u_cmpID;
+ cmp_id = ocu[0];
if (cmp_id != 8 && cmp_id != 16) {
- memset(utf_o, 0, sizeof(struct ustr));
- pr_err("unknown compression code (%d) stri=%s\n",
- cmp_id, ocu_i->u_name);
+ memset(str_o, 0, str_max_len);
+ pr_err("unknown compression code (%d)\n", cmp_id);
return -EINVAL;
}
+ u_ch = cmp_id >> 3;
- ocu = ocu_i->u_name;
- utf_o->u_len = 0;
- for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN - 3));) {
- /* Expand OSTA compressed Unicode to Unicode */
- uint32_t c = ocu[i++];
- if (cmp_id == 16)
- c = (c << 8) | ocu[i++];
+ ocu++;
+ ocu_len--;
- len = nls->uni2char(c, &utf_o->u_name[utf_o->u_len],
- UDF_NAME_LEN - 2 - utf_o->u_len);
- /* Valid character? */
- if (len >= 0)
- utf_o->u_len += len;
- else
- utf_o->u_name[utf_o->u_len++] = '?';
+ if (ocu_len % u_ch) {
+ pr_err("incorrect filename length (%d)\n", ocu_len + 1);
+ return -EINVAL;
+ }
+
+ if (translate) {
+ /* Look for extension */
+ for (idx = ocu_len - u_ch, ext_i_len = 0;
+ (idx >= 0) && (ext_i_len < EXT_SIZE);
+ idx -= u_ch, ext_i_len++) {
+ c = ocu[idx];
+ if (u_ch > 1)
+ c = (c << 8) | ocu[idx + 1];
+
+ if (c == EXT_MARK) {
+ if (ext_i_len)
+ i_ext = idx;
+ break;
+ }
+ }
+ if (i_ext >= 0) {
+ /* Convert extension */
+ ext_max_len = min_t(int, sizeof(ext), str_max_len);
+ ext[ext_o_len++] = EXT_MARK;
+ idx = i_ext + u_ch;
+ while (udf_name_conv_char(ext, ext_max_len, &ext_o_len,
+ ocu, ocu_len, &idx,
+ u_ch, &needsCRC,
+ conv_f, translate)) {
+ if ((ext_o_len + CRC_LEN) < str_max_len)
+ ext_crc_len = ext_o_len;
+ }
+ }
}
- utf_o->u_cmpID = 8;
- return utf_o->u_len;
+ idx = 0;
+ while (1) {
+ if (translate && (idx == i_ext)) {
+ if (str_o_len > (str_max_len - ext_o_len))
+ needsCRC = 1;
+ break;
+ }
+
+ if (!udf_name_conv_char(str_o, str_max_len, &str_o_len,
+ ocu, ocu_len, &idx,
+ u_ch, &needsCRC, conv_f, translate))
+ break;
+
+ if (translate &&
+ (str_o_len <= (str_max_len - ext_o_len - CRC_LEN)))
+ o_crc = str_o_len;
+ }
+
+ if (translate) {
+ if (str_o_len <= 2 && str_o[0] == '.' &&
+ (str_o_len == 1 || str_o[1] == '.'))
+ needsCRC = 1;
+ if (needsCRC) {
+ str_o_len = o_crc;
+ valueCRC = crc_itu_t(0, ocu, ocu_len);
+ crc[0] = CRC_MARK;
+ crc[1] = hex_asc_upper_hi(valueCRC >> 8);
+ crc[2] = hex_asc_upper_lo(valueCRC >> 8);
+ crc[3] = hex_asc_upper_hi(valueCRC);
+ crc[4] = hex_asc_upper_lo(valueCRC);
+ len = min_t(int, CRC_LEN, str_max_len - str_o_len);
+ memcpy(&str_o[str_o_len], crc, len);
+ str_o_len += len;
+ ext_o_len = ext_crc_len;
+ }
+ if (ext_o_len > 0) {
+ memcpy(&str_o[str_o_len], ext, ext_o_len);
+ str_o_len += ext_o_len;
+ }
+ }
+
+ return str_o_len;
}
-static int udf_NLStoCS0(struct nls_table *nls, dstring *ocu, struct ustr *uni,
- int length)
+static int udf_name_to_CS0(uint8_t *ocu, int ocu_max_len,
+ const uint8_t *str_i, int str_len,
+ int (*conv_f)(const unsigned char *, int, wchar_t *))
{
- int len;
- unsigned i, max_val;
- uint16_t uni_char;
+ int i, len;
+ unsigned int max_val;
+ wchar_t uni_char;
int u_len, u_ch;
- memset(ocu, 0, sizeof(dstring) * length);
+ if (ocu_max_len <= 0)
+ return 0;
+
+ memset(ocu, 0, ocu_max_len);
ocu[0] = 8;
- max_val = 0xffU;
+ max_val = 0xff;
u_ch = 1;
try_again:
- u_len = 0U;
- for (i = 0U; i < uni->u_len; i++) {
+ u_len = 1;
+ for (i = 0; i < str_len; i++) {
/* Name didn't fit? */
- if (u_len + 1 + u_ch >= length)
+ if (u_len + u_ch > ocu_max_len)
return 0;
- len = nls->char2uni(&uni->u_name[i], uni->u_len - i, &uni_char);
+ len = conv_f(&str_i[i], str_len - i, &uni_char);
if (!len)
continue;
/* Invalid character, deal with it */
@@ -328,187 +320,65 @@ try_again:
}
if (uni_char > max_val) {
- max_val = 0xffffU;
- ocu[0] = (uint8_t)0x10U;
+ max_val = 0xffff;
+ ocu[0] = 0x10;
u_ch = 2;
goto try_again;
}
- if (max_val == 0xffffU)
- ocu[++u_len] = (uint8_t)(uni_char >> 8);
- ocu[++u_len] = (uint8_t)(uni_char & 0xffU);
+ if (max_val == 0xffff)
+ ocu[u_len++] = (uint8_t)(uni_char >> 8);
+ ocu[u_len++] = (uint8_t)(uni_char & 0xff);
i += len - 1;
}
- ocu[length - 1] = (uint8_t)u_len + 1;
- return u_len + 1;
+ return u_len;
}
-int udf_get_filename(struct super_block *sb, uint8_t *sname, int slen,
+int udf_CS0toUTF8(uint8_t *utf_o, int o_len, const uint8_t *ocu_i, int i_len)
+{
+ return udf_name_from_CS0(utf_o, o_len, ocu_i, i_len,
+ udf_uni2char_utf8, 0);
+}
+
+int udf_get_filename(struct super_block *sb, const uint8_t *sname, int slen,
uint8_t *dname, int dlen)
{
- struct ustr *filename, *unifilename;
+ int (*conv_f)(wchar_t, unsigned char *, int);
int ret;
if (!slen)
return -EIO;
- filename = kmalloc(sizeof(struct ustr), GFP_NOFS);
- if (!filename)
- return -ENOMEM;
-
- unifilename = kmalloc(sizeof(struct ustr), GFP_NOFS);
- if (!unifilename) {
- ret = -ENOMEM;
- goto out1;
- }
+ if (dlen <= 0)
+ return 0;
- udf_build_ustr_exact(unifilename, sname, slen);
if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
- ret = udf_CS0toUTF8(filename, unifilename);
- if (ret < 0) {
- udf_debug("Failed in udf_get_filename: sname = %s\n",
- sname);
- goto out2;
- }
+ conv_f = udf_uni2char_utf8;
} else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) {
- ret = udf_CS0toNLS(UDF_SB(sb)->s_nls_map, filename,
- unifilename);
- if (ret < 0) {
- udf_debug("Failed in udf_get_filename: sname = %s\n",
- sname);
- goto out2;
- }
+ conv_f = UDF_SB(sb)->s_nls_map->uni2char;
} else
BUG();
- ret = udf_translate_to_linux(dname, dlen,
- filename->u_name, filename->u_len,
- unifilename->u_name, unifilename->u_len);
+ ret = udf_name_from_CS0(dname, dlen, sname, slen, conv_f, 1);
/* Zero length filename isn't valid... */
if (ret == 0)
ret = -EINVAL;
-out2:
- kfree(unifilename);
-out1:
- kfree(filename);
return ret;
}
-int udf_put_filename(struct super_block *sb, const uint8_t *sname,
- uint8_t *dname, int flen)
+int udf_put_filename(struct super_block *sb, const uint8_t *sname, int slen,
+ uint8_t *dname, int dlen)
{
- struct ustr unifilename;
- int namelen;
-
- if (!udf_char_to_ustr(&unifilename, sname, flen))
- return 0;
+ int (*conv_f)(const unsigned char *, int, wchar_t *);
if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
- namelen = udf_UTF8toCS0(dname, &unifilename, UDF_NAME_LEN);
- if (!namelen)
- return 0;
+ conv_f = udf_char2uni_utf8;
} else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) {
- namelen = udf_NLStoCS0(UDF_SB(sb)->s_nls_map, dname,
- &unifilename, UDF_NAME_LEN);
- if (!namelen)
- return 0;
+ conv_f = UDF_SB(sb)->s_nls_map->char2uni;
} else
- return 0;
+ BUG();
- return namelen;
+ return udf_name_to_CS0(dname, dlen, sname, slen, conv_f);
}
-#define ILLEGAL_CHAR_MARK '_'
-#define EXT_MARK '.'
-#define CRC_MARK '#'
-#define EXT_SIZE 5
-/* Number of chars we need to store generated CRC to make filename unique */
-#define CRC_LEN 5
-
-static int udf_translate_to_linux(uint8_t *newName, int newLen,
- uint8_t *udfName, int udfLen,
- uint8_t *fidName, int fidNameLen)
-{
- int index, newIndex = 0, needsCRC = 0;
- int extIndex = 0, newExtIndex = 0, hasExt = 0;
- unsigned short valueCRC;
- uint8_t curr;
-
- if (udfName[0] == '.' &&
- (udfLen == 1 || (udfLen == 2 && udfName[1] == '.'))) {
- needsCRC = 1;
- newIndex = udfLen;
- memcpy(newName, udfName, udfLen);
- } else {
- for (index = 0; index < udfLen; index++) {
- curr = udfName[index];
- if (curr == '/' || curr == 0) {
- needsCRC = 1;
- curr = ILLEGAL_CHAR_MARK;
- while (index + 1 < udfLen &&
- (udfName[index + 1] == '/' ||
- udfName[index + 1] == 0))
- index++;
- }
- if (curr == EXT_MARK &&
- (udfLen - index - 1) <= EXT_SIZE) {
- if (udfLen == index + 1)
- hasExt = 0;
- else {
- hasExt = 1;
- extIndex = index;
- newExtIndex = newIndex;
- }
- }
- if (newIndex < newLen)
- newName[newIndex++] = curr;
- else
- needsCRC = 1;
- }
- }
- if (needsCRC) {
- uint8_t ext[EXT_SIZE];
- int localExtIndex = 0;
-
- if (hasExt) {
- int maxFilenameLen;
- for (index = 0;
- index < EXT_SIZE && extIndex + index + 1 < udfLen;
- index++) {
- curr = udfName[extIndex + index + 1];
-
- if (curr == '/' || curr == 0) {
- needsCRC = 1;
- curr = ILLEGAL_CHAR_MARK;
- while (extIndex + index + 2 < udfLen &&
- (index + 1 < EXT_SIZE &&
- (udfName[extIndex + index + 2] == '/' ||
- udfName[extIndex + index + 2] == 0)))
- index++;
- }
- ext[localExtIndex++] = curr;
- }
- maxFilenameLen = newLen - CRC_LEN - localExtIndex;
- if (newIndex > maxFilenameLen)
- newIndex = maxFilenameLen;
- else
- newIndex = newExtIndex;
- } else if (newIndex > newLen - CRC_LEN)
- newIndex = newLen - CRC_LEN;
- newName[newIndex++] = CRC_MARK;
- valueCRC = crc_itu_t(0, fidName, fidNameLen);
- newName[newIndex++] = hex_asc_upper_hi(valueCRC >> 8);
- newName[newIndex++] = hex_asc_upper_lo(valueCRC >> 8);
- newName[newIndex++] = hex_asc_upper_hi(valueCRC);
- newName[newIndex++] = hex_asc_upper_lo(valueCRC);
-
- if (hasExt) {
- newName[newIndex++] = EXT_MARK;
- for (index = 0; index < localExtIndex; index++)
- newName[newIndex++] = ext[index];
- }
- }
-
- return newIndex;
-}
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 50311703135b..66cdb44616d5 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -287,6 +287,12 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address,
goto out;
/*
+ * We don't do userfault handling for the final child pid update.
+ */
+ if (current->flags & PF_EXITING)
+ goto out;
+
+ /*
* Check that we can return VM_FAULT_RETRY.
*
* NOTE: it should become possible to return VM_FAULT_RETRY
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index f64639176670..3542d94fddce 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -121,4 +121,5 @@ xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o
xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o
xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o
xfs-$(CONFIG_COMPAT) += xfs_ioctl32.o
-xfs-$(CONFIG_NFSD_PNFS) += xfs_pnfs.o
+xfs-$(CONFIG_NFSD_BLOCKLAYOUT) += xfs_pnfs.o
+xfs-$(CONFIG_NFSD_SCSILAYOUT) += xfs_pnfs.o
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index 444626ddbd1b..d9b42425291e 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -118,8 +118,6 @@ xfs_allocbt_free_block(
xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1,
XFS_EXTENT_BUSY_SKIP_DISCARD);
xfs_trans_agbtree_delta(cur->bc_tp, -1);
-
- xfs_trans_binval(cur->bc_tp, bp);
return 0;
}
diff --git a/fs/xfs/libxfs/xfs_attr_sf.h b/fs/xfs/libxfs/xfs_attr_sf.h
index 919756e3ba53..90928bbe693c 100644
--- a/fs/xfs/libxfs/xfs_attr_sf.h
+++ b/fs/xfs/libxfs/xfs_attr_sf.h
@@ -24,22 +24,6 @@
* Small attribute lists are packed as tightly as possible so as
* to fit into the literal area of the inode.
*/
-
-/*
- * Entries are packed toward the top as tight as possible.
- */
-typedef struct xfs_attr_shortform {
- struct xfs_attr_sf_hdr { /* constant-structure header block */
- __be16 totsize; /* total bytes in shortform list */
- __u8 count; /* count of active entries */
- } hdr;
- struct xfs_attr_sf_entry {
- __uint8_t namelen; /* actual length of name (no NULL) */
- __uint8_t valuelen; /* actual length of value (no NULL) */
- __uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */
- __uint8_t nameval[1]; /* name & value bytes concatenated */
- } list[1]; /* variable sized array */
-} xfs_attr_shortform_t;
typedef struct xfs_attr_sf_hdr xfs_attr_sf_hdr_t;
typedef struct xfs_attr_sf_entry xfs_attr_sf_entry_t;
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index ef00156f4f96..041b6948aecc 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -477,10 +477,7 @@ xfs_bmap_check_leaf_extents(
}
block = XFS_BUF_TO_BLOCK(bp);
}
- if (bp_release) {
- bp_release = 0;
- xfs_trans_brelse(NULL, bp);
- }
+
return;
error0:
@@ -912,7 +909,7 @@ xfs_bmap_local_to_extents(
* We don't want to deal with the case of keeping inode data inline yet.
* So sending the data fork of a regular inode is invalid.
*/
- ASSERT(!(S_ISREG(ip->i_d.di_mode) && whichfork == XFS_DATA_FORK));
+ ASSERT(!(S_ISREG(VFS_I(ip)->i_mode) && whichfork == XFS_DATA_FORK));
ifp = XFS_IFORK_PTR(ip, whichfork);
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
@@ -1079,7 +1076,7 @@ xfs_bmap_add_attrfork_local(
if (ip->i_df.if_bytes <= XFS_IFORK_DSIZE(ip))
return 0;
- if (S_ISDIR(ip->i_d.di_mode)) {
+ if (S_ISDIR(VFS_I(ip)->i_mode)) {
memset(&dargs, 0, sizeof(dargs));
dargs.geo = ip->i_mount->m_dir_geo;
dargs.dp = ip;
@@ -1091,7 +1088,7 @@ xfs_bmap_add_attrfork_local(
return xfs_dir2_sf_to_block(&dargs);
}
- if (S_ISLNK(ip->i_d.di_mode))
+ if (S_ISLNK(VFS_I(ip)->i_mode))
return xfs_bmap_local_to_extents(tp, ip, firstblock, 1,
flags, XFS_DATA_FORK,
xfs_symlink_local_to_remote);
@@ -4721,6 +4718,66 @@ error0:
}
/*
+ * When a delalloc extent is split (e.g., due to a hole punch), the original
+ * indlen reservation must be shared across the two new extents that are left
+ * behind.
+ *
+ * Given the original reservation and the worst case indlen for the two new
+ * extents (as calculated by xfs_bmap_worst_indlen()), split the original
+ * reservation fairly across the two new extents. If necessary, steal available
+ * blocks from a deleted extent to make up a reservation deficiency (e.g., if
+ * ores == 1). The number of stolen blocks is returned. The availability and
+ * subsequent accounting of stolen blocks is the responsibility of the caller.
+ */
+static xfs_filblks_t
+xfs_bmap_split_indlen(
+ xfs_filblks_t ores, /* original res. */
+ xfs_filblks_t *indlen1, /* ext1 worst indlen */
+ xfs_filblks_t *indlen2, /* ext2 worst indlen */
+ xfs_filblks_t avail) /* stealable blocks */
+{
+ xfs_filblks_t len1 = *indlen1;
+ xfs_filblks_t len2 = *indlen2;
+ xfs_filblks_t nres = len1 + len2; /* new total res. */
+ xfs_filblks_t stolen = 0;
+
+ /*
+ * Steal as many blocks as we can to try and satisfy the worst case
+ * indlen for both new extents.
+ */
+ while (nres > ores && avail) {
+ nres--;
+ avail--;
+ stolen++;
+ }
+
+ /*
+ * The only blocks available are those reserved for the original
+ * extent and what we can steal from the extent being removed.
+ * If this still isn't enough to satisfy the combined
+ * requirements for the two new extents, skim blocks off of each
+ * of the new reservations until they match what is available.
+ */
+ while (nres > ores) {
+ if (len1) {
+ len1--;
+ nres--;
+ }
+ if (nres == ores)
+ break;
+ if (len2) {
+ len2--;
+ nres--;
+ }
+ }
+
+ *indlen1 = len1;
+ *indlen2 = len2;
+
+ return stolen;
+}
+
+/*
* Called by xfs_bmapi to update file extent records and the btree
* after removing space (or undoing a delayed allocation).
*/
@@ -4984,28 +5041,29 @@ xfs_bmap_del_extent(
XFS_IFORK_NEXT_SET(ip, whichfork,
XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
} else {
+ xfs_filblks_t stolen;
ASSERT(whichfork == XFS_DATA_FORK);
- temp = xfs_bmap_worst_indlen(ip, temp);
+
+ /*
+ * Distribute the original indlen reservation across the
+ * two new extents. Steal blocks from the deleted extent
+ * if necessary. Stealing blocks simply fudges the
+ * fdblocks accounting in xfs_bunmapi().
+ */
+ temp = xfs_bmap_worst_indlen(ip, got.br_blockcount);
+ temp2 = xfs_bmap_worst_indlen(ip, new.br_blockcount);
+ stolen = xfs_bmap_split_indlen(da_old, &temp, &temp2,
+ del->br_blockcount);
+ da_new = temp + temp2 - stolen;
+ del->br_blockcount -= stolen;
+
+ /*
+ * Set the reservation for each extent. Warn if either
+ * is zero as this can lead to delalloc problems.
+ */
+ WARN_ON_ONCE(!temp || !temp2);
xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
- temp2 = xfs_bmap_worst_indlen(ip, temp2);
new.br_startblock = nullstartblock((int)temp2);
- da_new = temp + temp2;
- while (da_new > da_old) {
- if (temp) {
- temp--;
- da_new--;
- xfs_bmbt_set_startblock(ep,
- nullstartblock((int)temp));
- }
- if (da_new == da_old)
- break;
- if (temp2) {
- temp2--;
- da_new--;
- new.br_startblock =
- nullstartblock((int)temp2);
- }
- }
}
trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
xfs_iext_insert(ip, *idx + 1, 1, &new, state);
@@ -5210,7 +5268,7 @@ xfs_bunmapi(
* This is better than zeroing it.
*/
ASSERT(del.br_state == XFS_EXT_NORM);
- ASSERT(xfs_trans_get_block_res(tp) > 0);
+ ASSERT(tp->t_blk_res > 0);
/*
* If this spans a realtime extent boundary,
* chop it back to the start of the one we end at.
@@ -5241,7 +5299,7 @@ xfs_bunmapi(
del.br_startblock += mod;
} else if ((del.br_startoff == start &&
(del.br_state == XFS_EXT_UNWRITTEN ||
- xfs_trans_get_block_res(tp) == 0)) ||
+ tp->t_blk_res == 0)) ||
!xfs_sb_version_hasextflgbit(&mp->m_sb)) {
/*
* Can't make it unwritten. There isn't
@@ -5296,9 +5354,37 @@ xfs_bunmapi(
goto nodelete;
}
}
+
+ /*
+ * If it's the case where the directory code is running
+ * with no block reservation, and the deleted block is in
+ * the middle of its extent, and the resulting insert
+ * of an extent would cause transformation to btree format,
+ * then reject it. The calling code will then swap
+ * blocks around instead.
+ * We have to do this now, rather than waiting for the
+ * conversion to btree format, since the transaction
+ * will be dirty.
+ */
+ if (!wasdel && tp->t_blk_res == 0 &&
+ XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
+ XFS_IFORK_NEXTENTS(ip, whichfork) >= /* Note the >= */
+ XFS_IFORK_MAXEXT(ip, whichfork) &&
+ del.br_startoff > got.br_startoff &&
+ del.br_startoff + del.br_blockcount <
+ got.br_startoff + got.br_blockcount) {
+ error = -ENOSPC;
+ goto error0;
+ }
+
+ /*
+ * Unreserve quota and update realtime free space, if
+ * appropriate. If delayed allocation, update the inode delalloc
+ * counter now and wait to update the sb counters as
+ * xfs_bmap_del_extent() might need to borrow some blocks.
+ */
if (wasdel) {
ASSERT(startblockval(del.br_startblock) > 0);
- /* Update realtime/data freespace, unreserve quota */
if (isrt) {
xfs_filblks_t rtexts;
@@ -5309,8 +5395,6 @@ xfs_bunmapi(
ip, -((long)del.br_blockcount), 0,
XFS_QMOPT_RES_RTBLKS);
} else {
- xfs_mod_fdblocks(mp, (int64_t)del.br_blockcount,
- false);
(void)xfs_trans_reserve_quota_nblks(NULL,
ip, -((long)del.br_blockcount), 0,
XFS_QMOPT_RES_REGBLKS);
@@ -5321,32 +5405,16 @@ xfs_bunmapi(
XFS_BTCUR_BPRV_WASDEL;
} else if (cur)
cur->bc_private.b.flags &= ~XFS_BTCUR_BPRV_WASDEL;
- /*
- * If it's the case where the directory code is running
- * with no block reservation, and the deleted block is in
- * the middle of its extent, and the resulting insert
- * of an extent would cause transformation to btree format,
- * then reject it. The calling code will then swap
- * blocks around instead.
- * We have to do this now, rather than waiting for the
- * conversion to btree format, since the transaction
- * will be dirty.
- */
- if (!wasdel && xfs_trans_get_block_res(tp) == 0 &&
- XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_NEXTENTS(ip, whichfork) >= /* Note the >= */
- XFS_IFORK_MAXEXT(ip, whichfork) &&
- del.br_startoff > got.br_startoff &&
- del.br_startoff + del.br_blockcount <
- got.br_startoff + got.br_blockcount) {
- error = -ENOSPC;
- goto error0;
- }
+
error = xfs_bmap_del_extent(ip, tp, &lastx, flist, cur, &del,
&tmp_logflags, whichfork);
logflags |= tmp_logflags;
if (error)
goto error0;
+
+ if (!isrt && wasdel)
+ xfs_mod_fdblocks(mp, (int64_t)del.br_blockcount, false);
+
bno = del.br_startoff - 1;
nodelete:
/*
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index 1637c37bfbaa..6282f6e708af 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -461,7 +461,7 @@ xfs_bmbt_alloc_block(
* reservation amount is insufficient then we may fail a
* block allocation here and corrupt the filesystem.
*/
- args.minleft = xfs_trans_get_block_res(args.tp);
+ args.minleft = args.tp->t_blk_res;
} else if (cur->bc_private.b.flist->xbf_low) {
args.type = XFS_ALLOCTYPE_START_BNO;
} else {
@@ -470,7 +470,7 @@ xfs_bmbt_alloc_block(
args.minlen = args.maxlen = args.prod = 1;
args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL;
- if (!args.wasdel && xfs_trans_get_block_res(args.tp) == 0) {
+ if (!args.wasdel && args.tp->t_blk_res == 0) {
error = -ENOSPC;
goto error0;
}
@@ -531,7 +531,6 @@ xfs_bmbt_free_block(
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
- xfs_trans_binval(tp, bp);
return 0;
}
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index a0eb18ce3ad3..1f88e1ce770f 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -294,6 +294,21 @@ xfs_btree_sblock_verify_crc(
return true;
}
+static int
+xfs_btree_free_block(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp)
+{
+ int error;
+
+ error = cur->bc_ops->free_block(cur, bp);
+ if (!error) {
+ xfs_trans_binval(cur->bc_tp, bp);
+ XFS_BTREE_STATS_INC(cur, free);
+ }
+ return error;
+}
+
/*
* Delete the btree cursor.
*/
@@ -3209,6 +3224,7 @@ xfs_btree_kill_iroot(
int level;
int index;
int numrecs;
+ int error;
#ifdef DEBUG
union xfs_btree_ptr ptr;
int i;
@@ -3272,8 +3288,6 @@ xfs_btree_kill_iroot(
cpp = xfs_btree_ptr_addr(cur, 1, cblock);
#ifdef DEBUG
for (i = 0; i < numrecs; i++) {
- int error;
-
error = xfs_btree_check_ptr(cur, cpp, i, level - 1);
if (error) {
XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
@@ -3283,8 +3297,11 @@ xfs_btree_kill_iroot(
#endif
xfs_btree_copy_ptrs(cur, pp, cpp, numrecs);
- cur->bc_ops->free_block(cur, cbp);
- XFS_BTREE_STATS_INC(cur, free);
+ error = xfs_btree_free_block(cur, cbp);
+ if (error) {
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ return error;
+ }
cur->bc_bufs[level - 1] = NULL;
be16_add_cpu(&block->bb_level, -1);
@@ -3317,14 +3334,12 @@ xfs_btree_kill_root(
*/
cur->bc_ops->set_root(cur, newroot, -1);
- error = cur->bc_ops->free_block(cur, bp);
+ error = xfs_btree_free_block(cur, bp);
if (error) {
XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
return error;
}
- XFS_BTREE_STATS_INC(cur, free);
-
cur->bc_bufs[level] = NULL;
cur->bc_ra[level] = 0;
cur->bc_nlevels--;
@@ -3830,10 +3845,9 @@ xfs_btree_delrec(
}
/* Free the deleted block. */
- error = cur->bc_ops->free_block(cur, rbp);
+ error = xfs_btree_free_block(cur, rbp);
if (error)
goto error0;
- XFS_BTREE_STATS_INC(cur, free);
/*
* If we joined with the left neighbor, set the buffer in the
diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
index b14bbd6bb05f..8d4d8bce41bf 100644
--- a/fs/xfs/libxfs/xfs_da_format.h
+++ b/fs/xfs/libxfs/xfs_da_format.h
@@ -641,6 +641,22 @@ xfs_dir2_block_leaf_p(struct xfs_dir2_block_tail *btp)
*/
#define XFS_ATTR_LEAF_MAPSIZE 3 /* how many freespace slots */
+/*
+ * Entries are packed toward the top as tight as possible.
+ */
+typedef struct xfs_attr_shortform {
+ struct xfs_attr_sf_hdr { /* constant-structure header block */
+ __be16 totsize; /* total bytes in shortform list */
+ __u8 count; /* count of active entries */
+ } hdr;
+ struct xfs_attr_sf_entry {
+ __uint8_t namelen; /* actual length of name (no NULL) */
+ __uint8_t valuelen; /* actual length of value (no NULL) */
+ __uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */
+ __uint8_t nameval[1]; /* name & value bytes concatenated */
+ } list[1]; /* variable sized array */
+} xfs_attr_shortform_t;
+
typedef struct xfs_attr_leaf_map { /* RLE map of free bytes */
__be16 base; /* base of free region */
__be16 size; /* length of free region */
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index 2fb53a5c0a74..af0f9d171f8a 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -176,7 +176,7 @@ xfs_dir_isempty(
{
xfs_dir2_sf_hdr_t *sfp;
- ASSERT(S_ISDIR(dp->i_d.di_mode));
+ ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
if (dp->i_d.di_size == 0) /* might happen during shutdown. */
return 1;
if (dp->i_d.di_size > XFS_IFORK_DSIZE(dp))
@@ -231,7 +231,7 @@ xfs_dir_init(
struct xfs_da_args *args;
int error;
- ASSERT(S_ISDIR(dp->i_d.di_mode));
+ ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino);
if (error)
return error;
@@ -266,7 +266,7 @@ xfs_dir_createname(
int rval;
int v; /* type-checking value */
- ASSERT(S_ISDIR(dp->i_d.di_mode));
+ ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
if (inum) {
rval = xfs_dir_ino_validate(tp->t_mountp, inum);
if (rval)
@@ -364,7 +364,7 @@ xfs_dir_lookup(
int v; /* type-checking value */
int lock_mode;
- ASSERT(S_ISDIR(dp->i_d.di_mode));
+ ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
XFS_STATS_INC(dp->i_mount, xs_dir_lookup);
/*
@@ -443,7 +443,7 @@ xfs_dir_removename(
int rval;
int v; /* type-checking value */
- ASSERT(S_ISDIR(dp->i_d.di_mode));
+ ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
XFS_STATS_INC(dp->i_mount, xs_dir_remove);
args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
@@ -505,7 +505,7 @@ xfs_dir_replace(
int rval;
int v; /* type-checking value */
- ASSERT(S_ISDIR(dp->i_d.di_mode));
+ ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
rval = xfs_dir_ino_validate(tp->t_mountp, inum);
if (rval)
diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c
index 63ee03db796c..75a557432d0f 100644
--- a/fs/xfs/libxfs/xfs_dir2_node.c
+++ b/fs/xfs/libxfs/xfs_dir2_node.c
@@ -2235,6 +2235,9 @@ xfs_dir2_node_trim_free(
dp = args->dp;
tp = args->trans;
+
+ *rvalp = 0;
+
/*
* Read the freespace block.
*/
@@ -2255,7 +2258,6 @@ xfs_dir2_node_trim_free(
*/
if (freehdr.nused > 0) {
xfs_trans_brelse(tp, bp);
- *rvalp = 0;
return 0;
}
/*
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 66d702e6b9ff..22297f9b0fd5 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -2403,8 +2403,8 @@ xfs_ialloc_compute_maxlevels(
maxleafents = (1LL << XFS_INO_AGINO_BITS(mp)) >>
XFS_INODES_PER_CHUNK_LOG;
- minleafrecs = mp->m_alloc_mnr[0];
- minnoderecs = mp->m_alloc_mnr[1];
+ minleafrecs = mp->m_inobt_mnr[0];
+ minnoderecs = mp->m_inobt_mnr[1];
maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
for (level = 1; maxblocks > 1; level++)
maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index c679f3c05b63..89c21d771e35 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -125,16 +125,8 @@ xfs_inobt_free_block(
struct xfs_btree_cur *cur,
struct xfs_buf *bp)
{
- xfs_fsblock_t fsbno;
- int error;
-
- fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp));
- error = xfs_free_extent(cur->bc_tp, fsbno, 1);
- if (error)
- return error;
-
- xfs_trans_binval(cur->bc_tp, bp);
- return error;
+ return xfs_free_extent(cur->bc_tp,
+ XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp)), 1);
}
STATIC int
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 1aabfda669b0..9d9559eb2835 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -195,28 +195,50 @@ xfs_imap_to_bp(
}
void
-xfs_dinode_from_disk(
- xfs_icdinode_t *to,
- xfs_dinode_t *from)
+xfs_inode_from_disk(
+ struct xfs_inode *ip,
+ struct xfs_dinode *from)
{
- to->di_magic = be16_to_cpu(from->di_magic);
- to->di_mode = be16_to_cpu(from->di_mode);
- to->di_version = from ->di_version;
+ struct xfs_icdinode *to = &ip->i_d;
+ struct inode *inode = VFS_I(ip);
+
+
+ /*
+ * Convert v1 inodes immediately to v2 inode format as this is the
+ * minimum inode version format we support in the rest of the code.
+ */
+ to->di_version = from->di_version;
+ if (to->di_version == 1) {
+ set_nlink(inode, be16_to_cpu(from->di_onlink));
+ to->di_projid_lo = 0;
+ to->di_projid_hi = 0;
+ to->di_version = 2;
+ } else {
+ set_nlink(inode, be32_to_cpu(from->di_nlink));
+ to->di_projid_lo = be16_to_cpu(from->di_projid_lo);
+ to->di_projid_hi = be16_to_cpu(from->di_projid_hi);
+ }
+
to->di_format = from->di_format;
- to->di_onlink = be16_to_cpu(from->di_onlink);
to->di_uid = be32_to_cpu(from->di_uid);
to->di_gid = be32_to_cpu(from->di_gid);
- to->di_nlink = be32_to_cpu(from->di_nlink);
- to->di_projid_lo = be16_to_cpu(from->di_projid_lo);
- to->di_projid_hi = be16_to_cpu(from->di_projid_hi);
- memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
to->di_flushiter = be16_to_cpu(from->di_flushiter);
- to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec);
- to->di_atime.t_nsec = be32_to_cpu(from->di_atime.t_nsec);
- to->di_mtime.t_sec = be32_to_cpu(from->di_mtime.t_sec);
- to->di_mtime.t_nsec = be32_to_cpu(from->di_mtime.t_nsec);
- to->di_ctime.t_sec = be32_to_cpu(from->di_ctime.t_sec);
- to->di_ctime.t_nsec = be32_to_cpu(from->di_ctime.t_nsec);
+
+ /*
+ * Time is signed, so need to convert to signed 32 bit before
+ * storing in inode timestamp which may be 64 bit. Otherwise
+ * a time before epoch is converted to a time long after epoch
+ * on 64 bit systems.
+ */
+ inode->i_atime.tv_sec = (int)be32_to_cpu(from->di_atime.t_sec);
+ inode->i_atime.tv_nsec = (int)be32_to_cpu(from->di_atime.t_nsec);
+ inode->i_mtime.tv_sec = (int)be32_to_cpu(from->di_mtime.t_sec);
+ inode->i_mtime.tv_nsec = (int)be32_to_cpu(from->di_mtime.t_nsec);
+ inode->i_ctime.tv_sec = (int)be32_to_cpu(from->di_ctime.t_sec);
+ inode->i_ctime.tv_nsec = (int)be32_to_cpu(from->di_ctime.t_nsec);
+ inode->i_generation = be32_to_cpu(from->di_gen);
+ inode->i_mode = be16_to_cpu(from->di_mode);
+
to->di_size = be64_to_cpu(from->di_size);
to->di_nblocks = be64_to_cpu(from->di_nblocks);
to->di_extsize = be32_to_cpu(from->di_extsize);
@@ -227,42 +249,96 @@ xfs_dinode_from_disk(
to->di_dmevmask = be32_to_cpu(from->di_dmevmask);
to->di_dmstate = be16_to_cpu(from->di_dmstate);
to->di_flags = be16_to_cpu(from->di_flags);
- to->di_gen = be32_to_cpu(from->di_gen);
if (to->di_version == 3) {
- to->di_changecount = be64_to_cpu(from->di_changecount);
+ inode->i_version = be64_to_cpu(from->di_changecount);
to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec);
to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec);
to->di_flags2 = be64_to_cpu(from->di_flags2);
- to->di_ino = be64_to_cpu(from->di_ino);
- to->di_lsn = be64_to_cpu(from->di_lsn);
- memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
- uuid_copy(&to->di_uuid, &from->di_uuid);
}
}
void
-xfs_dinode_to_disk(
- xfs_dinode_t *to,
- xfs_icdinode_t *from)
+xfs_inode_to_disk(
+ struct xfs_inode *ip,
+ struct xfs_dinode *to,
+ xfs_lsn_t lsn)
+{
+ struct xfs_icdinode *from = &ip->i_d;
+ struct inode *inode = VFS_I(ip);
+
+ to->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
+ to->di_onlink = 0;
+
+ to->di_version = from->di_version;
+ to->di_format = from->di_format;
+ to->di_uid = cpu_to_be32(from->di_uid);
+ to->di_gid = cpu_to_be32(from->di_gid);
+ to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
+ to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
+
+ memset(to->di_pad, 0, sizeof(to->di_pad));
+ to->di_atime.t_sec = cpu_to_be32(inode->i_atime.tv_sec);
+ to->di_atime.t_nsec = cpu_to_be32(inode->i_atime.tv_nsec);
+ to->di_mtime.t_sec = cpu_to_be32(inode->i_mtime.tv_sec);
+ to->di_mtime.t_nsec = cpu_to_be32(inode->i_mtime.tv_nsec);
+ to->di_ctime.t_sec = cpu_to_be32(inode->i_ctime.tv_sec);
+ to->di_ctime.t_nsec = cpu_to_be32(inode->i_ctime.tv_nsec);
+ to->di_nlink = cpu_to_be32(inode->i_nlink);
+ to->di_gen = cpu_to_be32(inode->i_generation);
+ to->di_mode = cpu_to_be16(inode->i_mode);
+
+ to->di_size = cpu_to_be64(from->di_size);
+ to->di_nblocks = cpu_to_be64(from->di_nblocks);
+ to->di_extsize = cpu_to_be32(from->di_extsize);
+ to->di_nextents = cpu_to_be32(from->di_nextents);
+ to->di_anextents = cpu_to_be16(from->di_anextents);
+ to->di_forkoff = from->di_forkoff;
+ to->di_aformat = from->di_aformat;
+ to->di_dmevmask = cpu_to_be32(from->di_dmevmask);
+ to->di_dmstate = cpu_to_be16(from->di_dmstate);
+ to->di_flags = cpu_to_be16(from->di_flags);
+
+ if (from->di_version == 3) {
+ to->di_changecount = cpu_to_be64(inode->i_version);
+ to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec);
+ to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec);
+ to->di_flags2 = cpu_to_be64(from->di_flags2);
+
+ to->di_ino = cpu_to_be64(ip->i_ino);
+ to->di_lsn = cpu_to_be64(lsn);
+ memset(to->di_pad2, 0, sizeof(to->di_pad2));
+ uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid);
+ to->di_flushiter = 0;
+ } else {
+ to->di_flushiter = cpu_to_be16(from->di_flushiter);
+ }
+}
+
+void
+xfs_log_dinode_to_disk(
+ struct xfs_log_dinode *from,
+ struct xfs_dinode *to)
{
to->di_magic = cpu_to_be16(from->di_magic);
to->di_mode = cpu_to_be16(from->di_mode);
- to->di_version = from ->di_version;
+ to->di_version = from->di_version;
to->di_format = from->di_format;
- to->di_onlink = cpu_to_be16(from->di_onlink);
+ to->di_onlink = 0;
to->di_uid = cpu_to_be32(from->di_uid);
to->di_gid = cpu_to_be32(from->di_gid);
to->di_nlink = cpu_to_be32(from->di_nlink);
to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
+
to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec);
to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec);
to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec);
to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec);
to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec);
to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec);
+
to->di_size = cpu_to_be64(from->di_size);
to->di_nblocks = cpu_to_be64(from->di_nblocks);
to->di_extsize = cpu_to_be32(from->di_extsize);
@@ -367,13 +443,10 @@ xfs_iread(
!(mp->m_flags & XFS_MOUNT_IKEEP)) {
/* initialise the on-disk inode core */
memset(&ip->i_d, 0, sizeof(ip->i_d));
- ip->i_d.di_magic = XFS_DINODE_MAGIC;
- ip->i_d.di_gen = prandom_u32();
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ VFS_I(ip)->i_generation = prandom_u32();
+ if (xfs_sb_version_hascrc(&mp->m_sb))
ip->i_d.di_version = 3;
- ip->i_d.di_ino = ip->i_ino;
- uuid_copy(&ip->i_d.di_uuid, &mp->m_sb.sb_meta_uuid);
- } else
+ else
ip->i_d.di_version = 2;
return 0;
}
@@ -403,7 +476,7 @@ xfs_iread(
* Otherwise, just get the truly permanent information.
*/
if (dip->di_mode) {
- xfs_dinode_from_disk(&ip->i_d, dip);
+ xfs_inode_from_disk(ip, dip);
error = xfs_iformat_fork(ip, dip);
if (error) {
#ifdef DEBUG
@@ -417,16 +490,10 @@ xfs_iread(
* Partial initialisation of the in-core inode. Just the bits
* that xfs_ialloc won't overwrite or relies on being correct.
*/
- ip->i_d.di_magic = be16_to_cpu(dip->di_magic);
ip->i_d.di_version = dip->di_version;
- ip->i_d.di_gen = be32_to_cpu(dip->di_gen);
+ VFS_I(ip)->i_generation = be32_to_cpu(dip->di_gen);
ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter);
- if (dip->di_version == 3) {
- ip->i_d.di_ino = be64_to_cpu(dip->di_ino);
- uuid_copy(&ip->i_d.di_uuid, &dip->di_uuid);
- }
-
/*
* Make sure to pull in the mode here as well in
* case the inode is released without being used.
@@ -434,25 +501,10 @@ xfs_iread(
* the inode is already free and not try to mess
* with the uninitialized part of it.
*/
- ip->i_d.di_mode = 0;
- }
-
- /*
- * Automatically convert version 1 inode formats in memory to version 2
- * inode format. If the inode is modified, it will get logged and
- * rewritten as a version 2 inode. We can do this because we set the
- * superblock feature bit for v2 inodes unconditionally during mount
- * and it means the reast of the code can assume the inode version is 2
- * or higher.
- */
- if (ip->i_d.di_version == 1) {
- ip->i_d.di_version = 2;
- memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
- ip->i_d.di_nlink = ip->i_d.di_onlink;
- ip->i_d.di_onlink = 0;
- xfs_set_projid(ip, 0);
+ VFS_I(ip)->i_mode = 0;
}
+ ASSERT(ip->i_d.di_version >= 2);
ip->i_delayed_blks = 0;
/*
diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h
index 9308c47f2a52..7c4dd321b215 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.h
+++ b/fs/xfs/libxfs/xfs_inode_buf.h
@@ -20,7 +20,36 @@
struct xfs_inode;
struct xfs_dinode;
-struct xfs_icdinode;
+
+/*
+ * In memory representation of the XFS inode. This is held in the in-core struct
+ * xfs_inode and represents the current on disk values but the structure is not
+ * in on-disk format. That is, this structure is always translated to on-disk
+ * format specific structures at the appropriate time.
+ */
+struct xfs_icdinode {
+ __int8_t di_version; /* inode version */
+ __int8_t di_format; /* format of di_c data */
+ __uint16_t di_flushiter; /* incremented on flush */
+ __uint32_t di_uid; /* owner's user id */
+ __uint32_t di_gid; /* owner's group id */
+ __uint16_t di_projid_lo; /* lower part of owner's project id */
+ __uint16_t di_projid_hi; /* higher part of owner's project id */
+ xfs_fsize_t di_size; /* number of bytes in file */
+ xfs_rfsblock_t di_nblocks; /* # of direct & btree blocks used */
+ xfs_extlen_t di_extsize; /* basic/minimum extent size for file */
+ xfs_extnum_t di_nextents; /* number of extents in data fork */
+ xfs_aextnum_t di_anextents; /* number of extents in attribute fork*/
+ __uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */
+ __int8_t di_aformat; /* format of attr fork's data */
+ __uint32_t di_dmevmask; /* DMIG event mask */
+ __uint16_t di_dmstate; /* DMIG state info */
+ __uint16_t di_flags; /* random flags, XFS_DIFLAG_... */
+
+ __uint64_t di_flags2; /* more random flags */
+
+ xfs_ictimestamp_t di_crtime; /* time created */
+};
/*
* Inode location information. Stored in the inode and passed to
@@ -38,8 +67,11 @@ int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *,
int xfs_iread(struct xfs_mount *, struct xfs_trans *,
struct xfs_inode *, uint);
void xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *);
-void xfs_dinode_to_disk(struct xfs_dinode *to, struct xfs_icdinode *from);
-void xfs_dinode_from_disk(struct xfs_icdinode *to, struct xfs_dinode *from);
+void xfs_inode_to_disk(struct xfs_inode *ip, struct xfs_dinode *to,
+ xfs_lsn_t lsn);
+void xfs_inode_from_disk(struct xfs_inode *ip, struct xfs_dinode *from);
+void xfs_log_dinode_to_disk(struct xfs_log_dinode *from,
+ struct xfs_dinode *to);
#if defined(DEBUG)
void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *);
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 0defbd02f62d..11faf7df14c8 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -31,6 +31,7 @@
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_attr_sf.h"
+#include "xfs_da_format.h"
kmem_zone_t *xfs_ifork_zone;
@@ -120,7 +121,7 @@ xfs_iformat_fork(
return -EFSCORRUPTED;
}
- switch (ip->i_d.di_mode & S_IFMT) {
+ switch (VFS_I(ip)->i_mode & S_IFMT) {
case S_IFIFO:
case S_IFCHR:
case S_IFBLK:
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index 265314690415..d54a8018b079 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -290,6 +290,7 @@ typedef struct xfs_inode_log_format_64 {
__int32_t ilf_boffset; /* off of inode in buffer */
} xfs_inode_log_format_64_t;
+
/*
* Flags for xfs_trans_log_inode flags field.
*/
@@ -360,15 +361,15 @@ typedef struct xfs_ictimestamp {
} xfs_ictimestamp_t;
/*
- * NOTE: This structure must be kept identical to struct xfs_dinode
- * except for the endianness annotations.
+ * Define the format of the inode core that is logged. This structure must be
+ * kept identical to struct xfs_dinode except for the endianness annotations.
*/
-typedef struct xfs_icdinode {
+struct xfs_log_dinode {
__uint16_t di_magic; /* inode magic # = XFS_DINODE_MAGIC */
__uint16_t di_mode; /* mode and type of file */
__int8_t di_version; /* inode version */
__int8_t di_format; /* format of di_c data */
- __uint16_t di_onlink; /* old number of links to file */
+ __uint8_t di_pad3[2]; /* unused in v2/3 inodes */
__uint32_t di_uid; /* owner's user id */
__uint32_t di_gid; /* owner's group id */
__uint32_t di_nlink; /* number of links to file */
@@ -407,13 +408,13 @@ typedef struct xfs_icdinode {
uuid_t di_uuid; /* UUID of the filesystem */
/* structure must be padded to 64 bit alignment */
-} xfs_icdinode_t;
+};
-static inline uint xfs_icdinode_size(int version)
+static inline uint xfs_log_dinode_size(int version)
{
if (version == 3)
- return sizeof(struct xfs_icdinode);
- return offsetof(struct xfs_icdinode, di_next_unlinked);
+ return sizeof(struct xfs_log_dinode);
+ return offsetof(struct xfs_log_dinode, di_next_unlinked);
}
/*
@@ -495,6 +496,8 @@ enum xfs_blft {
XFS_BLFT_ATTR_LEAF_BUF,
XFS_BLFT_ATTR_RMT_BUF,
XFS_BLFT_SB_BUF,
+ XFS_BLFT_RTBITMAP_BUF,
+ XFS_BLFT_RTSUMMARY_BUF,
XFS_BLFT_MAX_BUF = (1 << XFS_BLFT_BITS),
};
diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h
index f51078f1e92a..8eed51275bb3 100644
--- a/fs/xfs/libxfs/xfs_quota_defs.h
+++ b/fs/xfs/libxfs/xfs_quota_defs.h
@@ -37,7 +37,7 @@ typedef __uint16_t xfs_qwarncnt_t;
#define XFS_DQ_PROJ 0x0002 /* project quota */
#define XFS_DQ_GROUP 0x0004 /* a group quota */
#define XFS_DQ_DIRTY 0x0008 /* dquot is dirty */
-#define XFS_DQ_FREEING 0x0010 /* dquot is beeing torn down */
+#define XFS_DQ_FREEING 0x0010 /* dquot is being torn down */
#define XFS_DQ_ALLTYPES (XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP)
@@ -116,6 +116,7 @@ typedef __uint16_t xfs_qwarncnt_t;
#define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot if damaged */
#define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */
#define XFS_QMOPT_ENOSPC 0x0004000 /* enospc instead of edquot (prj) */
+#define XFS_QMOPT_DQNEXT 0x0008000 /* return next dquot >= this ID */
/*
* flags to xfs_trans_mod_dquot to indicate which field needs to be
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index 9b59ffa1fc19..951c044e24e4 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -42,6 +42,31 @@
*/
/*
+ * Real time buffers need verifiers to avoid runtime warnings during IO.
+ * We don't have anything to verify, however, so these are just dummy
+ * operations.
+ */
+static void
+xfs_rtbuf_verify_read(
+ struct xfs_buf *bp)
+{
+ return;
+}
+
+static void
+xfs_rtbuf_verify_write(
+ struct xfs_buf *bp)
+{
+ return;
+}
+
+const struct xfs_buf_ops xfs_rtbuf_ops = {
+ .name = "rtbuf",
+ .verify_read = xfs_rtbuf_verify_read,
+ .verify_write = xfs_rtbuf_verify_write,
+};
+
+/*
* Get a buffer for the bitmap or summary file block specified.
* The buffer is returned read and locked.
*/
@@ -68,9 +93,12 @@ xfs_rtbuf_get(
ASSERT(map.br_startblock != NULLFSBLOCK);
error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
XFS_FSB_TO_DADDR(mp, map.br_startblock),
- mp->m_bsize, 0, &bp, NULL);
+ mp->m_bsize, 0, &bp, &xfs_rtbuf_ops);
if (error)
return error;
+
+ xfs_trans_buf_set_type(tp, bp, issum ? XFS_BLFT_RTSUMMARY_BUF
+ : XFS_BLFT_RTBITMAP_BUF);
*bpp = bp;
return 0;
}
@@ -983,7 +1011,7 @@ xfs_rtfree_extent(
mp->m_sb.sb_rextents) {
if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM))
mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM;
- *(__uint64_t *)&mp->m_rbmip->i_d.di_atime = 0;
+ *(__uint64_t *)&VFS_I(mp->m_rbmip)->i_atime = 0;
xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
}
return 0;
diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h
index b25bb9a343f3..961e6475a309 100644
--- a/fs/xfs/libxfs/xfs_sb.h
+++ b/fs/xfs/libxfs/xfs_sb.h
@@ -27,7 +27,6 @@ extern struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *, xfs_agnumber_t,
extern void xfs_perag_put(struct xfs_perag *pag);
extern int xfs_initialize_perag_data(struct xfs_mount *, xfs_agnumber_t);
-extern void xfs_sb_calc_crc(struct xfs_buf *bp);
extern void xfs_log_sb(struct xfs_trans *tp);
extern int xfs_sync_sb(struct xfs_mount *mp, bool wait);
extern void xfs_sb_mount_common(struct xfs_mount *mp, struct xfs_sb *sbp);
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index 15c3ceb845b9..81ac870834da 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -53,6 +53,7 @@ extern const struct xfs_buf_ops xfs_dquot_buf_ra_ops;
extern const struct xfs_buf_ops xfs_sb_buf_ops;
extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops;
extern const struct xfs_buf_ops xfs_symlink_buf_ops;
+extern const struct xfs_buf_ops xfs_rtbuf_ops;
/*
* Transaction types. Used to distinguish types of buffers. These never reach
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index a9ebabfe7587..d445a64b979e 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -36,6 +36,21 @@
#include <linux/pagevec.h>
#include <linux/writeback.h>
+/* flags for direct write completions */
+#define XFS_DIO_FLAG_UNWRITTEN (1 << 0)
+#define XFS_DIO_FLAG_APPEND (1 << 1)
+
+/*
+ * structure owned by writepages passed to individual writepage calls
+ */
+struct xfs_writepage_ctx {
+ struct xfs_bmbt_irec imap;
+ bool imap_valid;
+ unsigned int io_type;
+ struct xfs_ioend *ioend;
+ sector_t last_block;
+};
+
void
xfs_count_page_state(
struct page *page,
@@ -214,10 +229,12 @@ xfs_end_io(
struct xfs_inode *ip = XFS_I(ioend->io_inode);
int error = 0;
- if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+ /*
+ * Set an error if the mount has shut down and proceed with end I/O
+ * processing so it can perform whatever cleanups are necessary.
+ */
+ if (XFS_FORCED_SHUTDOWN(ip->i_mount))
ioend->io_error = -EIO;
- goto done;
- }
/*
* For unwritten extents we need to issue transactions to convert a
@@ -265,7 +282,7 @@ xfs_alloc_ioend(
*/
atomic_set(&ioend->io_remaining, 1);
ioend->io_error = 0;
- ioend->io_list = NULL;
+ INIT_LIST_HEAD(&ioend->io_list);
ioend->io_type = type;
ioend->io_inode = inode;
ioend->io_buffer_head = NULL;
@@ -283,8 +300,7 @@ xfs_map_blocks(
struct inode *inode,
loff_t offset,
struct xfs_bmbt_irec *imap,
- int type,
- int nonblocking)
+ int type)
{
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
@@ -300,12 +316,7 @@ xfs_map_blocks(
if (type == XFS_IO_UNWRITTEN)
bmapi_flags |= XFS_BMAPI_IGSTATE;
- if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
- if (nonblocking)
- return -EAGAIN;
- xfs_ilock(ip, XFS_ILOCK_SHARED);
- }
-
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
(ip->i_df.if_flags & XFS_IFEXTENTS));
ASSERT(offset <= mp->m_super->s_maxbytes);
@@ -341,7 +352,7 @@ xfs_map_blocks(
return 0;
}
-STATIC int
+STATIC bool
xfs_imap_valid(
struct inode *inode,
struct xfs_bmbt_irec *imap,
@@ -414,8 +425,7 @@ xfs_start_buffer_writeback(
STATIC void
xfs_start_page_writeback(
struct page *page,
- int clear_dirty,
- int buffers)
+ int clear_dirty)
{
ASSERT(PageLocked(page));
ASSERT(!PageWriteback(page));
@@ -434,10 +444,6 @@ xfs_start_page_writeback(
set_page_writeback_keepwrite(page);
unlock_page(page);
-
- /* If no buffers on the page are to be written, finish it here */
- if (!buffers)
- end_page_writeback(page);
}
static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh)
@@ -446,153 +452,101 @@ static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh)
}
/*
- * Submit all of the bios for all of the ioends we have saved up, covering the
- * initial writepage page and also any probed pages.
- *
- * Because we may have multiple ioends spanning a page, we need to start
- * writeback on all the buffers before we submit them for I/O. If we mark the
- * buffers as we got, then we can end up with a page that only has buffers
- * marked async write and I/O complete on can occur before we mark the other
- * buffers async write.
- *
- * The end result of this is that we trip a bug in end_page_writeback() because
- * we call it twice for the one page as the code in end_buffer_async_write()
- * assumes that all buffers on the page are started at the same time.
- *
- * The fix is two passes across the ioend list - one to start writeback on the
- * buffer_heads, and then submit them for I/O on the second pass.
+ * Submit all of the bios for an ioend. We are only passed a single ioend at a
+ * time; the caller is responsible for chaining prior to submission.
*
* If @fail is non-zero, it means that we have a situation where some part of
* the submission process has failed after we have marked paged for writeback
* and unlocked them. In this situation, we need to fail the ioend chain rather
* than submit it to IO. This typically only happens on a filesystem shutdown.
*/
-STATIC void
+STATIC int
xfs_submit_ioend(
struct writeback_control *wbc,
xfs_ioend_t *ioend,
- int fail)
+ int status)
{
- xfs_ioend_t *head = ioend;
- xfs_ioend_t *next;
struct buffer_head *bh;
struct bio *bio;
sector_t lastblock = 0;
- /* Pass 1 - start writeback */
- do {
- next = ioend->io_list;
- for (bh = ioend->io_buffer_head; bh; bh = bh->b_private)
- xfs_start_buffer_writeback(bh);
- } while ((ioend = next) != NULL);
+ /* Reserve log space if we might write beyond the on-disk inode size. */
+ if (!status &&
+ ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend))
+ status = xfs_setfilesize_trans_alloc(ioend);
+ /*
+ * If we are failing the IO now, just mark the ioend with an
+ * error and finish it. This will run IO completion immediately
+ * as there is only one reference to the ioend at this point in
+ * time.
+ */
+ if (status) {
+ ioend->io_error = status;
+ xfs_finish_ioend(ioend);
+ return status;
+ }
- /* Pass 2 - submit I/O */
- ioend = head;
- do {
- next = ioend->io_list;
- bio = NULL;
+ bio = NULL;
+ for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
- /*
- * If we are failing the IO now, just mark the ioend with an
- * error and finish it. This will run IO completion immediately
- * as there is only one reference to the ioend at this point in
- * time.
- */
- if (fail) {
- ioend->io_error = fail;
- xfs_finish_ioend(ioend);
- continue;
+ if (!bio) {
+retry:
+ bio = xfs_alloc_ioend_bio(bh);
+ } else if (bh->b_blocknr != lastblock + 1) {
+ xfs_submit_ioend_bio(wbc, ioend, bio);
+ goto retry;
}
- for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
-
- if (!bio) {
- retry:
- bio = xfs_alloc_ioend_bio(bh);
- } else if (bh->b_blocknr != lastblock + 1) {
- xfs_submit_ioend_bio(wbc, ioend, bio);
- goto retry;
- }
-
- if (xfs_bio_add_buffer(bio, bh) != bh->b_size) {
- xfs_submit_ioend_bio(wbc, ioend, bio);
- goto retry;
- }
-
- lastblock = bh->b_blocknr;
- }
- if (bio)
+ if (xfs_bio_add_buffer(bio, bh) != bh->b_size) {
xfs_submit_ioend_bio(wbc, ioend, bio);
- xfs_finish_ioend(ioend);
- } while ((ioend = next) != NULL);
-}
-
-/*
- * Cancel submission of all buffer_heads so far in this endio.
- * Toss the endio too. Only ever called for the initial page
- * in a writepage request, so only ever one page.
- */
-STATIC void
-xfs_cancel_ioend(
- xfs_ioend_t *ioend)
-{
- xfs_ioend_t *next;
- struct buffer_head *bh, *next_bh;
-
- do {
- next = ioend->io_list;
- bh = ioend->io_buffer_head;
- do {
- next_bh = bh->b_private;
- clear_buffer_async_write(bh);
- /*
- * The unwritten flag is cleared when added to the
- * ioend. We're not submitting for I/O so mark the
- * buffer unwritten again for next time around.
- */
- if (ioend->io_type == XFS_IO_UNWRITTEN)
- set_buffer_unwritten(bh);
- unlock_buffer(bh);
- } while ((bh = next_bh) != NULL);
+ goto retry;
+ }
- mempool_free(ioend, xfs_ioend_pool);
- } while ((ioend = next) != NULL);
+ lastblock = bh->b_blocknr;
+ }
+ if (bio)
+ xfs_submit_ioend_bio(wbc, ioend, bio);
+ xfs_finish_ioend(ioend);
+ return 0;
}
/*
* Test to see if we've been building up a completion structure for
* earlier buffers -- if so, we try to append to this ioend if we
* can, otherwise we finish off any current ioend and start another.
- * Return true if we've finished the given ioend.
+ * Return the ioend we finished off so that the caller can submit it
+ * once it has finished processing the dirty page.
*/
STATIC void
xfs_add_to_ioend(
struct inode *inode,
struct buffer_head *bh,
xfs_off_t offset,
- unsigned int type,
- xfs_ioend_t **result,
- int need_ioend)
+ struct xfs_writepage_ctx *wpc,
+ struct list_head *iolist)
{
- xfs_ioend_t *ioend = *result;
-
- if (!ioend || need_ioend || type != ioend->io_type) {
- xfs_ioend_t *previous = *result;
-
- ioend = xfs_alloc_ioend(inode, type);
- ioend->io_offset = offset;
- ioend->io_buffer_head = bh;
- ioend->io_buffer_tail = bh;
- if (previous)
- previous->io_list = ioend;
- *result = ioend;
+ if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type ||
+ bh->b_blocknr != wpc->last_block + 1 ||
+ offset != wpc->ioend->io_offset + wpc->ioend->io_size) {
+ struct xfs_ioend *new;
+
+ if (wpc->ioend)
+ list_add(&wpc->ioend->io_list, iolist);
+
+ new = xfs_alloc_ioend(inode, wpc->io_type);
+ new->io_offset = offset;
+ new->io_buffer_head = bh;
+ new->io_buffer_tail = bh;
+ wpc->ioend = new;
} else {
- ioend->io_buffer_tail->b_private = bh;
- ioend->io_buffer_tail = bh;
+ wpc->ioend->io_buffer_tail->b_private = bh;
+ wpc->ioend->io_buffer_tail = bh;
}
bh->b_private = NULL;
- ioend->io_size += bh->b_size;
+ wpc->ioend->io_size += bh->b_size;
+ wpc->last_block = bh->b_blocknr;
+ xfs_start_buffer_writeback(bh);
}
STATIC void
@@ -678,183 +632,6 @@ xfs_check_page_type(
return false;
}
-/*
- * Allocate & map buffers for page given the extent map. Write it out.
- * except for the original page of a writepage, this is called on
- * delalloc/unwritten pages only, for the original page it is possible
- * that the page has no mapping at all.
- */
-STATIC int
-xfs_convert_page(
- struct inode *inode,
- struct page *page,
- loff_t tindex,
- struct xfs_bmbt_irec *imap,
- xfs_ioend_t **ioendp,
- struct writeback_control *wbc)
-{
- struct buffer_head *bh, *head;
- xfs_off_t end_offset;
- unsigned long p_offset;
- unsigned int type;
- int len, page_dirty;
- int count = 0, done = 0, uptodate = 1;
- xfs_off_t offset = page_offset(page);
-
- if (page->index != tindex)
- goto fail;
- if (!trylock_page(page))
- goto fail;
- if (PageWriteback(page))
- goto fail_unlock_page;
- if (page->mapping != inode->i_mapping)
- goto fail_unlock_page;
- if (!xfs_check_page_type(page, (*ioendp)->io_type, false))
- goto fail_unlock_page;
-
- /*
- * page_dirty is initially a count of buffers on the page before
- * EOF and is decremented as we move each into a cleanable state.
- *
- * Derivation:
- *
- * End offset is the highest offset that this page should represent.
- * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1))
- * will evaluate non-zero and be less than PAGE_CACHE_SIZE and
- * hence give us the correct page_dirty count. On any other page,
- * it will be zero and in that case we need page_dirty to be the
- * count of buffers on the page.
- */
- end_offset = min_t(unsigned long long,
- (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,
- i_size_read(inode));
-
- /*
- * If the current map does not span the entire page we are about to try
- * to write, then give up. The only way we can write a page that spans
- * multiple mappings in a single writeback iteration is via the
- * xfs_vm_writepage() function. Data integrity writeback requires the
- * entire page to be written in a single attempt, otherwise the part of
- * the page we don't write here doesn't get written as part of the data
- * integrity sync.
- *
- * For normal writeback, we also don't attempt to write partial pages
- * here as it simply means that write_cache_pages() will see it under
- * writeback and ignore the page until some point in the future, at
- * which time this will be the only page in the file that needs
- * writeback. Hence for more optimal IO patterns, we should always
- * avoid partial page writeback due to multiple mappings on a page here.
- */
- if (!xfs_imap_valid(inode, imap, end_offset))
- goto fail_unlock_page;
-
- len = 1 << inode->i_blkbits;
- p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1),
- PAGE_CACHE_SIZE);
- p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE;
- page_dirty = p_offset / len;
-
- /*
- * The moment we find a buffer that doesn't match our current type
- * specification or can't be written, abort the loop and start
- * writeback. As per the above xfs_imap_valid() check, only
- * xfs_vm_writepage() can handle partial page writeback fully - we are
- * limited here to the buffers that are contiguous with the current
- * ioend, and hence a buffer we can't write breaks that contiguity and
- * we have to defer the rest of the IO to xfs_vm_writepage().
- */
- bh = head = page_buffers(page);
- do {
- if (offset >= end_offset)
- break;
- if (!buffer_uptodate(bh))
- uptodate = 0;
- if (!(PageUptodate(page) || buffer_uptodate(bh))) {
- done = 1;
- break;
- }
-
- if (buffer_unwritten(bh) || buffer_delay(bh) ||
- buffer_mapped(bh)) {
- if (buffer_unwritten(bh))
- type = XFS_IO_UNWRITTEN;
- else if (buffer_delay(bh))
- type = XFS_IO_DELALLOC;
- else
- type = XFS_IO_OVERWRITE;
-
- /*
- * imap should always be valid because of the above
- * partial page end_offset check on the imap.
- */
- ASSERT(xfs_imap_valid(inode, imap, offset));
-
- lock_buffer(bh);
- if (type != XFS_IO_OVERWRITE)
- xfs_map_at_offset(inode, bh, imap, offset);
- xfs_add_to_ioend(inode, bh, offset, type,
- ioendp, done);
-
- page_dirty--;
- count++;
- } else {
- done = 1;
- break;
- }
- } while (offset += len, (bh = bh->b_this_page) != head);
-
- if (uptodate && bh == head)
- SetPageUptodate(page);
-
- if (count) {
- if (--wbc->nr_to_write <= 0 &&
- wbc->sync_mode == WB_SYNC_NONE)
- done = 1;
- }
- xfs_start_page_writeback(page, !page_dirty, count);
-
- return done;
- fail_unlock_page:
- unlock_page(page);
- fail:
- return 1;
-}
-
-/*
- * Convert & write out a cluster of pages in the same extent as defined
- * by mp and following the start page.
- */
-STATIC void
-xfs_cluster_write(
- struct inode *inode,
- pgoff_t tindex,
- struct xfs_bmbt_irec *imap,
- xfs_ioend_t **ioendp,
- struct writeback_control *wbc,
- pgoff_t tlast)
-{
- struct pagevec pvec;
- int done = 0, i;
-
- pagevec_init(&pvec, 0);
- while (!done && tindex <= tlast) {
- unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);
-
- if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))
- break;
-
- for (i = 0; i < pagevec_count(&pvec); i++) {
- done = xfs_convert_page(inode, pvec.pages[i], tindex++,
- imap, ioendp, wbc);
- if (done)
- break;
- }
-
- pagevec_release(&pvec);
- cond_resched();
- }
-}
-
STATIC void
xfs_vm_invalidatepage(
struct page *page,
@@ -932,6 +709,164 @@ out_invalidate:
}
/*
+ * We implement an immediate ioend submission policy here to avoid needing to
+ * chain multiple ioends and hence nest mempool allocations which can violate
+ * forward progress guarantees we need to provide. The current ioend we are
+ * adding buffers to is cached on the writepage context, and if the new buffer
+ * does not append to the cached ioend it will create a new ioend and cache that
+ * instead.
+ *
+ * If a new ioend is created and cached, the old ioend is returned and queued
+ * locally for submission once the entire page is processed or an error has been
+ * detected. While ioends are submitted immediately after they are completed,
+ * batching optimisations are provided by higher level block plugging.
+ *
+ * At the end of a writeback pass, there will be a cached ioend remaining on the
+ * writepage context that the caller will need to submit.
+ */
+static int
+xfs_writepage_map(
+ struct xfs_writepage_ctx *wpc,
+ struct writeback_control *wbc,
+ struct inode *inode,
+ struct page *page,
+ loff_t offset,
+ __uint64_t end_offset)
+{
+ LIST_HEAD(submit_list);
+ struct xfs_ioend *ioend, *next;
+ struct buffer_head *bh, *head;
+ ssize_t len = 1 << inode->i_blkbits;
+ int error = 0;
+ int count = 0;
+ int uptodate = 1;
+
+ bh = head = page_buffers(page);
+ offset = page_offset(page);
+ do {
+ if (offset >= end_offset)
+ break;
+ if (!buffer_uptodate(bh))
+ uptodate = 0;
+
+ /*
+ * set_page_dirty dirties all buffers in a page, independent
+ * of their state. The dirty state however is entirely
+ * meaningless for holes (!mapped && uptodate), so skip
+ * buffers covering holes here.
+ */
+ if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
+ wpc->imap_valid = false;
+ continue;
+ }
+
+ if (buffer_unwritten(bh)) {
+ if (wpc->io_type != XFS_IO_UNWRITTEN) {
+ wpc->io_type = XFS_IO_UNWRITTEN;
+ wpc->imap_valid = false;
+ }
+ } else if (buffer_delay(bh)) {
+ if (wpc->io_type != XFS_IO_DELALLOC) {
+ wpc->io_type = XFS_IO_DELALLOC;
+ wpc->imap_valid = false;
+ }
+ } else if (buffer_uptodate(bh)) {
+ if (wpc->io_type != XFS_IO_OVERWRITE) {
+ wpc->io_type = XFS_IO_OVERWRITE;
+ wpc->imap_valid = false;
+ }
+ } else {
+ if (PageUptodate(page))
+ ASSERT(buffer_mapped(bh));
+ /*
+ * This buffer is not uptodate and will not be
+ * written to disk. Ensure that we will put any
+ * subsequent writeable buffers into a new
+ * ioend.
+ */
+ wpc->imap_valid = false;
+ continue;
+ }
+
+ if (wpc->imap_valid)
+ wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap,
+ offset);
+ if (!wpc->imap_valid) {
+ error = xfs_map_blocks(inode, offset, &wpc->imap,
+ wpc->io_type);
+ if (error)
+ goto out;
+ wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap,
+ offset);
+ }
+ if (wpc->imap_valid) {
+ lock_buffer(bh);
+ if (wpc->io_type != XFS_IO_OVERWRITE)
+ xfs_map_at_offset(inode, bh, &wpc->imap, offset);
+ xfs_add_to_ioend(inode, bh, offset, wpc, &submit_list);
+ count++;
+ }
+
+ } while (offset += len, ((bh = bh->b_this_page) != head));
+
+ if (uptodate && bh == head)
+ SetPageUptodate(page);
+
+ ASSERT(wpc->ioend || list_empty(&submit_list));
+
+out:
+ /*
+ * On error, we have to fail the ioend here because we have locked
+ * buffers in the ioend. If we don't do this, we'll deadlock
+ * invalidating the page as that tries to lock the buffers on the page.
+ * Also, because we may have set pages under writeback, we have to make
+ * sure we run IO completion to mark the error state of the IO
+ * appropriately, so we can't cancel the ioend directly here. That means
+ * we have to mark this page as under writeback if we included any
+ * buffers from it in the ioend chain so that completion treats it
+ * correctly.
+ *
+ * If we didn't include the page in the ioend, the on error we can
+ * simply discard and unlock it as there are no other users of the page
+ * or it's buffers right now. The caller will still need to trigger
+ * submission of outstanding ioends on the writepage context so they are
+ * treated correctly on error.
+ */
+ if (count) {
+ xfs_start_page_writeback(page, !error);
+
+ /*
+ * Preserve the original error if there was one, otherwise catch
+ * submission errors here and propagate into subsequent ioend
+ * submissions.
+ */
+ list_for_each_entry_safe(ioend, next, &submit_list, io_list) {
+ int error2;
+
+ list_del_init(&ioend->io_list);
+ error2 = xfs_submit_ioend(wbc, ioend, error);
+ if (error2 && !error)
+ error = error2;
+ }
+ } else if (error) {
+ xfs_aops_discard_page(page);
+ ClearPageUptodate(page);
+ unlock_page(page);
+ } else {
+ /*
+ * We can end up here with no error and nothing to write if we
+ * race with a partial page truncate on a sub-page block sized
+ * filesystem. In that case we need to mark the page clean.
+ */
+ xfs_start_page_writeback(page, 1);
+ end_page_writeback(page);
+ }
+
+ mapping_set_error(page->mapping, error);
+ return error;
+}
+
+/*
* Write out a dirty page.
*
* For delalloc space on the page we need to allocate space and flush it.
@@ -940,22 +875,16 @@ out_invalidate:
* For any other dirty buffer heads on the page we should flush them.
*/
STATIC int
-xfs_vm_writepage(
+xfs_do_writepage(
struct page *page,
- struct writeback_control *wbc)
+ struct writeback_control *wbc,
+ void *data)
{
+ struct xfs_writepage_ctx *wpc = data;
struct inode *inode = page->mapping->host;
- struct buffer_head *bh, *head;
- struct xfs_bmbt_irec imap;
- xfs_ioend_t *ioend = NULL, *iohead = NULL;
loff_t offset;
- unsigned int type;
__uint64_t end_offset;
- pgoff_t end_index, last_index;
- ssize_t len;
- int err, imap_valid = 0, uptodate = 1;
- int count = 0;
- int nonblocking = 0;
+ pgoff_t end_index;
trace_xfs_writepage(inode, page, 0, 0);
@@ -982,12 +911,9 @@ xfs_vm_writepage(
if (WARN_ON_ONCE(current->flags & PF_FSTRANS))
goto redirty;
- /* Is this page beyond the end of the file? */
- offset = i_size_read(inode);
- end_index = offset >> PAGE_CACHE_SHIFT;
- last_index = (offset - 1) >> PAGE_CACHE_SHIFT;
-
/*
+ * Is this page beyond the end of the file?
+ *
* The page index is less than the end_index, adjust the end_offset
* to the highest offset that this page should represent.
* -----------------------------------------------------
@@ -998,6 +924,8 @@ xfs_vm_writepage(
* | desired writeback range | see else |
* ---------------------------------^------------------|
*/
+ offset = i_size_read(inode);
+ end_index = offset >> PAGE_CACHE_SHIFT;
if (page->index < end_index)
end_offset = (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT;
else {
@@ -1049,152 +977,7 @@ xfs_vm_writepage(
end_offset = offset;
}
- len = 1 << inode->i_blkbits;
-
- bh = head = page_buffers(page);
- offset = page_offset(page);
- type = XFS_IO_OVERWRITE;
-
- if (wbc->sync_mode == WB_SYNC_NONE)
- nonblocking = 1;
-
- do {
- int new_ioend = 0;
-
- if (offset >= end_offset)
- break;
- if (!buffer_uptodate(bh))
- uptodate = 0;
-
- /*
- * set_page_dirty dirties all buffers in a page, independent
- * of their state. The dirty state however is entirely
- * meaningless for holes (!mapped && uptodate), so skip
- * buffers covering holes here.
- */
- if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
- imap_valid = 0;
- continue;
- }
-
- if (buffer_unwritten(bh)) {
- if (type != XFS_IO_UNWRITTEN) {
- type = XFS_IO_UNWRITTEN;
- imap_valid = 0;
- }
- } else if (buffer_delay(bh)) {
- if (type != XFS_IO_DELALLOC) {
- type = XFS_IO_DELALLOC;
- imap_valid = 0;
- }
- } else if (buffer_uptodate(bh)) {
- if (type != XFS_IO_OVERWRITE) {
- type = XFS_IO_OVERWRITE;
- imap_valid = 0;
- }
- } else {
- if (PageUptodate(page))
- ASSERT(buffer_mapped(bh));
- /*
- * This buffer is not uptodate and will not be
- * written to disk. Ensure that we will put any
- * subsequent writeable buffers into a new
- * ioend.
- */
- imap_valid = 0;
- continue;
- }
-
- if (imap_valid)
- imap_valid = xfs_imap_valid(inode, &imap, offset);
- if (!imap_valid) {
- /*
- * If we didn't have a valid mapping then we need to
- * put the new mapping into a separate ioend structure.
- * This ensures non-contiguous extents always have
- * separate ioends, which is particularly important
- * for unwritten extent conversion at I/O completion
- * time.
- */
- new_ioend = 1;
- err = xfs_map_blocks(inode, offset, &imap, type,
- nonblocking);
- if (err)
- goto error;
- imap_valid = xfs_imap_valid(inode, &imap, offset);
- }
- if (imap_valid) {
- lock_buffer(bh);
- if (type != XFS_IO_OVERWRITE)
- xfs_map_at_offset(inode, bh, &imap, offset);
- xfs_add_to_ioend(inode, bh, offset, type, &ioend,
- new_ioend);
- count++;
- }
-
- if (!iohead)
- iohead = ioend;
-
- } while (offset += len, ((bh = bh->b_this_page) != head));
-
- if (uptodate && bh == head)
- SetPageUptodate(page);
-
- xfs_start_page_writeback(page, 1, count);
-
- /* if there is no IO to be submitted for this page, we are done */
- if (!ioend)
- return 0;
-
- ASSERT(iohead);
-
- /*
- * Any errors from this point onwards need tobe reported through the IO
- * completion path as we have marked the initial page as under writeback
- * and unlocked it.
- */
- if (imap_valid) {
- xfs_off_t end_index;
-
- end_index = imap.br_startoff + imap.br_blockcount;
-
- /* to bytes */
- end_index <<= inode->i_blkbits;
-
- /* to pages */
- end_index = (end_index - 1) >> PAGE_CACHE_SHIFT;
-
- /* check against file size */
- if (end_index > last_index)
- end_index = last_index;
-
- xfs_cluster_write(inode, page->index + 1, &imap, &ioend,
- wbc, end_index);
- }
-
-
- /*
- * Reserve log space if we might write beyond the on-disk inode size.
- */
- err = 0;
- if (ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend))
- err = xfs_setfilesize_trans_alloc(ioend);
-
- xfs_submit_ioend(wbc, iohead, err);
-
- return 0;
-
-error:
- if (iohead)
- xfs_cancel_ioend(iohead);
-
- if (err == -EAGAIN)
- goto redirty;
-
- xfs_aops_discard_page(page);
- ClearPageUptodate(page);
- unlock_page(page);
- return err;
+ return xfs_writepage_map(wpc, wbc, inode, page, offset, end_offset);
redirty:
redirty_page_for_writepage(wbc, page);
@@ -1203,16 +986,40 @@ redirty:
}
STATIC int
+xfs_vm_writepage(
+ struct page *page,
+ struct writeback_control *wbc)
+{
+ struct xfs_writepage_ctx wpc = {
+ .io_type = XFS_IO_INVALID,
+ };
+ int ret;
+
+ ret = xfs_do_writepage(page, wbc, &wpc);
+ if (wpc.ioend)
+ ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
+ return ret;
+}
+
+STATIC int
xfs_vm_writepages(
struct address_space *mapping,
struct writeback_control *wbc)
{
+ struct xfs_writepage_ctx wpc = {
+ .io_type = XFS_IO_INVALID,
+ };
+ int ret;
+
xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
if (dax_mapping(mapping))
return dax_writeback_mapping_range(mapping,
xfs_find_bdev_for_inode(mapping->host), wbc);
- return generic_writepages(mapping, wbc);
+ ret = write_cache_pages(mapping, wbc, xfs_do_writepage, &wpc);
+ if (wpc.ioend)
+ ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
+ return ret;
}
/*
@@ -1242,27 +1049,8 @@ xfs_vm_releasepage(
}
/*
- * When we map a DIO buffer, we may need to attach an ioend that describes the
- * type of write IO we are doing. This passes to the completion function the
- * operations it needs to perform. If the mapping is for an overwrite wholly
- * within the EOF then we don't need an ioend and so we don't allocate one.
- * This avoids the unnecessary overhead of allocating and freeing ioends for
- * workloads that don't require transactions on IO completion.
- *
- * If we get multiple mappings in a single IO, we might be mapping different
- * types. But because the direct IO can only have a single private pointer, we
- * need to ensure that:
- *
- * a) i) the ioend spans the entire region of unwritten mappings; or
- * ii) the ioend spans all the mappings that cross or are beyond EOF; and
- * b) if it contains unwritten extents, it is *permanently* marked as such
- *
- * We could do this by chaining ioends like buffered IO does, but we only
- * actually get one IO completion callback from the direct IO, and that spans
- * the entire IO regardless of how many mappings and IOs are needed to complete
- * the DIO. There is only going to be one reference to the ioend and its life
- * cycle is constrained by the DIO completion code. hence we don't need
- * reference counting here.
+ * When we map a DIO buffer, we may need to pass flags to
+ * xfs_end_io_direct_write to tell it what kind of write IO we are doing.
*
* Note that for DIO, an IO to the highest supported file block offset (i.e.
* 2^63 - 1FSB bytes) will result in the offset + count overflowing a signed 64
@@ -1270,68 +1058,26 @@ xfs_vm_releasepage(
* extending the file size. We won't know for sure until IO completion is run
* and the actual max write offset is communicated to the IO completion
* routine.
- *
- * For DAX page faults, we are preparing to never see unwritten extents here,
- * nor should we ever extend the inode size. Hence we will soon have nothing to
- * do here for this case, ensuring we don't have to provide an IO completion
- * callback to free an ioend that we don't actually need for a fault into the
- * page at offset (2^63 - 1FSB) bytes.
*/
-
static void
xfs_map_direct(
struct inode *inode,
struct buffer_head *bh_result,
struct xfs_bmbt_irec *imap,
- xfs_off_t offset,
- bool dax_fault)
+ xfs_off_t offset)
{
- struct xfs_ioend *ioend;
+ uintptr_t *flags = (uintptr_t *)&bh_result->b_private;
xfs_off_t size = bh_result->b_size;
- int type;
-
- if (ISUNWRITTEN(imap))
- type = XFS_IO_UNWRITTEN;
- else
- type = XFS_IO_OVERWRITE;
- trace_xfs_gbmap_direct(XFS_I(inode), offset, size, type, imap);
-
- if (dax_fault) {
- ASSERT(type == XFS_IO_OVERWRITE);
- trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type,
- imap);
- return;
- }
+ trace_xfs_get_blocks_map_direct(XFS_I(inode), offset, size,
+ ISUNWRITTEN(imap) ? XFS_IO_UNWRITTEN : XFS_IO_OVERWRITE, imap);
- if (bh_result->b_private) {
- ioend = bh_result->b_private;
- ASSERT(ioend->io_size > 0);
- ASSERT(offset >= ioend->io_offset);
- if (offset + size > ioend->io_offset + ioend->io_size)
- ioend->io_size = offset - ioend->io_offset + size;
-
- if (type == XFS_IO_UNWRITTEN && type != ioend->io_type)
- ioend->io_type = XFS_IO_UNWRITTEN;
-
- trace_xfs_gbmap_direct_update(XFS_I(inode), ioend->io_offset,
- ioend->io_size, ioend->io_type,
- imap);
- } else if (type == XFS_IO_UNWRITTEN ||
- offset + size > i_size_read(inode) ||
- offset + size < 0) {
- ioend = xfs_alloc_ioend(inode, type);
- ioend->io_offset = offset;
- ioend->io_size = size;
-
- bh_result->b_private = ioend;
+ if (ISUNWRITTEN(imap)) {
+ *flags |= XFS_DIO_FLAG_UNWRITTEN;
+ set_buffer_defer_completion(bh_result);
+ } else if (offset + size > i_size_read(inode) || offset + size < 0) {
+ *flags |= XFS_DIO_FLAG_APPEND;
set_buffer_defer_completion(bh_result);
-
- trace_xfs_gbmap_direct_new(XFS_I(inode), offset, size, type,
- imap);
- } else {
- trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type,
- imap);
}
}
@@ -1502,9 +1248,12 @@ __xfs_get_blocks(
if (ISUNWRITTEN(&imap))
set_buffer_unwritten(bh_result);
/* direct IO needs special help */
- if (create && direct)
- xfs_map_direct(inode, bh_result, &imap, offset,
- dax_fault);
+ if (create && direct) {
+ if (dax_fault)
+ ASSERT(!ISUNWRITTEN(&imap));
+ else
+ xfs_map_direct(inode, bh_result, &imap, offset);
+ }
}
/*
@@ -1574,42 +1323,50 @@ xfs_get_blocks_dax_fault(
return __xfs_get_blocks(inode, iblock, bh_result, create, true, true);
}
-static void
-__xfs_end_io_direct_write(
- struct inode *inode,
- struct xfs_ioend *ioend,
+/*
+ * Complete a direct I/O write request.
+ *
+ * xfs_map_direct passes us some flags in the private data to tell us what to
+ * do. If no flags are set, then the write IO is an overwrite wholly within
+ * the existing allocated file size and so there is nothing for us to do.
+ *
+ * Note that in this case the completion can be called in interrupt context,
+ * whereas if we have flags set we will always be called in task context
+ * (i.e. from a workqueue).
+ */
+STATIC int
+xfs_end_io_direct_write(
+ struct kiocb *iocb,
loff_t offset,
- ssize_t size)
+ ssize_t size,
+ void *private)
{
- struct xfs_mount *mp = XFS_I(inode)->i_mount;
+ struct inode *inode = file_inode(iocb->ki_filp);
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ uintptr_t flags = (uintptr_t)private;
+ int error = 0;
- if (XFS_FORCED_SHUTDOWN(mp) || ioend->io_error)
- goto out_end_io;
+ trace_xfs_end_io_direct_write(ip, offset, size);
- /*
- * dio completion end_io functions are only called on writes if more
- * than 0 bytes was written.
- */
- ASSERT(size > 0);
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -EIO;
- /*
- * The ioend only maps whole blocks, while the IO may be sector aligned.
- * Hence the ioend offset/size may not match the IO offset/size exactly.
- * Because we don't map overwrites within EOF into the ioend, the offset
- * may not match, but only if the endio spans EOF. Either way, write
- * the IO sizes into the ioend so that completion processing does the
- * right thing.
- */
- ASSERT(offset + size <= ioend->io_offset + ioend->io_size);
- ioend->io_size = size;
- ioend->io_offset = offset;
+ if (size <= 0)
+ return size;
/*
- * The ioend tells us whether we are doing unwritten extent conversion
+ * The flags tell us whether we are doing unwritten extent conversions
* or an append transaction that updates the on-disk file size. These
* cases are the only cases where we should *potentially* be needing
* to update the VFS inode size.
- *
+ */
+ if (flags == 0) {
+ ASSERT(offset + size <= i_size_read(inode));
+ return 0;
+ }
+
+ /*
* We need to update the in-core inode size here so that we don't end up
* with the on-disk inode size being outside the in-core inode size. We
* have no other method of updating EOF for AIO, so always do it here
@@ -1620,91 +1377,56 @@ __xfs_end_io_direct_write(
* here can result in EOF moving backwards and Bad Things Happen when
* that occurs.
*/
- spin_lock(&XFS_I(inode)->i_flags_lock);
+ spin_lock(&ip->i_flags_lock);
if (offset + size > i_size_read(inode))
i_size_write(inode, offset + size);
- spin_unlock(&XFS_I(inode)->i_flags_lock);
+ spin_unlock(&ip->i_flags_lock);
- /*
- * If we are doing an append IO that needs to update the EOF on disk,
- * do the transaction reserve now so we can use common end io
- * processing. Stashing the error (if there is one) in the ioend will
- * result in the ioend processing passing on the error if it is
- * possible as we can't return it from here.
- */
- if (ioend->io_type == XFS_IO_OVERWRITE)
- ioend->io_error = xfs_setfilesize_trans_alloc(ioend);
+ if (flags & XFS_DIO_FLAG_UNWRITTEN) {
+ trace_xfs_end_io_direct_write_unwritten(ip, offset, size);
-out_end_io:
- xfs_end_io(&ioend->io_work);
- return;
-}
+ error = xfs_iomap_write_unwritten(ip, offset, size);
+ } else if (flags & XFS_DIO_FLAG_APPEND) {
+ struct xfs_trans *tp;
-/*
- * Complete a direct I/O write request.
- *
- * The ioend structure is passed from __xfs_get_blocks() to tell us what to do.
- * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite
- * wholly within the EOF and so there is nothing for us to do. Note that in this
- * case the completion can be called in interrupt context, whereas if we have an
- * ioend we will always be called in task context (i.e. from a workqueue).
- */
-STATIC void
-xfs_end_io_direct_write(
- struct kiocb *iocb,
- loff_t offset,
- ssize_t size,
- void *private)
-{
- struct inode *inode = file_inode(iocb->ki_filp);
- struct xfs_ioend *ioend = private;
-
- trace_xfs_gbmap_direct_endio(XFS_I(inode), offset, size,
- ioend ? ioend->io_type : 0, NULL);
+ trace_xfs_end_io_direct_write_append(ip, offset, size);
- if (!ioend) {
- ASSERT(offset + size <= i_size_read(inode));
- return;
+ tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
+ if (error) {
+ xfs_trans_cancel(tp);
+ return error;
+ }
+ error = xfs_setfilesize(ip, tp, offset, size);
}
- __xfs_end_io_direct_write(inode, ioend, offset, size);
+ return error;
}
-static inline ssize_t
-xfs_vm_do_dio(
- struct inode *inode,
+STATIC ssize_t
+xfs_vm_direct_IO(
struct kiocb *iocb,
struct iov_iter *iter,
- loff_t offset,
- void (*endio)(struct kiocb *iocb,
- loff_t offset,
- ssize_t size,
- void *private),
- int flags)
+ loff_t offset)
{
+ struct inode *inode = iocb->ki_filp->f_mapping->host;
+ dio_iodone_t *endio = NULL;
+ int flags = 0;
struct block_device *bdev;
- if (IS_DAX(inode))
+ if (iov_iter_rw(iter) == WRITE) {
+ endio = xfs_end_io_direct_write;
+ flags = DIO_ASYNC_EXTEND;
+ }
+
+ if (IS_DAX(inode)) {
return dax_do_io(iocb, inode, iter, offset,
xfs_get_blocks_direct, endio, 0);
+ }
bdev = xfs_find_bdev_for_inode(inode);
return __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
- xfs_get_blocks_direct, endio, NULL, flags);
-}
-
-STATIC ssize_t
-xfs_vm_direct_IO(
- struct kiocb *iocb,
- struct iov_iter *iter,
- loff_t offset)
-{
- struct inode *inode = iocb->ki_filp->f_mapping->host;
-
- if (iov_iter_rw(iter) == WRITE)
- return xfs_vm_do_dio(inode, iocb, iter, offset,
- xfs_end_io_direct_write, DIO_ASYNC_EXTEND);
- return xfs_vm_do_dio(inode, iocb, iter, offset, NULL, 0);
+ xfs_get_blocks_direct, endio, NULL, flags);
}
/*
@@ -1756,6 +1478,7 @@ xfs_vm_write_failed(
loff_t from = pos & (PAGE_CACHE_SIZE - 1);
loff_t to = from + len;
struct buffer_head *bh, *head;
+ struct xfs_mount *mp = XFS_I(inode)->i_mount;
/*
* The request pos offset might be 32 or 64 bit, this is all fine
@@ -1787,14 +1510,23 @@ xfs_vm_write_failed(
if (block_start >= to)
break;
- if (!buffer_delay(bh))
+ /*
+ * Process delalloc and unwritten buffers beyond EOF. We can
+ * encounter unwritten buffers in the event that a file has
+ * post-EOF unwritten extents and an extending write happens to
+ * fail (e.g., an unaligned write that also involves a delalloc
+ * to the same page).
+ */
+ if (!buffer_delay(bh) && !buffer_unwritten(bh))
continue;
- if (!buffer_new(bh) && block_offset < i_size_read(inode))
+ if (!xfs_mp_fail_writes(mp) && !buffer_new(bh) &&
+ block_offset < i_size_read(inode))
continue;
- xfs_vm_kill_delalloc_range(inode, block_offset,
- block_offset + bh->b_size);
+ if (buffer_delay(bh))
+ xfs_vm_kill_delalloc_range(inode, block_offset,
+ block_offset + bh->b_size);
/*
* This buffer does not contain data anymore. make sure anyone
@@ -1805,6 +1537,7 @@ xfs_vm_write_failed(
clear_buffer_mapped(bh);
clear_buffer_new(bh);
clear_buffer_dirty(bh);
+ clear_buffer_unwritten(bh);
}
}
@@ -1828,6 +1561,7 @@ xfs_vm_write_begin(
pgoff_t index = pos >> PAGE_CACHE_SHIFT;
struct page *page;
int status;
+ struct xfs_mount *mp = XFS_I(mapping->host)->i_mount;
ASSERT(len <= PAGE_CACHE_SIZE);
@@ -1836,6 +1570,8 @@ xfs_vm_write_begin(
return -ENOMEM;
status = __block_write_begin(page, pos, len, xfs_get_blocks);
+ if (xfs_mp_fail_writes(mp))
+ status = -EIO;
if (unlikely(status)) {
struct inode *inode = mapping->host;
size_t isize = i_size_read(inode);
@@ -1848,6 +1584,8 @@ xfs_vm_write_begin(
* allocated in this write, not blocks that were previously
* written successfully.
*/
+ if (xfs_mp_fail_writes(mp))
+ isize = 0;
if (pos + len > isize) {
ssize_t start = max_t(ssize_t, pos, isize);
@@ -1957,7 +1695,6 @@ xfs_vm_set_page_dirty(
loff_t end_offset;
loff_t offset;
int newly_dirty;
- struct mem_cgroup *memcg;
if (unlikely(!mapping))
return !TestSetPageDirty(page);
@@ -1978,10 +1715,10 @@ xfs_vm_set_page_dirty(
} while (bh != head);
}
/*
- * Use mem_group_begin_page_stat() to keep PageDirty synchronized with
- * per-memcg dirty page counters.
+ * Lock out page->mem_cgroup migration to keep PageDirty
+ * synchronized with per-memcg dirty page counters.
*/
- memcg = mem_cgroup_begin_page_stat(page);
+ lock_page_memcg(page);
newly_dirty = !TestSetPageDirty(page);
spin_unlock(&mapping->private_lock);
@@ -1992,13 +1729,13 @@ xfs_vm_set_page_dirty(
spin_lock_irqsave(&mapping->tree_lock, flags);
if (page->mapping) { /* Race with truncate? */
WARN_ON_ONCE(!PageUptodate(page));
- account_page_dirtied(page, mapping, memcg);
+ account_page_dirtied(page, mapping);
radix_tree_tag_set(&mapping->page_tree,
page_index(page), PAGECACHE_TAG_DIRTY);
}
spin_unlock_irqrestore(&mapping->tree_lock, flags);
}
- mem_cgroup_end_page_stat(memcg);
+ unlock_page_memcg(page);
if (newly_dirty)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
return newly_dirty;
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index a4343c63fb38..b4421177b68d 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -24,12 +24,14 @@ extern mempool_t *xfs_ioend_pool;
* Types of I/O for bmap clustering and I/O completion tracking.
*/
enum {
+ XFS_IO_INVALID, /* initial state */
XFS_IO_DELALLOC, /* covers delalloc region */
XFS_IO_UNWRITTEN, /* covers allocated but uninitialized data */
XFS_IO_OVERWRITE, /* covers already allocated extent */
};
#define XFS_IO_TYPES \
+ { XFS_IO_INVALID, "invalid" }, \
{ XFS_IO_DELALLOC, "delalloc" }, \
{ XFS_IO_UNWRITTEN, "unwritten" }, \
{ XFS_IO_OVERWRITE, "overwrite" }
@@ -39,7 +41,7 @@ enum {
* It can manage several multi-page bio's at once.
*/
typedef struct xfs_ioend {
- struct xfs_ioend *io_list; /* next ioend in chain */
+ struct list_head io_list; /* next ioend in chain */
unsigned int io_type; /* delalloc / unwritten */
int io_error; /* I/O error code */
atomic_t io_remaining; /* hold count */
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index 0ef7c2ed3f8a..4fa14820e2e2 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -202,8 +202,10 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
sbp->namelen,
sbp->valuelen,
&sbp->name[sbp->namelen]);
- if (error)
+ if (error) {
+ kmem_free(sbuf);
return error;
+ }
if (context->seen_enough)
break;
cursor->offset++;
@@ -454,14 +456,13 @@ xfs_attr3_leaf_list_int(
args.rmtblkcnt = xfs_attr3_rmt_blocks(
args.dp->i_mount, valuelen);
retval = xfs_attr_rmtval_get(&args);
- if (retval)
- return retval;
- retval = context->put_listent(context,
- entry->flags,
- name_rmt->name,
- (int)name_rmt->namelen,
- valuelen,
- args.value);
+ if (!retval)
+ retval = context->put_listent(context,
+ entry->flags,
+ name_rmt->name,
+ (int)name_rmt->namelen,
+ valuelen,
+ args.value);
kmem_free(args.value);
} else {
retval = context->put_listent(context,
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 6c876012b2e5..a32c1dcae2ff 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -203,10 +203,12 @@ xfs_bmap_rtalloc(
ralen = MAXEXTLEN / mp->m_sb.sb_rextsize;
/*
- * Lock out other modifications to the RT bitmap inode.
+ * Lock out modifications to both the RT bitmap and summary inodes
*/
xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL);
+ xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(ap->tp, mp->m_rsumip, XFS_ILOCK_EXCL);
/*
* If it's an allocation to an empty file at offset 0,
@@ -822,7 +824,7 @@ bool
xfs_can_free_eofblocks(struct xfs_inode *ip, bool force)
{
/* prealloc/delalloc exists only on regular files */
- if (!S_ISREG(ip->i_d.di_mode))
+ if (!S_ISREG(VFS_I(ip)->i_mode))
return false;
/*
@@ -1727,7 +1729,7 @@ xfs_swap_extents(
xfs_lock_two_inodes(ip, tip, XFS_MMAPLOCK_EXCL);
/* Verify that both files have the same format */
- if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) {
+ if ((VFS_I(ip)->i_mode & S_IFMT) != (VFS_I(tip)->i_mode & S_IFMT)) {
error = -EINVAL;
goto out_unlock;
}
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 435c7de42e5f..9a2191b91137 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -650,7 +650,7 @@ xfs_buf_read_map(
if (bp) {
trace_xfs_buf_read(bp, flags, _RET_IP_);
- if (!XFS_BUF_ISDONE(bp)) {
+ if (!(bp->b_flags & XBF_DONE)) {
XFS_STATS_INC(target->bt_mount, xb_get_read);
bp->b_ops = ops;
_xfs_buf_read(bp, flags);
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index c75721acd867..4eb89bd4ee73 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -302,6 +302,7 @@ extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *,
/* Buffer Utility Routines */
extern void *xfs_buf_offset(struct xfs_buf *, size_t);
+extern void xfs_buf_stale(struct xfs_buf *bp);
/* Delayed Write Buffer Routines */
extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *);
@@ -312,31 +313,6 @@ extern int xfs_buf_delwri_submit_nowait(struct list_head *);
extern int xfs_buf_init(void);
extern void xfs_buf_terminate(void);
-#define XFS_BUF_ZEROFLAGS(bp) \
- ((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC| \
- XBF_SYNCIO|XBF_FUA|XBF_FLUSH| \
- XBF_WRITE_FAIL))
-
-void xfs_buf_stale(struct xfs_buf *bp);
-#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE)
-#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE)
-
-#define XFS_BUF_DONE(bp) ((bp)->b_flags |= XBF_DONE)
-#define XFS_BUF_UNDONE(bp) ((bp)->b_flags &= ~XBF_DONE)
-#define XFS_BUF_ISDONE(bp) ((bp)->b_flags & XBF_DONE)
-
-#define XFS_BUF_ASYNC(bp) ((bp)->b_flags |= XBF_ASYNC)
-#define XFS_BUF_UNASYNC(bp) ((bp)->b_flags &= ~XBF_ASYNC)
-#define XFS_BUF_ISASYNC(bp) ((bp)->b_flags & XBF_ASYNC)
-
-#define XFS_BUF_READ(bp) ((bp)->b_flags |= XBF_READ)
-#define XFS_BUF_UNREAD(bp) ((bp)->b_flags &= ~XBF_READ)
-#define XFS_BUF_ISREAD(bp) ((bp)->b_flags & XBF_READ)
-
-#define XFS_BUF_WRITE(bp) ((bp)->b_flags |= XBF_WRITE)
-#define XFS_BUF_UNWRITE(bp) ((bp)->b_flags &= ~XBF_WRITE)
-#define XFS_BUF_ISWRITE(bp) ((bp)->b_flags & XBF_WRITE)
-
/*
* These macros use the IO block map rather than b_bn. b_bn is now really
* just for the buffer cache index for cached buffers. As IO does not use b_bn
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 7e986da34f6c..99e91a0e554e 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -431,7 +431,7 @@ xfs_buf_item_unpin(
if (freed && stale) {
ASSERT(bip->bli_flags & XFS_BLI_STALE);
ASSERT(xfs_buf_islocked(bp));
- ASSERT(XFS_BUF_ISSTALE(bp));
+ ASSERT(bp->b_flags & XBF_STALE);
ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
trace_xfs_buf_item_unpin_stale(bip);
@@ -493,7 +493,7 @@ xfs_buf_item_unpin(
xfs_buf_hold(bp);
bp->b_flags |= XBF_ASYNC;
xfs_buf_ioerror(bp, -EIO);
- XFS_BUF_UNDONE(bp);
+ bp->b_flags &= ~XBF_DONE;
xfs_buf_stale(bp);
xfs_buf_ioend(bp);
}
@@ -1067,7 +1067,7 @@ xfs_buf_iodone_callbacks(
*/
if (XFS_FORCED_SHUTDOWN(mp)) {
xfs_buf_stale(bp);
- XFS_BUF_DONE(bp);
+ bp->b_flags |= XBF_DONE;
trace_xfs_buf_item_iodone(bp, _RET_IP_);
goto do_callbacks;
}
@@ -1090,7 +1090,7 @@ xfs_buf_iodone_callbacks(
* errors tend to affect the whole device and a failing log write
* will make us give up. But we really ought to do better here.
*/
- if (XFS_BUF_ISASYNC(bp)) {
+ if (bp->b_flags & XBF_ASYNC) {
ASSERT(bp->b_iodone != NULL);
trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
@@ -1113,7 +1113,7 @@ xfs_buf_iodone_callbacks(
* sure to return the error to the caller of xfs_bwrite().
*/
xfs_buf_stale(bp);
- XFS_BUF_DONE(bp);
+ bp->b_flags |= XBF_DONE;
trace_xfs_buf_error_relse(bp, _RET_IP_);
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index 642d55d10075..93b3ab0c5435 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -665,7 +665,7 @@ xfs_readdir(
if (XFS_FORCED_SHUTDOWN(dp->i_mount))
return -EIO;
- ASSERT(S_ISDIR(dp->i_d.di_mode));
+ ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
XFS_STATS_INC(dp->i_mount, xs_dir_getdents);
args.dp = dp;
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index e85a9519a5ae..272c3f8b6f7d 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -227,7 +227,7 @@ xfs_discard_extents(
GFP_NOFS, 0);
if (error && error != -EOPNOTSUPP) {
xfs_info(mp,
- "discard failed for extent [0x%llu,%u], error %d",
+ "discard failed for extent [0x%llx,%u], error %d",
(unsigned long long)busyp->bno,
busyp->length,
error);
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 9c44d38dcd1f..316b2a1bdba5 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -92,26 +92,28 @@ xfs_qm_adjust_dqlimits(
{
struct xfs_quotainfo *q = mp->m_quotainfo;
struct xfs_disk_dquot *d = &dq->q_core;
+ struct xfs_def_quota *defq;
int prealloc = 0;
ASSERT(d->d_id);
+ defq = xfs_get_defquota(dq, q);
- if (q->qi_bsoftlimit && !d->d_blk_softlimit) {
- d->d_blk_softlimit = cpu_to_be64(q->qi_bsoftlimit);
+ if (defq->bsoftlimit && !d->d_blk_softlimit) {
+ d->d_blk_softlimit = cpu_to_be64(defq->bsoftlimit);
prealloc = 1;
}
- if (q->qi_bhardlimit && !d->d_blk_hardlimit) {
- d->d_blk_hardlimit = cpu_to_be64(q->qi_bhardlimit);
+ if (defq->bhardlimit && !d->d_blk_hardlimit) {
+ d->d_blk_hardlimit = cpu_to_be64(defq->bhardlimit);
prealloc = 1;
}
- if (q->qi_isoftlimit && !d->d_ino_softlimit)
- d->d_ino_softlimit = cpu_to_be64(q->qi_isoftlimit);
- if (q->qi_ihardlimit && !d->d_ino_hardlimit)
- d->d_ino_hardlimit = cpu_to_be64(q->qi_ihardlimit);
- if (q->qi_rtbsoftlimit && !d->d_rtb_softlimit)
- d->d_rtb_softlimit = cpu_to_be64(q->qi_rtbsoftlimit);
- if (q->qi_rtbhardlimit && !d->d_rtb_hardlimit)
- d->d_rtb_hardlimit = cpu_to_be64(q->qi_rtbhardlimit);
+ if (defq->isoftlimit && !d->d_ino_softlimit)
+ d->d_ino_softlimit = cpu_to_be64(defq->isoftlimit);
+ if (defq->ihardlimit && !d->d_ino_hardlimit)
+ d->d_ino_hardlimit = cpu_to_be64(defq->ihardlimit);
+ if (defq->rtbsoftlimit && !d->d_rtb_softlimit)
+ d->d_rtb_softlimit = cpu_to_be64(defq->rtbsoftlimit);
+ if (defq->rtbhardlimit && !d->d_rtb_hardlimit)
+ d->d_rtb_hardlimit = cpu_to_be64(defq->rtbhardlimit);
if (prealloc)
xfs_dquot_set_prealloc_limits(dq);
@@ -232,7 +234,8 @@ xfs_qm_init_dquot_blk(
{
struct xfs_quotainfo *q = mp->m_quotainfo;
xfs_dqblk_t *d;
- int curid, i;
+ xfs_dqid_t curid;
+ int i;
ASSERT(tp);
ASSERT(xfs_buf_islocked(bp));
@@ -243,7 +246,6 @@ xfs_qm_init_dquot_blk(
* ID of the first dquot in the block - id's are zero based.
*/
curid = id - (id % q->qi_dqperchunk);
- ASSERT(curid >= 0);
memset(d, 0, BBTOB(q->qi_dqchunklen));
for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++) {
d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
@@ -464,12 +466,13 @@ xfs_qm_dqtobp(
struct xfs_bmbt_irec map;
int nmaps = 1, error;
struct xfs_buf *bp;
- struct xfs_inode *quotip = xfs_dq_to_quota_inode(dqp);
+ struct xfs_inode *quotip;
struct xfs_mount *mp = dqp->q_mount;
xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id);
struct xfs_trans *tp = (tpp ? *tpp : NULL);
uint lock_mode;
+ quotip = xfs_quota_inode(dqp->q_mount, dqp->dq_flags);
dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk;
lock_mode = xfs_ilock_data_map_shared(quotip);
@@ -685,6 +688,56 @@ error0:
}
/*
+ * Advance to the next id in the current chunk, or if at the
+ * end of the chunk, skip ahead to first id in next allocated chunk
+ * using the SEEK_DATA interface.
+ */
+int
+xfs_dq_get_next_id(
+ xfs_mount_t *mp,
+ uint type,
+ xfs_dqid_t *id,
+ loff_t eof)
+{
+ struct xfs_inode *quotip;
+ xfs_fsblock_t start;
+ loff_t offset;
+ uint lock;
+ xfs_dqid_t next_id;
+ int error = 0;
+
+ /* Simple advance */
+ next_id = *id + 1;
+
+ /* If new ID is within the current chunk, advancing it sufficed */
+ if (next_id % mp->m_quotainfo->qi_dqperchunk) {
+ *id = next_id;
+ return 0;
+ }
+
+ /* Nope, next_id is now past the current chunk, so find the next one */
+ start = (xfs_fsblock_t)next_id / mp->m_quotainfo->qi_dqperchunk;
+
+ quotip = xfs_quota_inode(mp, type);
+ lock = xfs_ilock_data_map_shared(quotip);
+
+ offset = __xfs_seek_hole_data(VFS_I(quotip), XFS_FSB_TO_B(mp, start),
+ eof, SEEK_DATA);
+ if (offset < 0)
+ error = offset;
+
+ xfs_iunlock(quotip, lock);
+
+ /* -ENXIO is essentially "no more data" */
+ if (error)
+ return (error == -ENXIO ? -ENOENT: error);
+
+ /* Convert next data offset back to a quota id */
+ *id = XFS_B_TO_FSB(mp, offset) * mp->m_quotainfo->qi_dqperchunk;
+ return 0;
+}
+
+/*
* Given the file system, inode OR id, and type (UDQUOT/GDQUOT), return a
* a locked dquot, doing an allocation (if requested) as needed.
* When both an inode and an id are given, the inode's id takes precedence.
@@ -704,6 +757,7 @@ xfs_qm_dqget(
struct xfs_quotainfo *qi = mp->m_quotainfo;
struct radix_tree_root *tree = xfs_dquot_tree(qi, type);
struct xfs_dquot *dqp;
+ loff_t eof = 0;
int error;
ASSERT(XFS_IS_QUOTA_RUNNING(mp));
@@ -731,6 +785,21 @@ xfs_qm_dqget(
}
#endif
+ /* Get the end of the quota file if we need it */
+ if (flags & XFS_QMOPT_DQNEXT) {
+ struct xfs_inode *quotip;
+ xfs_fileoff_t last;
+ uint lock_mode;
+
+ quotip = xfs_quota_inode(mp, type);
+ lock_mode = xfs_ilock_data_map_shared(quotip);
+ error = xfs_bmap_last_offset(quotip, &last, XFS_DATA_FORK);
+ xfs_iunlock(quotip, lock_mode);
+ if (error)
+ return error;
+ eof = XFS_FSB_TO_B(mp, last);
+ }
+
restart:
mutex_lock(&qi->qi_tree_lock);
dqp = radix_tree_lookup(tree, id);
@@ -744,6 +813,18 @@ restart:
goto restart;
}
+ /* uninit / unused quota found in radix tree, keep looking */
+ if (flags & XFS_QMOPT_DQNEXT) {
+ if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) {
+ xfs_dqunlock(dqp);
+ mutex_unlock(&qi->qi_tree_lock);
+ error = xfs_dq_get_next_id(mp, type, &id, eof);
+ if (error)
+ return error;
+ goto restart;
+ }
+ }
+
dqp->q_nrefs++;
mutex_unlock(&qi->qi_tree_lock);
@@ -770,6 +851,13 @@ restart:
if (ip)
xfs_ilock(ip, XFS_ILOCK_EXCL);
+ /* If we are asked to find next active id, keep looking */
+ if (error == -ENOENT && (flags & XFS_QMOPT_DQNEXT)) {
+ error = xfs_dq_get_next_id(mp, type, &id, eof);
+ if (!error)
+ goto restart;
+ }
+
if (error)
return error;
@@ -820,6 +908,17 @@ restart:
qi->qi_dquots++;
mutex_unlock(&qi->qi_tree_lock);
+ /* If we are asked to find next active id, keep looking */
+ if (flags & XFS_QMOPT_DQNEXT) {
+ if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) {
+ xfs_qm_dqput(dqp);
+ error = xfs_dq_get_next_id(mp, type, &id, eof);
+ if (error)
+ return error;
+ goto restart;
+ }
+ }
+
dqret:
ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL));
trace_xfs_dqget_miss(dqp);
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index 652cd3c5b58c..a1b2dd828b9d 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -152,7 +152,7 @@ xfs_nfs_get_inode(
return ERR_PTR(error);
}
- if (ip->i_d.di_gen != generation) {
+ if (VFS_I(ip)->i_generation != generation) {
IRELE(ip);
return ERR_PTR(-ESTALE);
}
@@ -246,7 +246,7 @@ const struct export_operations xfs_export_operations = {
.fh_to_parent = xfs_fs_fh_to_parent,
.get_parent = xfs_fs_get_parent,
.commit_metadata = xfs_fs_nfs_commit_metadata,
-#ifdef CONFIG_NFSD_PNFS
+#ifdef CONFIG_NFSD_BLOCKLAYOUT
.get_uuid = xfs_fs_get_uuid,
.map_blocks = xfs_fs_map_blocks,
.commit_blocks = xfs_fs_commit_blocks,
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 52883ac3cf84..ac0fd32de31e 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -156,9 +156,9 @@ xfs_update_prealloc_flags(
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
if (!(flags & XFS_PREALLOC_INVISIBLE)) {
- ip->i_d.di_mode &= ~S_ISUID;
- if (ip->i_d.di_mode & S_IXGRP)
- ip->i_d.di_mode &= ~S_ISGID;
+ VFS_I(ip)->i_mode &= ~S_ISUID;
+ if (VFS_I(ip)->i_mode & S_IXGRP)
+ VFS_I(ip)->i_mode &= ~S_ISGID;
xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
}
@@ -1337,31 +1337,31 @@ out:
return found;
}
-STATIC loff_t
-xfs_seek_hole_data(
- struct file *file,
+/*
+ * caller must lock inode with xfs_ilock_data_map_shared,
+ * can we craft an appropriate ASSERT?
+ *
+ * end is because the VFS-level lseek interface is defined such that any
+ * offset past i_size shall return -ENXIO, but we use this for quota code
+ * which does not maintain i_size, and we want to SEEK_DATA past i_size.
+ */
+loff_t
+__xfs_seek_hole_data(
+ struct inode *inode,
loff_t start,
+ loff_t end,
int whence)
{
- struct inode *inode = file->f_mapping->host;
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
loff_t uninitialized_var(offset);
- xfs_fsize_t isize;
xfs_fileoff_t fsbno;
- xfs_filblks_t end;
- uint lock;
+ xfs_filblks_t lastbno;
int error;
- if (XFS_FORCED_SHUTDOWN(mp))
- return -EIO;
-
- lock = xfs_ilock_data_map_shared(ip);
-
- isize = i_size_read(inode);
- if (start >= isize) {
+ if (start >= end) {
error = -ENXIO;
- goto out_unlock;
+ goto out_error;
}
/*
@@ -1369,22 +1369,22 @@ xfs_seek_hole_data(
* by fsbno to the end block of the file.
*/
fsbno = XFS_B_TO_FSBT(mp, start);
- end = XFS_B_TO_FSB(mp, isize);
+ lastbno = XFS_B_TO_FSB(mp, end);
for (;;) {
struct xfs_bmbt_irec map[2];
int nmap = 2;
unsigned int i;
- error = xfs_bmapi_read(ip, fsbno, end - fsbno, map, &nmap,
+ error = xfs_bmapi_read(ip, fsbno, lastbno - fsbno, map, &nmap,
XFS_BMAPI_ENTIRE);
if (error)
- goto out_unlock;
+ goto out_error;
/* No extents at given offset, must be beyond EOF */
if (nmap == 0) {
error = -ENXIO;
- goto out_unlock;
+ goto out_error;
}
for (i = 0; i < nmap; i++) {
@@ -1426,7 +1426,7 @@ xfs_seek_hole_data(
* hole at the end of any file).
*/
if (whence == SEEK_HOLE) {
- offset = isize;
+ offset = end;
break;
}
/*
@@ -1434,7 +1434,7 @@ xfs_seek_hole_data(
*/
ASSERT(whence == SEEK_DATA);
error = -ENXIO;
- goto out_unlock;
+ goto out_error;
}
ASSERT(i > 1);
@@ -1445,14 +1445,14 @@ xfs_seek_hole_data(
*/
fsbno = map[i - 1].br_startoff + map[i - 1].br_blockcount;
start = XFS_FSB_TO_B(mp, fsbno);
- if (start >= isize) {
+ if (start >= end) {
if (whence == SEEK_HOLE) {
- offset = isize;
+ offset = end;
break;
}
ASSERT(whence == SEEK_DATA);
error = -ENXIO;
- goto out_unlock;
+ goto out_error;
}
}
@@ -1464,7 +1464,39 @@ out:
* situation in particular.
*/
if (whence == SEEK_HOLE)
- offset = min_t(loff_t, offset, isize);
+ offset = min_t(loff_t, offset, end);
+
+ return offset;
+
+out_error:
+ return error;
+}
+
+STATIC loff_t
+xfs_seek_hole_data(
+ struct file *file,
+ loff_t start,
+ int whence)
+{
+ struct inode *inode = file->f_mapping->host;
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ uint lock;
+ loff_t offset, end;
+ int error = 0;
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -EIO;
+
+ lock = xfs_ilock_data_map_shared(ip);
+
+ end = i_size_read(inode);
+ offset = __xfs_seek_hole_data(inode, start, end, whence);
+ if (offset < 0) {
+ error = offset;
+ goto out_unlock;
+ }
+
offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
out_unlock:
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index c4c130f9bfb6..a51353a1f87f 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -151,7 +151,7 @@ xfs_filestream_pick_ag(
xfs_agnumber_t ag, max_ag = NULLAGNUMBER;
int err, trylock, nscan;
- ASSERT(S_ISDIR(ip->i_d.di_mode));
+ ASSERT(S_ISDIR(VFS_I(ip)->i_mode));
/* 2% of an AG's blocks must be free for it to be chosen. */
minfree = mp->m_sb.sb_agblocks / 50;
@@ -319,7 +319,7 @@ xfs_filestream_lookup_ag(
xfs_agnumber_t startag, ag = NULLAGNUMBER;
struct xfs_mru_cache_elem *mru;
- ASSERT(S_ISREG(ip->i_d.di_mode));
+ ASSERT(S_ISREG(VFS_I(ip)->i_mode));
pip = xfs_filestream_get_parent(ip);
if (!pip)
diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h
index 1b6a98b66886..f32713f14f9a 100644
--- a/fs/xfs/xfs_fsops.h
+++ b/fs/xfs/xfs_fsops.h
@@ -25,6 +25,5 @@ extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt);
extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval,
xfs_fsop_resblks_t *outval);
extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags);
-extern int xfs_fs_log_dummy(struct xfs_mount *mp);
#endif /* __XFS_FSOPS_H__ */
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index d7a490f24ead..bf2d60749278 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -63,6 +63,9 @@ xfs_inode_alloc(
return NULL;
}
+ /* VFS doesn't initialise i_mode! */
+ VFS_I(ip)->i_mode = 0;
+
XFS_STATS_INC(mp, vn_active);
ASSERT(atomic_read(&ip->i_pincount) == 0);
ASSERT(!spin_is_locked(&ip->i_flags_lock));
@@ -79,7 +82,7 @@ xfs_inode_alloc(
memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
ip->i_flags = 0;
ip->i_delayed_blks = 0;
- memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
+ memset(&ip->i_d, 0, sizeof(ip->i_d));
return ip;
}
@@ -98,7 +101,7 @@ void
xfs_inode_free(
struct xfs_inode *ip)
{
- switch (ip->i_d.di_mode & S_IFMT) {
+ switch (VFS_I(ip)->i_mode & S_IFMT) {
case S_IFREG:
case S_IFDIR:
case S_IFLNK:
@@ -135,6 +138,34 @@ xfs_inode_free(
}
/*
+ * When we recycle a reclaimable inode, we need to re-initialise the VFS inode
+ * part of the structure. This is made more complex by the fact we store
+ * information about the on-disk values in the VFS inode and so we can't just
+ * overwrite the values unconditionally. Hence we save the parameters we
+ * need to retain across reinitialisation, and rewrite them into the VFS inode
+ * after reinitialisation even if it fails.
+ */
+static int
+xfs_reinit_inode(
+ struct xfs_mount *mp,
+ struct inode *inode)
+{
+ int error;
+ uint32_t nlink = inode->i_nlink;
+ uint32_t generation = inode->i_generation;
+ uint64_t version = inode->i_version;
+ umode_t mode = inode->i_mode;
+
+ error = inode_init_always(mp->m_super, inode);
+
+ set_nlink(inode, nlink);
+ inode->i_generation = generation;
+ inode->i_version = version;
+ inode->i_mode = mode;
+ return error;
+}
+
+/*
* Check the validity of the inode we just found it the cache
*/
static int
@@ -185,7 +216,7 @@ xfs_iget_cache_hit(
/*
* If lookup is racing with unlink return an error immediately.
*/
- if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
+ if (VFS_I(ip)->i_mode == 0 && !(flags & XFS_IGET_CREATE)) {
error = -ENOENT;
goto out_error;
}
@@ -208,7 +239,7 @@ xfs_iget_cache_hit(
spin_unlock(&ip->i_flags_lock);
rcu_read_unlock();
- error = inode_init_always(mp->m_super, inode);
+ error = xfs_reinit_inode(mp, inode);
if (error) {
/*
* Re-initializing the inode failed, and we are in deep
@@ -295,7 +326,7 @@ xfs_iget_cache_miss(
trace_xfs_iget_miss(ip);
- if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
+ if ((VFS_I(ip)->i_mode == 0) && !(flags & XFS_IGET_CREATE)) {
error = -ENOENT;
goto out_destroy;
}
@@ -444,7 +475,7 @@ again:
* If we have a real type for an on-disk inode, we can setup the inode
* now. If it's a new inode being created, xfs_ialloc will handle it.
*/
- if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0)
+ if (xfs_iflags_test(ip, XFS_INEW) && VFS_I(ip)->i_mode != 0)
xfs_setup_existing_inode(ip);
return 0;
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index ceba1a83cacc..96f606deee31 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -57,9 +57,9 @@ kmem_zone_t *xfs_inode_zone;
*/
#define XFS_ITRUNC_MAX_EXTENTS 2
-STATIC int xfs_iflush_int(xfs_inode_t *, xfs_buf_t *);
-
-STATIC int xfs_iunlink_remove(xfs_trans_t *, xfs_inode_t *);
+STATIC int xfs_iflush_int(struct xfs_inode *, struct xfs_buf *);
+STATIC int xfs_iunlink(struct xfs_trans *, struct xfs_inode *);
+STATIC int xfs_iunlink_remove(struct xfs_trans *, struct xfs_inode *);
/*
* helper function to extract extent size hint from inode
@@ -766,6 +766,7 @@ xfs_ialloc(
uint flags;
int error;
struct timespec tv;
+ struct inode *inode;
/*
* Call the space management code to pick
@@ -791,6 +792,7 @@ xfs_ialloc(
if (error)
return error;
ASSERT(ip != NULL);
+ inode = VFS_I(ip);
/*
* We always convert v1 inodes to v2 now - we only support filesystems
@@ -800,20 +802,16 @@ xfs_ialloc(
if (ip->i_d.di_version == 1)
ip->i_d.di_version = 2;
- ip->i_d.di_mode = mode;
- ip->i_d.di_onlink = 0;
- ip->i_d.di_nlink = nlink;
- ASSERT(ip->i_d.di_nlink == nlink);
+ inode->i_mode = mode;
+ set_nlink(inode, nlink);
ip->i_d.di_uid = xfs_kuid_to_uid(current_fsuid());
ip->i_d.di_gid = xfs_kgid_to_gid(current_fsgid());
xfs_set_projid(ip, prid);
- memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
if (pip && XFS_INHERIT_GID(pip)) {
ip->i_d.di_gid = pip->i_d.di_gid;
- if ((pip->i_d.di_mode & S_ISGID) && S_ISDIR(mode)) {
- ip->i_d.di_mode |= S_ISGID;
- }
+ if ((VFS_I(pip)->i_mode & S_ISGID) && S_ISDIR(mode))
+ inode->i_mode |= S_ISGID;
}
/*
@@ -822,38 +820,29 @@ xfs_ialloc(
* (and only if the irix_sgid_inherit compatibility variable is set).
*/
if ((irix_sgid_inherit) &&
- (ip->i_d.di_mode & S_ISGID) &&
- (!in_group_p(xfs_gid_to_kgid(ip->i_d.di_gid)))) {
- ip->i_d.di_mode &= ~S_ISGID;
- }
+ (inode->i_mode & S_ISGID) &&
+ (!in_group_p(xfs_gid_to_kgid(ip->i_d.di_gid))))
+ inode->i_mode &= ~S_ISGID;
ip->i_d.di_size = 0;
ip->i_d.di_nextents = 0;
ASSERT(ip->i_d.di_nblocks == 0);
tv = current_fs_time(mp->m_super);
- ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec;
- ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec;
- ip->i_d.di_atime = ip->i_d.di_mtime;
- ip->i_d.di_ctime = ip->i_d.di_mtime;
+ inode->i_mtime = tv;
+ inode->i_atime = tv;
+ inode->i_ctime = tv;
- /*
- * di_gen will have been taken care of in xfs_iread.
- */
ip->i_d.di_extsize = 0;
ip->i_d.di_dmevmask = 0;
ip->i_d.di_dmstate = 0;
ip->i_d.di_flags = 0;
if (ip->i_d.di_version == 3) {
- ASSERT(ip->i_d.di_ino == ino);
- ASSERT(uuid_equal(&ip->i_d.di_uuid, &mp->m_sb.sb_meta_uuid));
- ip->i_d.di_crc = 0;
- ip->i_d.di_changecount = 1;
- ip->i_d.di_lsn = 0;
+ inode->i_version = 1;
ip->i_d.di_flags2 = 0;
- memset(&(ip->i_d.di_pad2[0]), 0, sizeof(ip->i_d.di_pad2));
- ip->i_d.di_crtime = ip->i_d.di_mtime;
+ ip->i_d.di_crtime.t_sec = (__int32_t)tv.tv_sec;
+ ip->i_d.di_crtime.t_nsec = (__int32_t)tv.tv_nsec;
}
@@ -1092,35 +1081,24 @@ xfs_dir_ialloc(
}
/*
- * Decrement the link count on an inode & log the change.
- * If this causes the link count to go to zero, initiate the
- * logging activity required to truncate a file.
+ * Decrement the link count on an inode & log the change. If this causes the
+ * link count to go to zero, move the inode to AGI unlinked list so that it can
+ * be freed when the last active reference goes away via xfs_inactive().
*/
int /* error */
xfs_droplink(
xfs_trans_t *tp,
xfs_inode_t *ip)
{
- int error;
-
xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
- ASSERT (ip->i_d.di_nlink > 0);
- ip->i_d.di_nlink--;
drop_nlink(VFS_I(ip));
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- error = 0;
- if (ip->i_d.di_nlink == 0) {
- /*
- * We're dropping the last link to this file.
- * Move the on-disk inode to the AGI unlinked list.
- * From xfs_inactive() we will pull the inode from
- * the list and free it.
- */
- error = xfs_iunlink(tp, ip);
- }
- return error;
+ if (VFS_I(ip)->i_nlink)
+ return 0;
+
+ return xfs_iunlink(tp, ip);
}
/*
@@ -1134,8 +1112,6 @@ xfs_bumplink(
xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
ASSERT(ip->i_d.di_version > 1);
- ASSERT(ip->i_d.di_nlink > 0 || (VFS_I(ip)->i_state & I_LINKABLE));
- ip->i_d.di_nlink++;
inc_nlink(VFS_I(ip));
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
return 0;
@@ -1393,7 +1369,6 @@ xfs_create_tmpfile(
*/
xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
- ip->i_d.di_nlink--;
error = xfs_iunlink(tp, ip);
if (error)
goto out_trans_cancel;
@@ -1444,7 +1419,7 @@ xfs_link(
trace_xfs_link(tdp, target_name);
- ASSERT(!S_ISDIR(sip->i_d.di_mode));
+ ASSERT(!S_ISDIR(VFS_I(sip)->i_mode));
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
@@ -1492,7 +1467,10 @@ xfs_link(
xfs_bmap_init(&free_list, &first_block);
- if (sip->i_d.di_nlink == 0) {
+ /*
+ * Handle initial link state of O_TMPFILE inode
+ */
+ if (VFS_I(sip)->i_nlink == 0) {
error = xfs_iunlink_remove(tp, sip);
if (error)
goto error_return;
@@ -1648,7 +1626,7 @@ xfs_release(
xfs_mount_t *mp = ip->i_mount;
int error;
- if (!S_ISREG(ip->i_d.di_mode) || (ip->i_d.di_mode == 0))
+ if (!S_ISREG(VFS_I(ip)->i_mode) || (VFS_I(ip)->i_mode == 0))
return 0;
/* If this is a read-only mount, don't do this (would generate I/O) */
@@ -1679,7 +1657,7 @@ xfs_release(
}
}
- if (ip->i_d.di_nlink == 0)
+ if (VFS_I(ip)->i_nlink == 0)
return 0;
if (xfs_can_free_eofblocks(ip, false)) {
@@ -1883,7 +1861,7 @@ xfs_inactive(
* If the inode is already free, then there can be nothing
* to clean up here.
*/
- if (ip->i_d.di_mode == 0) {
+ if (VFS_I(ip)->i_mode == 0) {
ASSERT(ip->i_df.if_real_bytes == 0);
ASSERT(ip->i_df.if_broot_bytes == 0);
return;
@@ -1895,7 +1873,7 @@ xfs_inactive(
if (mp->m_flags & XFS_MOUNT_RDONLY)
return;
- if (ip->i_d.di_nlink != 0) {
+ if (VFS_I(ip)->i_nlink != 0) {
/*
* force is true because we are evicting an inode from the
* cache. Post-eof blocks must be freed, lest we end up with
@@ -1907,7 +1885,7 @@ xfs_inactive(
return;
}
- if (S_ISREG(ip->i_d.di_mode) &&
+ if (S_ISREG(VFS_I(ip)->i_mode) &&
(ip->i_d.di_size != 0 || XFS_ISIZE(ip) != 0 ||
ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0))
truncate = 1;
@@ -1916,7 +1894,7 @@ xfs_inactive(
if (error)
return;
- if (S_ISLNK(ip->i_d.di_mode))
+ if (S_ISLNK(VFS_I(ip)->i_mode))
error = xfs_inactive_symlink(ip);
else if (truncate)
error = xfs_inactive_truncate(ip);
@@ -1952,16 +1930,21 @@ xfs_inactive(
}
/*
- * This is called when the inode's link count goes to 0.
- * We place the on-disk inode on a list in the AGI. It
- * will be pulled from this list when the inode is freed.
+ * This is called when the inode's link count goes to 0 or we are creating a
+ * tmpfile via O_TMPFILE. In the case of a tmpfile, @ignore_linkcount will be
+ * set to true as the link count is dropped to zero by the VFS after we've
+ * created the file successfully, so we have to add it to the unlinked list
+ * while the link count is non-zero.
+ *
+ * We place the on-disk inode on a list in the AGI. It will be pulled from this
+ * list when the inode is freed.
*/
-int
+STATIC int
xfs_iunlink(
- xfs_trans_t *tp,
- xfs_inode_t *ip)
+ struct xfs_trans *tp,
+ struct xfs_inode *ip)
{
- xfs_mount_t *mp;
+ xfs_mount_t *mp = tp->t_mountp;
xfs_agi_t *agi;
xfs_dinode_t *dip;
xfs_buf_t *agibp;
@@ -1971,10 +1954,7 @@ xfs_iunlink(
int offset;
int error;
- ASSERT(ip->i_d.di_nlink == 0);
- ASSERT(ip->i_d.di_mode != 0);
-
- mp = tp->t_mountp;
+ ASSERT(VFS_I(ip)->i_mode != 0);
/*
* Get the agi buffer first. It ensures lock ordering
@@ -2412,10 +2392,10 @@ xfs_ifree(
struct xfs_icluster xic = { 0 };
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- ASSERT(ip->i_d.di_nlink == 0);
+ ASSERT(VFS_I(ip)->i_nlink == 0);
ASSERT(ip->i_d.di_nextents == 0);
ASSERT(ip->i_d.di_anextents == 0);
- ASSERT(ip->i_d.di_size == 0 || !S_ISREG(ip->i_d.di_mode));
+ ASSERT(ip->i_d.di_size == 0 || !S_ISREG(VFS_I(ip)->i_mode));
ASSERT(ip->i_d.di_nblocks == 0);
/*
@@ -2429,7 +2409,7 @@ xfs_ifree(
if (error)
return error;
- ip->i_d.di_mode = 0; /* mark incore inode as free */
+ VFS_I(ip)->i_mode = 0; /* mark incore inode as free */
ip->i_d.di_flags = 0;
ip->i_d.di_dmevmask = 0;
ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */
@@ -2439,7 +2419,7 @@ xfs_ifree(
* Bump the generation count so no one will be confused
* by reincarnations of this inode.
*/
- ip->i_d.di_gen++;
+ VFS_I(ip)->i_generation++;
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
if (xic.deleted)
@@ -2526,7 +2506,7 @@ xfs_remove(
{
xfs_mount_t *mp = dp->i_mount;
xfs_trans_t *tp = NULL;
- int is_dir = S_ISDIR(ip->i_d.di_mode);
+ int is_dir = S_ISDIR(VFS_I(ip)->i_mode);
int error = 0;
xfs_bmap_free_t free_list;
xfs_fsblock_t first_block;
@@ -2580,8 +2560,8 @@ xfs_remove(
* If we're removing a directory perform some additional validation.
*/
if (is_dir) {
- ASSERT(ip->i_d.di_nlink >= 2);
- if (ip->i_d.di_nlink != 2) {
+ ASSERT(VFS_I(ip)->i_nlink >= 2);
+ if (VFS_I(ip)->i_nlink != 2) {
error = -ENOTEMPTY;
goto out_trans_cancel;
}
@@ -2771,7 +2751,7 @@ xfs_cross_rename(
if (dp1 != dp2) {
dp2_flags = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
- if (S_ISDIR(ip2->i_d.di_mode)) {
+ if (S_ISDIR(VFS_I(ip2)->i_mode)) {
error = xfs_dir_replace(tp, ip2, &xfs_name_dotdot,
dp1->i_ino, first_block,
free_list, spaceres);
@@ -2779,7 +2759,7 @@ xfs_cross_rename(
goto out_trans_abort;
/* transfer ip2 ".." reference to dp1 */
- if (!S_ISDIR(ip1->i_d.di_mode)) {
+ if (!S_ISDIR(VFS_I(ip1)->i_mode)) {
error = xfs_droplink(tp, dp2);
if (error)
goto out_trans_abort;
@@ -2798,7 +2778,7 @@ xfs_cross_rename(
ip2_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
}
- if (S_ISDIR(ip1->i_d.di_mode)) {
+ if (S_ISDIR(VFS_I(ip1)->i_mode)) {
error = xfs_dir_replace(tp, ip1, &xfs_name_dotdot,
dp2->i_ino, first_block,
free_list, spaceres);
@@ -2806,7 +2786,7 @@ xfs_cross_rename(
goto out_trans_abort;
/* transfer ip1 ".." reference to dp2 */
- if (!S_ISDIR(ip2->i_d.di_mode)) {
+ if (!S_ISDIR(VFS_I(ip2)->i_mode)) {
error = xfs_droplink(tp, dp1);
if (error)
goto out_trans_abort;
@@ -2903,7 +2883,7 @@ xfs_rename(
struct xfs_inode *inodes[__XFS_SORT_INODES];
int num_inodes = __XFS_SORT_INODES;
bool new_parent = (src_dp != target_dp);
- bool src_is_directory = S_ISDIR(src_ip->i_d.di_mode);
+ bool src_is_directory = S_ISDIR(VFS_I(src_ip)->i_mode);
int spaceres;
int error;
@@ -3032,12 +3012,12 @@ xfs_rename(
* target and source are directories and that target can be
* destroyed, or that neither is a directory.
*/
- if (S_ISDIR(target_ip->i_d.di_mode)) {
+ if (S_ISDIR(VFS_I(target_ip)->i_mode)) {
/*
* Make sure target dir is empty.
*/
if (!(xfs_dir_isempty(target_ip)) ||
- (target_ip->i_d.di_nlink > 2)) {
+ (VFS_I(target_ip)->i_nlink > 2)) {
error = -EEXIST;
goto out_trans_cancel;
}
@@ -3144,7 +3124,7 @@ xfs_rename(
* intermediate state on disk.
*/
if (wip) {
- ASSERT(VFS_I(wip)->i_nlink == 0 && wip->i_d.di_nlink == 0);
+ ASSERT(VFS_I(wip)->i_nlink == 0);
error = xfs_bumplink(tp, wip);
if (error)
goto out_bmap_cancel;
@@ -3313,7 +3293,7 @@ cluster_corrupt_out:
* mark it as stale and brelse.
*/
if (bp->b_iodone) {
- XFS_BUF_UNDONE(bp);
+ bp->b_flags &= ~XBF_DONE;
xfs_buf_stale(bp);
xfs_buf_ioerror(bp, -EIO);
xfs_buf_ioend(bp);
@@ -3462,14 +3442,7 @@ xfs_iflush_int(
__func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip);
goto corrupt_out;
}
- if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC,
- mp, XFS_ERRTAG_IFLUSH_2, XFS_RANDOM_IFLUSH_2)) {
- xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
- "%s: Bad inode %Lu, ptr 0x%p, magic number 0x%x",
- __func__, ip->i_ino, ip, ip->i_d.di_magic);
- goto corrupt_out;
- }
- if (S_ISREG(ip->i_d.di_mode)) {
+ if (S_ISREG(VFS_I(ip)->i_mode)) {
if (XFS_TEST_ERROR(
(ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
(ip->i_d.di_format != XFS_DINODE_FMT_BTREE),
@@ -3479,7 +3452,7 @@ xfs_iflush_int(
__func__, ip->i_ino, ip);
goto corrupt_out;
}
- } else if (S_ISDIR(ip->i_d.di_mode)) {
+ } else if (S_ISDIR(VFS_I(ip)->i_mode)) {
if (XFS_TEST_ERROR(
(ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
(ip->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
@@ -3523,12 +3496,11 @@ xfs_iflush_int(
ip->i_d.di_flushiter++;
/*
- * Copy the dirty parts of the inode into the on-disk
- * inode. We always copy out the core of the inode,
- * because if the inode is dirty at all the core must
- * be.
+ * Copy the dirty parts of the inode into the on-disk inode. We always
+ * copy out the core of the inode, because if the inode is dirty at all
+ * the core must be.
*/
- xfs_dinode_to_disk(dip, &ip->i_d);
+ xfs_inode_to_disk(ip, dip, iip->ili_item.li_lsn);
/* Wrap, we never let the log put out DI_MAX_FLUSH */
if (ip->i_d.di_flushiter == DI_MAX_FLUSH)
@@ -3580,10 +3552,6 @@ xfs_iflush_int(
*/
xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item);
- /* update the lsn in the on disk inode if required */
- if (ip->i_d.di_version == 3)
- dip->di_lsn = cpu_to_be64(iip->ili_item.li_lsn);
-
/* generate the checksum. */
xfs_dinode_calc_crc(mp, dip);
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index ca9e11989cbd..43e1d51b15eb 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -63,7 +63,7 @@ typedef struct xfs_inode {
unsigned long i_flags; /* see defined flags below */
unsigned int i_delayed_blks; /* count of delay alloc blks */
- xfs_icdinode_t i_d; /* most of ondisk inode */
+ struct xfs_icdinode i_d; /* most of ondisk inode */
/* VFS inode */
struct inode i_vnode; /* embedded VFS inode */
@@ -88,7 +88,7 @@ static inline struct inode *VFS_I(struct xfs_inode *ip)
*/
static inline xfs_fsize_t XFS_ISIZE(struct xfs_inode *ip)
{
- if (S_ISREG(ip->i_d.di_mode))
+ if (S_ISREG(VFS_I(ip)->i_mode))
return i_size_read(VFS_I(ip));
return ip->i_d.di_size;
}
@@ -369,7 +369,7 @@ static inline int xfs_isiflocked(struct xfs_inode *ip)
*/
#define XFS_INHERIT_GID(pip) \
(((pip)->i_mount->m_flags & XFS_MOUNT_GRPID) || \
- ((pip)->i_d.di_mode & S_ISGID))
+ (VFS_I(pip)->i_mode & S_ISGID))
int xfs_release(struct xfs_inode *ip);
void xfs_inactive(struct xfs_inode *ip);
@@ -405,8 +405,6 @@ int xfs_ifree(struct xfs_trans *, xfs_inode_t *,
struct xfs_bmap_free *);
int xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *,
int, xfs_fsize_t);
-int xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
-
void xfs_iext_realloc(xfs_inode_t *, int, int);
void xfs_iunpin_wait(xfs_inode_t *);
@@ -437,6 +435,8 @@ int xfs_update_prealloc_flags(struct xfs_inode *ip,
int xfs_zero_eof(struct xfs_inode *ip, xfs_off_t offset,
xfs_fsize_t isize, bool *did_zeroing);
int xfs_iozero(struct xfs_inode *ip, loff_t pos, size_t count);
+loff_t __xfs_seek_hole_data(struct inode *inode, loff_t start,
+ loff_t eof, int whence);
/* from xfs_iops.c */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index d14b12b8cfef..c48b5b18d771 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -135,7 +135,7 @@ xfs_inode_item_size(
*nvecs += 2;
*nbytes += sizeof(struct xfs_inode_log_format) +
- xfs_icdinode_size(ip->i_d.di_version);
+ xfs_log_dinode_size(ip->i_d.di_version);
xfs_inode_item_data_fork_size(iip, nvecs, nbytes);
if (XFS_IFORK_Q(ip))
@@ -322,6 +322,81 @@ xfs_inode_item_format_attr_fork(
}
}
+static void
+xfs_inode_to_log_dinode(
+ struct xfs_inode *ip,
+ struct xfs_log_dinode *to,
+ xfs_lsn_t lsn)
+{
+ struct xfs_icdinode *from = &ip->i_d;
+ struct inode *inode = VFS_I(ip);
+
+ to->di_magic = XFS_DINODE_MAGIC;
+
+ to->di_version = from->di_version;
+ to->di_format = from->di_format;
+ to->di_uid = from->di_uid;
+ to->di_gid = from->di_gid;
+ to->di_projid_lo = from->di_projid_lo;
+ to->di_projid_hi = from->di_projid_hi;
+
+ memset(to->di_pad, 0, sizeof(to->di_pad));
+ memset(to->di_pad3, 0, sizeof(to->di_pad3));
+ to->di_atime.t_sec = inode->i_atime.tv_sec;
+ to->di_atime.t_nsec = inode->i_atime.tv_nsec;
+ to->di_mtime.t_sec = inode->i_mtime.tv_sec;
+ to->di_mtime.t_nsec = inode->i_mtime.tv_nsec;
+ to->di_ctime.t_sec = inode->i_ctime.tv_sec;
+ to->di_ctime.t_nsec = inode->i_ctime.tv_nsec;
+ to->di_nlink = inode->i_nlink;
+ to->di_gen = inode->i_generation;
+ to->di_mode = inode->i_mode;
+
+ to->di_size = from->di_size;
+ to->di_nblocks = from->di_nblocks;
+ to->di_extsize = from->di_extsize;
+ to->di_nextents = from->di_nextents;
+ to->di_anextents = from->di_anextents;
+ to->di_forkoff = from->di_forkoff;
+ to->di_aformat = from->di_aformat;
+ to->di_dmevmask = from->di_dmevmask;
+ to->di_dmstate = from->di_dmstate;
+ to->di_flags = from->di_flags;
+
+ if (from->di_version == 3) {
+ to->di_changecount = inode->i_version;
+ to->di_crtime.t_sec = from->di_crtime.t_sec;
+ to->di_crtime.t_nsec = from->di_crtime.t_nsec;
+ to->di_flags2 = from->di_flags2;
+
+ to->di_ino = ip->i_ino;
+ to->di_lsn = lsn;
+ memset(to->di_pad2, 0, sizeof(to->di_pad2));
+ uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid);
+ to->di_flushiter = 0;
+ } else {
+ to->di_flushiter = from->di_flushiter;
+ }
+}
+
+/*
+ * Format the inode core. Current timestamp data is only in the VFS inode
+ * fields, so we need to grab them from there. Hence rather than just copying
+ * the XFS inode core structure, format the fields directly into the iovec.
+ */
+static void
+xfs_inode_item_format_core(
+ struct xfs_inode *ip,
+ struct xfs_log_vec *lv,
+ struct xfs_log_iovec **vecp)
+{
+ struct xfs_log_dinode *dic;
+
+ dic = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_ICORE);
+ xfs_inode_to_log_dinode(ip, dic, ip->i_itemp->ili_item.li_lsn);
+ xlog_finish_iovec(lv, *vecp, xfs_log_dinode_size(ip->i_d.di_version));
+}
+
/*
* This is called to fill in the vector of log iovecs for the given inode
* log item. It fills the first item with an inode log format structure,
@@ -351,10 +426,7 @@ xfs_inode_item_format(
ilf->ilf_size = 2; /* format + core */
xlog_finish_iovec(lv, vecp, sizeof(struct xfs_inode_log_format));
- xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ICORE,
- &ip->i_d,
- xfs_icdinode_size(ip->i_d.di_version));
-
+ xfs_inode_item_format_core(ip, lv, &vecp);
xfs_inode_item_format_data_fork(iip, ilf, lv, &vecp);
if (XFS_IFORK_Q(ip)) {
xfs_inode_item_format_attr_fork(iip, ilf, lv, &vecp);
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 478d04e07f95..bcb6c19ce3ea 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -114,7 +114,7 @@ xfs_find_handle(
handle.ha_fid.fid_len = sizeof(xfs_fid_t) -
sizeof(handle.ha_fid.fid_len);
handle.ha_fid.fid_pad = 0;
- handle.ha_fid.fid_gen = ip->i_d.di_gen;
+ handle.ha_fid.fid_gen = inode->i_generation;
handle.ha_fid.fid_ino = ip->i_ino;
hsize = XFS_HSIZE(handle);
@@ -963,7 +963,7 @@ xfs_set_diflags(
di_flags |= XFS_DIFLAG_NODEFRAG;
if (xflags & FS_XFLAG_FILESTREAM)
di_flags |= XFS_DIFLAG_FILESTREAM;
- if (S_ISDIR(ip->i_d.di_mode)) {
+ if (S_ISDIR(VFS_I(ip)->i_mode)) {
if (xflags & FS_XFLAG_RTINHERIT)
di_flags |= XFS_DIFLAG_RTINHERIT;
if (xflags & FS_XFLAG_NOSYMLINKS)
@@ -972,7 +972,7 @@ xfs_set_diflags(
di_flags |= XFS_DIFLAG_EXTSZINHERIT;
if (xflags & FS_XFLAG_PROJINHERIT)
di_flags |= XFS_DIFLAG_PROJINHERIT;
- } else if (S_ISREG(ip->i_d.di_mode)) {
+ } else if (S_ISREG(VFS_I(ip)->i_mode)) {
if (xflags & FS_XFLAG_REALTIME)
di_flags |= XFS_DIFLAG_REALTIME;
if (xflags & FS_XFLAG_EXTSIZE)
@@ -1060,23 +1060,86 @@ xfs_ioctl_setattr_xflags(
}
/*
+ * If we are changing DAX flags, we have to ensure the file is clean and any
+ * cached objects in the address space are invalidated and removed. This
+ * requires us to lock out other IO and page faults similar to a truncate
+ * operation. The locks need to be held until the transaction has been committed
+ * so that the cache invalidation is atomic with respect to the DAX flag
+ * manipulation.
+ */
+static int
+xfs_ioctl_setattr_dax_invalidate(
+ struct xfs_inode *ip,
+ struct fsxattr *fa,
+ int *join_flags)
+{
+ struct inode *inode = VFS_I(ip);
+ int error;
+
+ *join_flags = 0;
+
+ /*
+ * It is only valid to set the DAX flag on regular files and
+ * directories on filesystems where the block size is equal to the page
+ * size. On directories it serves as an inherit hint.
+ */
+ if (fa->fsx_xflags & FS_XFLAG_DAX) {
+ if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)))
+ return -EINVAL;
+ if (ip->i_mount->m_sb.sb_blocksize != PAGE_SIZE)
+ return -EINVAL;
+ }
+
+ /* If the DAX state is not changing, we have nothing to do here. */
+ if ((fa->fsx_xflags & FS_XFLAG_DAX) && IS_DAX(inode))
+ return 0;
+ if (!(fa->fsx_xflags & FS_XFLAG_DAX) && !IS_DAX(inode))
+ return 0;
+
+ /* lock, flush and invalidate mapping in preparation for flag change */
+ xfs_ilock(ip, XFS_MMAPLOCK_EXCL | XFS_IOLOCK_EXCL);
+ error = filemap_write_and_wait(inode->i_mapping);
+ if (error)
+ goto out_unlock;
+ error = invalidate_inode_pages2(inode->i_mapping);
+ if (error)
+ goto out_unlock;
+
+ *join_flags = XFS_MMAPLOCK_EXCL | XFS_IOLOCK_EXCL;
+ return 0;
+
+out_unlock:
+ xfs_iunlock(ip, XFS_MMAPLOCK_EXCL | XFS_IOLOCK_EXCL);
+ return error;
+
+}
+
+/*
* Set up the transaction structure for the setattr operation, checking that we
* have permission to do so. On success, return a clean transaction and the
* inode locked exclusively ready for further operation specific checks. On
* failure, return an error without modifying or locking the inode.
+ *
+ * The inode might already be IO locked on call. If this is the case, it is
+ * indicated in @join_flags and we take full responsibility for ensuring they
+ * are unlocked from now on. Hence if we have an error here, we still have to
+ * unlock them. Otherwise, once they are joined to the transaction, they will
+ * be unlocked on commit/cancel.
*/
static struct xfs_trans *
xfs_ioctl_setattr_get_trans(
- struct xfs_inode *ip)
+ struct xfs_inode *ip,
+ int join_flags)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_trans *tp;
- int error;
+ int error = -EROFS;
if (mp->m_flags & XFS_MOUNT_RDONLY)
- return ERR_PTR(-EROFS);
+ goto out_unlock;
+ error = -EIO;
if (XFS_FORCED_SHUTDOWN(mp))
- return ERR_PTR(-EIO);
+ goto out_unlock;
tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
@@ -1084,7 +1147,8 @@ xfs_ioctl_setattr_get_trans(
goto out_cancel;
xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | join_flags);
+ join_flags = 0;
/*
* CAP_FOWNER overrides the following restrictions:
@@ -1104,6 +1168,9 @@ xfs_ioctl_setattr_get_trans(
out_cancel:
xfs_trans_cancel(tp);
+out_unlock:
+ if (join_flags)
+ xfs_iunlock(ip, join_flags);
return ERR_PTR(error);
}
@@ -1128,14 +1195,14 @@ xfs_ioctl_setattr_check_extsize(
{
struct xfs_mount *mp = ip->i_mount;
- if ((fa->fsx_xflags & FS_XFLAG_EXTSIZE) && !S_ISREG(ip->i_d.di_mode))
+ if ((fa->fsx_xflags & FS_XFLAG_EXTSIZE) && !S_ISREG(VFS_I(ip)->i_mode))
return -EINVAL;
if ((fa->fsx_xflags & FS_XFLAG_EXTSZINHERIT) &&
- !S_ISDIR(ip->i_d.di_mode))
+ !S_ISDIR(VFS_I(ip)->i_mode))
return -EINVAL;
- if (S_ISREG(ip->i_d.di_mode) && ip->i_d.di_nextents &&
+ if (S_ISREG(VFS_I(ip)->i_mode) && ip->i_d.di_nextents &&
((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) != fa->fsx_extsize))
return -EINVAL;
@@ -1202,6 +1269,7 @@ xfs_ioctl_setattr(
struct xfs_dquot *pdqp = NULL;
struct xfs_dquot *olddquot = NULL;
int code;
+ int join_flags = 0;
trace_xfs_ioctl_setattr(ip);
@@ -1225,7 +1293,18 @@ xfs_ioctl_setattr(
return code;
}
- tp = xfs_ioctl_setattr_get_trans(ip);
+ /*
+ * Changing DAX config may require inode locking for mapping
+ * invalidation. These need to be held all the way to transaction commit
+ * or cancel time, so need to be passed through to
+ * xfs_ioctl_setattr_get_trans() so it can apply them to the join call
+ * appropriately.
+ */
+ code = xfs_ioctl_setattr_dax_invalidate(ip, fa, &join_flags);
+ if (code)
+ goto error_free_dquots;
+
+ tp = xfs_ioctl_setattr_get_trans(ip, join_flags);
if (IS_ERR(tp)) {
code = PTR_ERR(tp);
goto error_free_dquots;
@@ -1256,9 +1335,9 @@ xfs_ioctl_setattr(
* successful return from chown()
*/
- if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) &&
+ if ((VFS_I(ip)->i_mode & (S_ISUID|S_ISGID)) &&
!capable_wrt_inode_uidgid(VFS_I(ip), CAP_FSETID))
- ip->i_d.di_mode &= ~(S_ISUID|S_ISGID);
+ VFS_I(ip)->i_mode &= ~(S_ISUID|S_ISGID);
/* Change the ownerships and register project quota modifications */
if (xfs_get_projid(ip) != fa->fsx_projid) {
@@ -1341,6 +1420,7 @@ xfs_ioc_setxflags(
struct xfs_trans *tp;
struct fsxattr fa;
unsigned int flags;
+ int join_flags = 0;
int error;
if (copy_from_user(&flags, arg, sizeof(flags)))
@@ -1357,7 +1437,18 @@ xfs_ioc_setxflags(
if (error)
return error;
- tp = xfs_ioctl_setattr_get_trans(ip);
+ /*
+ * Changing DAX config may require inode locking for mapping
+ * invalidation. These need to be held all the way to transaction commit
+ * or cancel time, so need to be passed through to
+ * xfs_ioctl_setattr_get_trans() so it can apply them to the join call
+ * appropriately.
+ */
+ error = xfs_ioctl_setattr_dax_invalidate(ip, &fa, &join_flags);
+ if (error)
+ goto out_drop_write;
+
+ tp = xfs_ioctl_setattr_get_trans(ip, join_flags);
if (IS_ERR(tp)) {
error = PTR_ERR(tp);
goto out_drop_write;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 76b71a1c6c32..fb7dc61f4a29 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -459,8 +459,8 @@ xfs_vn_getattr(
stat->size = XFS_ISIZE(ip);
stat->dev = inode->i_sb->s_dev;
- stat->mode = ip->i_d.di_mode;
- stat->nlink = ip->i_d.di_nlink;
+ stat->mode = inode->i_mode;
+ stat->nlink = inode->i_nlink;
stat->uid = inode->i_uid;
stat->gid = inode->i_gid;
stat->ino = ip->i_ino;
@@ -506,9 +506,6 @@ xfs_setattr_mode(
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- ip->i_d.di_mode &= S_IFMT;
- ip->i_d.di_mode |= mode & ~S_IFMT;
-
inode->i_mode &= S_IFMT;
inode->i_mode |= mode & ~S_IFMT;
}
@@ -522,21 +519,12 @@ xfs_setattr_time(
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- if (iattr->ia_valid & ATTR_ATIME) {
+ if (iattr->ia_valid & ATTR_ATIME)
inode->i_atime = iattr->ia_atime;
- ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec;
- ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec;
- }
- if (iattr->ia_valid & ATTR_CTIME) {
+ if (iattr->ia_valid & ATTR_CTIME)
inode->i_ctime = iattr->ia_ctime;
- ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec;
- ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec;
- }
- if (iattr->ia_valid & ATTR_MTIME) {
+ if (iattr->ia_valid & ATTR_MTIME)
inode->i_mtime = iattr->ia_mtime;
- ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec;
- ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec;
- }
}
int
@@ -661,9 +649,9 @@ xfs_setattr_nonsize(
* The set-user-ID and set-group-ID bits of a file will be
* cleared upon successful return from chown()
*/
- if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) &&
+ if ((inode->i_mode & (S_ISUID|S_ISGID)) &&
!capable(CAP_FSETID))
- ip->i_d.di_mode &= ~(S_ISUID|S_ISGID);
+ inode->i_mode &= ~(S_ISUID|S_ISGID);
/*
* Change the ownerships and register quota modifications
@@ -773,7 +761,7 @@ xfs_setattr_size(
ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));
- ASSERT(S_ISREG(ip->i_d.di_mode));
+ ASSERT(S_ISREG(inode->i_mode));
ASSERT((iattr->ia_valid & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET|
ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0);
@@ -991,21 +979,13 @@ xfs_vn_update_time(
}
xfs_ilock(ip, XFS_ILOCK_EXCL);
- if (flags & S_CTIME) {
+ if (flags & S_CTIME)
inode->i_ctime = *now;
- ip->i_d.di_ctime.t_sec = (__int32_t)now->tv_sec;
- ip->i_d.di_ctime.t_nsec = (__int32_t)now->tv_nsec;
- }
- if (flags & S_MTIME) {
+ if (flags & S_MTIME)
inode->i_mtime = *now;
- ip->i_d.di_mtime.t_sec = (__int32_t)now->tv_sec;
- ip->i_d.di_mtime.t_nsec = (__int32_t)now->tv_nsec;
- }
- if (flags & S_ATIME) {
+ if (flags & S_ATIME)
inode->i_atime = *now;
- ip->i_d.di_atime.t_sec = (__int32_t)now->tv_sec;
- ip->i_d.di_atime.t_nsec = (__int32_t)now->tv_nsec;
- }
+
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
xfs_trans_log_inode(tp, ip, XFS_ILOG_TIMESTAMP);
return xfs_trans_commit(tp);
@@ -1205,8 +1185,10 @@ xfs_diflags_to_iflags(
inode->i_flags |= S_SYNC;
if (flags & XFS_DIFLAG_NOATIME)
inode->i_flags |= S_NOATIME;
- if (ip->i_mount->m_flags & XFS_MOUNT_DAX ||
- ip->i_d.di_flags2 & XFS_DIFLAG2_DAX)
+ if (S_ISREG(inode->i_mode) &&
+ ip->i_mount->m_sb.sb_blocksize == PAGE_SIZE &&
+ (ip->i_mount->m_flags & XFS_MOUNT_DAX ||
+ ip->i_d.di_flags2 & XFS_DIFLAG2_DAX))
inode->i_flags |= S_DAX;
}
@@ -1232,8 +1214,6 @@ xfs_setup_inode(
/* make the inode look hashed for the writeback code */
hlist_add_fake(&inode->i_hash);
- inode->i_mode = ip->i_d.di_mode;
- set_nlink(inode, ip->i_d.di_nlink);
inode->i_uid = xfs_uid_to_kuid(ip->i_d.di_uid);
inode->i_gid = xfs_gid_to_kgid(ip->i_d.di_gid);
@@ -1249,14 +1229,7 @@ xfs_setup_inode(
break;
}
- inode->i_generation = ip->i_d.di_gen;
i_size_write(inode, ip->i_d.di_size);
- inode->i_atime.tv_sec = ip->i_d.di_atime.t_sec;
- inode->i_atime.tv_nsec = ip->i_d.di_atime.t_nsec;
- inode->i_mtime.tv_sec = ip->i_d.di_mtime.t_sec;
- inode->i_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec;
- inode->i_ctime.tv_sec = ip->i_d.di_ctime.t_sec;
- inode->i_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec;
xfs_diflags_to_iflags(inode, ip);
ip->d_ops = ip->i_mount->m_nondir_inode_ops;
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 930ebd86beba..ce73eb34620d 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -57,6 +57,7 @@ xfs_bulkstat_one_int(
{
struct xfs_icdinode *dic; /* dinode core info pointer */
struct xfs_inode *ip; /* incore inode pointer */
+ struct inode *inode;
struct xfs_bstat *buf; /* return buffer */
int error = 0; /* error value */
@@ -77,30 +78,33 @@ xfs_bulkstat_one_int(
ASSERT(ip != NULL);
ASSERT(ip->i_imap.im_blkno != 0);
+ inode = VFS_I(ip);
dic = &ip->i_d;
/* xfs_iget returns the following without needing
* further change.
*/
- buf->bs_nlink = dic->di_nlink;
buf->bs_projid_lo = dic->di_projid_lo;
buf->bs_projid_hi = dic->di_projid_hi;
buf->bs_ino = ino;
- buf->bs_mode = dic->di_mode;
buf->bs_uid = dic->di_uid;
buf->bs_gid = dic->di_gid;
buf->bs_size = dic->di_size;
- buf->bs_atime.tv_sec = dic->di_atime.t_sec;
- buf->bs_atime.tv_nsec = dic->di_atime.t_nsec;
- buf->bs_mtime.tv_sec = dic->di_mtime.t_sec;
- buf->bs_mtime.tv_nsec = dic->di_mtime.t_nsec;
- buf->bs_ctime.tv_sec = dic->di_ctime.t_sec;
- buf->bs_ctime.tv_nsec = dic->di_ctime.t_nsec;
+
+ buf->bs_nlink = inode->i_nlink;
+ buf->bs_atime.tv_sec = inode->i_atime.tv_sec;
+ buf->bs_atime.tv_nsec = inode->i_atime.tv_nsec;
+ buf->bs_mtime.tv_sec = inode->i_mtime.tv_sec;
+ buf->bs_mtime.tv_nsec = inode->i_mtime.tv_nsec;
+ buf->bs_ctime.tv_sec = inode->i_ctime.tv_sec;
+ buf->bs_ctime.tv_nsec = inode->i_ctime.tv_nsec;
+ buf->bs_gen = inode->i_generation;
+ buf->bs_mode = inode->i_mode;
+
buf->bs_xflags = xfs_ip2xflags(ip);
buf->bs_extsize = dic->di_extsize << mp->m_sb.sb_blocklog;
buf->bs_extents = dic->di_nextents;
- buf->bs_gen = dic->di_gen;
memset(buf->bs_pad, 0, sizeof(buf->bs_pad));
buf->bs_dmevmask = dic->di_dmevmask;
buf->bs_dmstate = dic->di_dmstate;
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 9c9a1c9bcc7f..b49ccf5c1d75 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1212,7 +1212,7 @@ xlog_iodone(xfs_buf_t *bp)
}
/* log I/O is always issued ASYNC */
- ASSERT(XFS_BUF_ISASYNC(bp));
+ ASSERT(bp->b_flags & XBF_ASYNC);
xlog_state_done_syncing(iclog, aborted);
/*
@@ -1864,9 +1864,8 @@ xlog_sync(
bp->b_io_length = BTOBB(count);
bp->b_fspriv = iclog;
- XFS_BUF_ZEROFLAGS(bp);
- XFS_BUF_ASYNC(bp);
- bp->b_flags |= XBF_SYNCIO;
+ bp->b_flags &= ~(XBF_FUA | XBF_FLUSH);
+ bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE);
if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) {
bp->b_flags |= XBF_FUA;
@@ -1893,12 +1892,11 @@ xlog_sync(
/* account for log which doesn't start at block #0 */
XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart);
+
/*
* Don't call xfs_bwrite here. We do log-syncs even when the filesystem
* is shutting down.
*/
- XFS_BUF_WRITE(bp);
-
error = xlog_bdstrat(bp);
if (error) {
xfs_buf_ioerror_alert(bp, "xlog_sync");
@@ -1910,9 +1908,8 @@ xlog_sync(
xfs_buf_associate_memory(bp,
(char *)&iclog->ic_header + count, split);
bp->b_fspriv = iclog;
- XFS_BUF_ZEROFLAGS(bp);
- XFS_BUF_ASYNC(bp);
- bp->b_flags |= XBF_SYNCIO;
+ bp->b_flags &= ~(XBF_FUA | XBF_FLUSH);
+ bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE);
if (log->l_mp->m_flags & XFS_MOUNT_BARRIER)
bp->b_flags |= XBF_FUA;
@@ -1921,7 +1918,6 @@ xlog_sync(
/* account for internal log which doesn't start at block #0 */
XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart);
- XFS_BUF_WRITE(bp);
error = xlog_bdstrat(bp);
if (error) {
xfs_buf_ioerror_alert(bp, "xlog_sync (split)");
@@ -2012,77 +2008,81 @@ xlog_print_tic_res(
uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t);
/* match with XLOG_REG_TYPE_* in xfs_log.h */
- static char *res_type_str[XLOG_REG_TYPE_MAX] = {
- "bformat",
- "bchunk",
- "efi_format",
- "efd_format",
- "iformat",
- "icore",
- "iext",
- "ibroot",
- "ilocal",
- "iattr_ext",
- "iattr_broot",
- "iattr_local",
- "qformat",
- "dquot",
- "quotaoff",
- "LR header",
- "unmount",
- "commit",
- "trans header"
+#define REG_TYPE_STR(type, str) [XLOG_REG_TYPE_##type] = str
+ static char *res_type_str[XLOG_REG_TYPE_MAX + 1] = {
+ REG_TYPE_STR(BFORMAT, "bformat"),
+ REG_TYPE_STR(BCHUNK, "bchunk"),
+ REG_TYPE_STR(EFI_FORMAT, "efi_format"),
+ REG_TYPE_STR(EFD_FORMAT, "efd_format"),
+ REG_TYPE_STR(IFORMAT, "iformat"),
+ REG_TYPE_STR(ICORE, "icore"),
+ REG_TYPE_STR(IEXT, "iext"),
+ REG_TYPE_STR(IBROOT, "ibroot"),
+ REG_TYPE_STR(ILOCAL, "ilocal"),
+ REG_TYPE_STR(IATTR_EXT, "iattr_ext"),
+ REG_TYPE_STR(IATTR_BROOT, "iattr_broot"),
+ REG_TYPE_STR(IATTR_LOCAL, "iattr_local"),
+ REG_TYPE_STR(QFORMAT, "qformat"),
+ REG_TYPE_STR(DQUOT, "dquot"),
+ REG_TYPE_STR(QUOTAOFF, "quotaoff"),
+ REG_TYPE_STR(LRHEADER, "LR header"),
+ REG_TYPE_STR(UNMOUNT, "unmount"),
+ REG_TYPE_STR(COMMIT, "commit"),
+ REG_TYPE_STR(TRANSHDR, "trans header"),
+ REG_TYPE_STR(ICREATE, "inode create")
};
+#undef REG_TYPE_STR
+#define TRANS_TYPE_STR(type) [XFS_TRANS_##type] = #type
static char *trans_type_str[XFS_TRANS_TYPE_MAX] = {
- "SETATTR_NOT_SIZE",
- "SETATTR_SIZE",
- "INACTIVE",
- "CREATE",
- "CREATE_TRUNC",
- "TRUNCATE_FILE",
- "REMOVE",
- "LINK",
- "RENAME",
- "MKDIR",
- "RMDIR",
- "SYMLINK",
- "SET_DMATTRS",
- "GROWFS",
- "STRAT_WRITE",
- "DIOSTRAT",
- "WRITE_SYNC",
- "WRITEID",
- "ADDAFORK",
- "ATTRINVAL",
- "ATRUNCATE",
- "ATTR_SET",
- "ATTR_RM",
- "ATTR_FLAG",
- "CLEAR_AGI_BUCKET",
- "QM_SBCHANGE",
- "DUMMY1",
- "DUMMY2",
- "QM_QUOTAOFF",
- "QM_DQALLOC",
- "QM_SETQLIM",
- "QM_DQCLUSTER",
- "QM_QINOCREATE",
- "QM_QUOTAOFF_END",
- "FSYNC_TS",
- "GROWFSRT_ALLOC",
- "GROWFSRT_ZERO",
- "GROWFSRT_FREE",
- "SWAPEXT",
- "CHECKPOINT",
- "ICREATE",
- "CREATE_TMPFILE"
+ TRANS_TYPE_STR(SETATTR_NOT_SIZE),
+ TRANS_TYPE_STR(SETATTR_SIZE),
+ TRANS_TYPE_STR(INACTIVE),
+ TRANS_TYPE_STR(CREATE),
+ TRANS_TYPE_STR(CREATE_TRUNC),
+ TRANS_TYPE_STR(TRUNCATE_FILE),
+ TRANS_TYPE_STR(REMOVE),
+ TRANS_TYPE_STR(LINK),
+ TRANS_TYPE_STR(RENAME),
+ TRANS_TYPE_STR(MKDIR),
+ TRANS_TYPE_STR(RMDIR),
+ TRANS_TYPE_STR(SYMLINK),
+ TRANS_TYPE_STR(SET_DMATTRS),
+ TRANS_TYPE_STR(GROWFS),
+ TRANS_TYPE_STR(STRAT_WRITE),
+ TRANS_TYPE_STR(DIOSTRAT),
+ TRANS_TYPE_STR(WRITEID),
+ TRANS_TYPE_STR(ADDAFORK),
+ TRANS_TYPE_STR(ATTRINVAL),
+ TRANS_TYPE_STR(ATRUNCATE),
+ TRANS_TYPE_STR(ATTR_SET),
+ TRANS_TYPE_STR(ATTR_RM),
+ TRANS_TYPE_STR(ATTR_FLAG),
+ TRANS_TYPE_STR(CLEAR_AGI_BUCKET),
+ TRANS_TYPE_STR(SB_CHANGE),
+ TRANS_TYPE_STR(DUMMY1),
+ TRANS_TYPE_STR(DUMMY2),
+ TRANS_TYPE_STR(QM_QUOTAOFF),
+ TRANS_TYPE_STR(QM_DQALLOC),
+ TRANS_TYPE_STR(QM_SETQLIM),
+ TRANS_TYPE_STR(QM_DQCLUSTER),
+ TRANS_TYPE_STR(QM_QINOCREATE),
+ TRANS_TYPE_STR(QM_QUOTAOFF_END),
+ TRANS_TYPE_STR(FSYNC_TS),
+ TRANS_TYPE_STR(GROWFSRT_ALLOC),
+ TRANS_TYPE_STR(GROWFSRT_ZERO),
+ TRANS_TYPE_STR(GROWFSRT_FREE),
+ TRANS_TYPE_STR(SWAPEXT),
+ TRANS_TYPE_STR(CHECKPOINT),
+ TRANS_TYPE_STR(ICREATE),
+ TRANS_TYPE_STR(CREATE_TMPFILE)
};
+#undef TRANS_TYPE_STR
xfs_warn(mp, "xlog_write: reservation summary:");
xfs_warn(mp, " trans type = %s (%u)",
((ticket->t_trans_type <= 0 ||
ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ?
- "bad-trans-type" : trans_type_str[ticket->t_trans_type-1]),
+ "bad-trans-type" : trans_type_str[ticket->t_trans_type]),
ticket->t_trans_type);
xfs_warn(mp, " unit res = %d bytes",
ticket->t_unit_res);
@@ -2101,7 +2101,7 @@ xlog_print_tic_res(
uint r_type = ticket->t_res_arr[i].r_type;
xfs_warn(mp, "region[%u]: %s - %u bytes", i,
((r_type <= 0 || r_type > XLOG_REG_TYPE_MAX) ?
- "bad-rtype" : res_type_str[r_type-1]),
+ "bad-rtype" : res_type_str[r_type]),
ticket->t_res_arr[i].r_len);
}
@@ -3979,7 +3979,7 @@ xfs_log_force_umount(
log->l_flags & XLOG_ACTIVE_RECOVERY) {
mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
if (mp->m_sb_bp)
- XFS_BUF_DONE(mp->m_sb_bp);
+ mp->m_sb_bp->b_flags |= XBF_DONE;
return 0;
}
@@ -4009,7 +4009,7 @@ xfs_log_force_umount(
spin_lock(&log->l_icloglock);
mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
if (mp->m_sb_bp)
- XFS_BUF_DONE(mp->m_sb_bp);
+ mp->m_sb_bp->b_flags |= XBF_DONE;
/*
* Mark the log and the iclogs with IO error flags to prevent any
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 594f7e63b432..396565f43247 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -190,7 +190,7 @@ xlog_bread_noalign(
ASSERT(nbblks <= bp->b_length);
XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
- XFS_BUF_READ(bp);
+ bp->b_flags |= XBF_READ;
bp->b_io_length = nbblks;
bp->b_error = 0;
@@ -275,7 +275,6 @@ xlog_bwrite(
ASSERT(nbblks <= bp->b_length);
XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
- XFS_BUF_ZEROFLAGS(bp);
xfs_buf_hold(bp);
xfs_buf_lock(bp);
bp->b_io_length = nbblks;
@@ -1109,27 +1108,10 @@ xlog_verify_head(
bool tmp_wrapped;
/*
- * Search backwards through the log looking for the log record header
- * block. This wraps all the way back around to the head so something is
- * seriously wrong if we can't find it.
- */
- found = xlog_rseek_logrec_hdr(log, *head_blk, *head_blk, 1, bp, rhead_blk,
- rhead, wrapped);
- if (found < 0)
- return found;
- if (!found) {
- xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
- return -EIO;
- }
-
- *tail_blk = BLOCK_LSN(be64_to_cpu((*rhead)->h_tail_lsn));
-
- /*
- * Now that we have a tail block, check the head of the log for torn
- * writes. Search again until we hit the tail or the maximum number of
- * log record I/Os that could have been in flight at one time. Use a
- * temporary buffer so we don't trash the rhead/bp pointer from the
- * call above.
+ * Check the head of the log for torn writes. Search backwards from the
+ * head until we hit the tail or the maximum number of log record I/Os
+ * that could have been in flight at one time. Use a temporary buffer so
+ * we don't trash the rhead/bp pointers from the caller.
*/
tmp_bp = xlog_get_bp(log, 1);
if (!tmp_bp)
@@ -1216,6 +1198,115 @@ xlog_verify_head(
}
/*
+ * Check whether the head of the log points to an unmount record. In other
+ * words, determine whether the log is clean. If so, update the in-core state
+ * appropriately.
+ */
+static int
+xlog_check_unmount_rec(
+ struct xlog *log,
+ xfs_daddr_t *head_blk,
+ xfs_daddr_t *tail_blk,
+ struct xlog_rec_header *rhead,
+ xfs_daddr_t rhead_blk,
+ struct xfs_buf *bp,
+ bool *clean)
+{
+ struct xlog_op_header *op_head;
+ xfs_daddr_t umount_data_blk;
+ xfs_daddr_t after_umount_blk;
+ int hblks;
+ int error;
+ char *offset;
+
+ *clean = false;
+
+ /*
+ * Look for unmount record. If we find it, then we know there was a
+ * clean unmount. Since 'i' could be the last block in the physical
+ * log, we convert to a log block before comparing to the head_blk.
+ *
+ * Save the current tail lsn to use to pass to xlog_clear_stale_blocks()
+ * below. We won't want to clear the unmount record if there is one, so
+ * we pass the lsn of the unmount record rather than the block after it.
+ */
+ if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
+ int h_size = be32_to_cpu(rhead->h_size);
+ int h_version = be32_to_cpu(rhead->h_version);
+
+ if ((h_version & XLOG_VERSION_2) &&
+ (h_size > XLOG_HEADER_CYCLE_SIZE)) {
+ hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
+ if (h_size % XLOG_HEADER_CYCLE_SIZE)
+ hblks++;
+ } else {
+ hblks = 1;
+ }
+ } else {
+ hblks = 1;
+ }
+ after_umount_blk = rhead_blk + hblks + BTOBB(be32_to_cpu(rhead->h_len));
+ after_umount_blk = do_mod(after_umount_blk, log->l_logBBsize);
+ if (*head_blk == after_umount_blk &&
+ be32_to_cpu(rhead->h_num_logops) == 1) {
+ umount_data_blk = rhead_blk + hblks;
+ umount_data_blk = do_mod(umount_data_blk, log->l_logBBsize);
+ error = xlog_bread(log, umount_data_blk, 1, bp, &offset);
+ if (error)
+ return error;
+
+ op_head = (struct xlog_op_header *)offset;
+ if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
+ /*
+ * Set tail and last sync so that newly written log
+ * records will point recovery to after the current
+ * unmount record.
+ */
+ xlog_assign_atomic_lsn(&log->l_tail_lsn,
+ log->l_curr_cycle, after_umount_blk);
+ xlog_assign_atomic_lsn(&log->l_last_sync_lsn,
+ log->l_curr_cycle, after_umount_blk);
+ *tail_blk = after_umount_blk;
+
+ *clean = true;
+ }
+ }
+
+ return 0;
+}
+
+static void
+xlog_set_state(
+ struct xlog *log,
+ xfs_daddr_t head_blk,
+ struct xlog_rec_header *rhead,
+ xfs_daddr_t rhead_blk,
+ bool bump_cycle)
+{
+ /*
+ * Reset log values according to the state of the log when we
+ * crashed. In the case where head_blk == 0, we bump curr_cycle
+ * one because the next write starts a new cycle rather than
+ * continuing the cycle of the last good log record. At this
+ * point we have guaranteed that all partial log records have been
+ * accounted for. Therefore, we know that the last good log record
+ * written was complete and ended exactly on the end boundary
+ * of the physical log.
+ */
+ log->l_prev_block = rhead_blk;
+ log->l_curr_block = (int)head_blk;
+ log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
+ if (bump_cycle)
+ log->l_curr_cycle++;
+ atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
+ atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
+ xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle,
+ BBTOB(log->l_curr_block));
+ xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle,
+ BBTOB(log->l_curr_block));
+}
+
+/*
* Find the sync block number or the tail of the log.
*
* This will be the block number of the last record to have its
@@ -1238,22 +1329,20 @@ xlog_find_tail(
xfs_daddr_t *tail_blk)
{
xlog_rec_header_t *rhead;
- xlog_op_header_t *op_head;
char *offset = NULL;
xfs_buf_t *bp;
int error;
- xfs_daddr_t umount_data_blk;
- xfs_daddr_t after_umount_blk;
xfs_daddr_t rhead_blk;
xfs_lsn_t tail_lsn;
- int hblks;
bool wrapped = false;
+ bool clean = false;
/*
* Find previous log record
*/
if ((error = xlog_find_head(log, head_blk)))
return error;
+ ASSERT(*head_blk < INT_MAX);
bp = xlog_get_bp(log, 1);
if (!bp)
@@ -1271,100 +1360,75 @@ xlog_find_tail(
}
/*
- * Trim the head block back to skip over torn records. We can have
- * multiple log I/Os in flight at any time, so we assume CRC failures
- * back through the previous several records are torn writes and skip
- * them.
+ * Search backwards through the log looking for the log record header
+ * block. This wraps all the way back around to the head so something is
+ * seriously wrong if we can't find it.
*/
- ASSERT(*head_blk < INT_MAX);
- error = xlog_verify_head(log, head_blk, tail_blk, bp, &rhead_blk,
- &rhead, &wrapped);
- if (error)
- goto done;
+ error = xlog_rseek_logrec_hdr(log, *head_blk, *head_blk, 1, bp,
+ &rhead_blk, &rhead, &wrapped);
+ if (error < 0)
+ return error;
+ if (!error) {
+ xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
+ return -EIO;
+ }
+ *tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn));
/*
- * Reset log values according to the state of the log when we
- * crashed. In the case where head_blk == 0, we bump curr_cycle
- * one because the next write starts a new cycle rather than
- * continuing the cycle of the last good log record. At this
- * point we have guaranteed that all partial log records have been
- * accounted for. Therefore, we know that the last good log record
- * written was complete and ended exactly on the end boundary
- * of the physical log.
+ * Set the log state based on the current head record.
*/
- log->l_prev_block = rhead_blk;
- log->l_curr_block = (int)*head_blk;
- log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
- if (wrapped)
- log->l_curr_cycle++;
- atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
- atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
- xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle,
- BBTOB(log->l_curr_block));
- xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle,
- BBTOB(log->l_curr_block));
+ xlog_set_state(log, *head_blk, rhead, rhead_blk, wrapped);
+ tail_lsn = atomic64_read(&log->l_tail_lsn);
/*
- * Look for unmount record. If we find it, then we know there
- * was a clean unmount. Since 'i' could be the last block in
- * the physical log, we convert to a log block before comparing
- * to the head_blk.
+ * Look for an unmount record at the head of the log. This sets the log
+ * state to determine whether recovery is necessary.
+ */
+ error = xlog_check_unmount_rec(log, head_blk, tail_blk, rhead,
+ rhead_blk, bp, &clean);
+ if (error)
+ goto done;
+
+ /*
+ * Verify the log head if the log is not clean (e.g., we have anything
+ * but an unmount record at the head). This uses CRC verification to
+ * detect and trim torn writes. If discovered, CRC failures are
+ * considered torn writes and the log head is trimmed accordingly.
*
- * Save the current tail lsn to use to pass to
- * xlog_clear_stale_blocks() below. We won't want to clear the
- * unmount record if there is one, so we pass the lsn of the
- * unmount record rather than the block after it.
+ * Note that we can only run CRC verification when the log is dirty
+ * because there's no guarantee that the log data behind an unmount
+ * record is compatible with the current architecture.
*/
- if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
- int h_size = be32_to_cpu(rhead->h_size);
- int h_version = be32_to_cpu(rhead->h_version);
+ if (!clean) {
+ xfs_daddr_t orig_head = *head_blk;
- if ((h_version & XLOG_VERSION_2) &&
- (h_size > XLOG_HEADER_CYCLE_SIZE)) {
- hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
- if (h_size % XLOG_HEADER_CYCLE_SIZE)
- hblks++;
- } else {
- hblks = 1;
- }
- } else {
- hblks = 1;
- }
- after_umount_blk = rhead_blk + hblks + BTOBB(be32_to_cpu(rhead->h_len));
- after_umount_blk = do_mod(after_umount_blk, log->l_logBBsize);
- tail_lsn = atomic64_read(&log->l_tail_lsn);
- if (*head_blk == after_umount_blk &&
- be32_to_cpu(rhead->h_num_logops) == 1) {
- umount_data_blk = rhead_blk + hblks;
- umount_data_blk = do_mod(umount_data_blk, log->l_logBBsize);
- error = xlog_bread(log, umount_data_blk, 1, bp, &offset);
+ error = xlog_verify_head(log, head_blk, tail_blk, bp,
+ &rhead_blk, &rhead, &wrapped);
if (error)
goto done;
- op_head = (xlog_op_header_t *)offset;
- if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
- /*
- * Set tail and last sync so that newly written
- * log records will point recovery to after the
- * current unmount record.
- */
- xlog_assign_atomic_lsn(&log->l_tail_lsn,
- log->l_curr_cycle, after_umount_blk);
- xlog_assign_atomic_lsn(&log->l_last_sync_lsn,
- log->l_curr_cycle, after_umount_blk);
- *tail_blk = after_umount_blk;
-
- /*
- * Note that the unmount was clean. If the unmount
- * was not clean, we need to know this to rebuild the
- * superblock counters from the perag headers if we
- * have a filesystem using non-persistent counters.
- */
- log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN;
+ /* update in-core state again if the head changed */
+ if (*head_blk != orig_head) {
+ xlog_set_state(log, *head_blk, rhead, rhead_blk,
+ wrapped);
+ tail_lsn = atomic64_read(&log->l_tail_lsn);
+ error = xlog_check_unmount_rec(log, head_blk, tail_blk,
+ rhead, rhead_blk, bp,
+ &clean);
+ if (error)
+ goto done;
}
}
/*
+ * Note that the unmount was clean. If the unmount was not clean, we
+ * need to know this to rebuild the superblock counters from the perag
+ * headers if we have a filesystem using non-persistent counters.
+ */
+ if (clean)
+ log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN;
+
+ /*
* Make sure that there are no blocks in front of the head
* with the same cycle number as the head. This can happen
* because we allow multiple outstanding log writes concurrently,
@@ -2473,6 +2537,13 @@ xlog_recover_validate_buf_type(
}
bp->b_ops = &xfs_sb_buf_ops;
break;
+#ifdef CONFIG_XFS_RT
+ case XFS_BLFT_RTBITMAP_BUF:
+ case XFS_BLFT_RTSUMMARY_BUF:
+ /* no magic numbers for verification of RT buffers */
+ bp->b_ops = &xfs_rtbuf_ops;
+ break;
+#endif /* CONFIG_XFS_RT */
default:
xfs_warn(mp, "Unknown buffer type %d!",
xfs_blft_from_flags(buf_f));
@@ -2793,7 +2864,7 @@ xfs_recover_inode_owner_change(
return -ENOMEM;
/* instantiate the inode */
- xfs_dinode_from_disk(&ip->i_d, dip);
+ xfs_inode_from_disk(ip, dip);
ASSERT(ip->i_d.di_version >= 3);
error = xfs_iformat_fork(ip, dip);
@@ -2839,7 +2910,7 @@ xlog_recover_inode_pass2(
int error;
int attr_index;
uint fields;
- xfs_icdinode_t *dicp;
+ struct xfs_log_dinode *ldip;
uint isize;
int need_free = 0;
@@ -2892,8 +2963,8 @@ xlog_recover_inode_pass2(
error = -EFSCORRUPTED;
goto out_release;
}
- dicp = item->ri_buf[1].i_addr;
- if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) {
+ ldip = item->ri_buf[1].i_addr;
+ if (unlikely(ldip->di_magic != XFS_DINODE_MAGIC)) {
xfs_alert(mp,
"%s: Bad inode log record, rec ptr 0x%p, ino %Ld",
__func__, item, in_f->ilf_ino);
@@ -2929,13 +3000,13 @@ xlog_recover_inode_pass2(
* to skip replay when the on disk inode is newer than the log one
*/
if (!xfs_sb_version_hascrc(&mp->m_sb) &&
- dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
+ ldip->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
/*
* Deal with the wrap case, DI_MAX_FLUSH is less
* than smaller numbers
*/
if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH &&
- dicp->di_flushiter < (DI_MAX_FLUSH >> 1)) {
+ ldip->di_flushiter < (DI_MAX_FLUSH >> 1)) {
/* do nothing */
} else {
trace_xfs_log_recover_inode_skip(log, in_f);
@@ -2945,13 +3016,13 @@ xlog_recover_inode_pass2(
}
/* Take the opportunity to reset the flush iteration count */
- dicp->di_flushiter = 0;
+ ldip->di_flushiter = 0;
- if (unlikely(S_ISREG(dicp->di_mode))) {
- if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
- (dicp->di_format != XFS_DINODE_FMT_BTREE)) {
+ if (unlikely(S_ISREG(ldip->di_mode))) {
+ if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) &&
+ (ldip->di_format != XFS_DINODE_FMT_BTREE)) {
XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)",
- XFS_ERRLEVEL_LOW, mp, dicp);
+ XFS_ERRLEVEL_LOW, mp, ldip);
xfs_alert(mp,
"%s: Bad regular inode log record, rec ptr 0x%p, "
"ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
@@ -2959,12 +3030,12 @@ xlog_recover_inode_pass2(
error = -EFSCORRUPTED;
goto out_release;
}
- } else if (unlikely(S_ISDIR(dicp->di_mode))) {
- if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
- (dicp->di_format != XFS_DINODE_FMT_BTREE) &&
- (dicp->di_format != XFS_DINODE_FMT_LOCAL)) {
+ } else if (unlikely(S_ISDIR(ldip->di_mode))) {
+ if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) &&
+ (ldip->di_format != XFS_DINODE_FMT_BTREE) &&
+ (ldip->di_format != XFS_DINODE_FMT_LOCAL)) {
XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)",
- XFS_ERRLEVEL_LOW, mp, dicp);
+ XFS_ERRLEVEL_LOW, mp, ldip);
xfs_alert(mp,
"%s: Bad dir inode log record, rec ptr 0x%p, "
"ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
@@ -2973,32 +3044,32 @@ xlog_recover_inode_pass2(
goto out_release;
}
}
- if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){
+ if (unlikely(ldip->di_nextents + ldip->di_anextents > ldip->di_nblocks)){
XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)",
- XFS_ERRLEVEL_LOW, mp, dicp);
+ XFS_ERRLEVEL_LOW, mp, ldip);
xfs_alert(mp,
"%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
"dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld",
__func__, item, dip, bp, in_f->ilf_ino,
- dicp->di_nextents + dicp->di_anextents,
- dicp->di_nblocks);
+ ldip->di_nextents + ldip->di_anextents,
+ ldip->di_nblocks);
error = -EFSCORRUPTED;
goto out_release;
}
- if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) {
+ if (unlikely(ldip->di_forkoff > mp->m_sb.sb_inodesize)) {
XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)",
- XFS_ERRLEVEL_LOW, mp, dicp);
+ XFS_ERRLEVEL_LOW, mp, ldip);
xfs_alert(mp,
"%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
"dino bp 0x%p, ino %Ld, forkoff 0x%x", __func__,
- item, dip, bp, in_f->ilf_ino, dicp->di_forkoff);
+ item, dip, bp, in_f->ilf_ino, ldip->di_forkoff);
error = -EFSCORRUPTED;
goto out_release;
}
- isize = xfs_icdinode_size(dicp->di_version);
+ isize = xfs_log_dinode_size(ldip->di_version);
if (unlikely(item->ri_buf[1].i_len > isize)) {
XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)",
- XFS_ERRLEVEL_LOW, mp, dicp);
+ XFS_ERRLEVEL_LOW, mp, ldip);
xfs_alert(mp,
"%s: Bad inode log record length %d, rec ptr 0x%p",
__func__, item->ri_buf[1].i_len, item);
@@ -3006,8 +3077,8 @@ xlog_recover_inode_pass2(
goto out_release;
}
- /* The core is in in-core format */
- xfs_dinode_to_disk(dip, dicp);
+ /* recover the log dinode inode into the on disk inode */
+ xfs_log_dinode_to_disk(ldip, dip);
/* the rest is in on-disk format */
if (item->ri_buf[1].i_len > isize) {
@@ -4337,8 +4408,8 @@ xlog_recover_process_one_iunlink(
if (error)
goto fail_iput;
- ASSERT(ip->i_d.di_nlink == 0);
- ASSERT(ip->i_d.di_mode != 0);
+ ASSERT(VFS_I(ip)->i_nlink == 0);
+ ASSERT(VFS_I(ip)->i_mode != 0);
/* setup for the next pass */
agino = be32_to_cpu(dip->di_next_unlinked);
@@ -4892,6 +4963,7 @@ xlog_do_recover(
xfs_daddr_t head_blk,
xfs_daddr_t tail_blk)
{
+ struct xfs_mount *mp = log->l_mp;
int error;
xfs_buf_t *bp;
xfs_sb_t *sbp;
@@ -4906,7 +4978,7 @@ xlog_do_recover(
/*
* If IO errors happened during recovery, bail out.
*/
- if (XFS_FORCED_SHUTDOWN(log->l_mp)) {
+ if (XFS_FORCED_SHUTDOWN(mp)) {
return -EIO;
}
@@ -4919,22 +4991,21 @@ xlog_do_recover(
* or iunlinks they will have some entries in the AIL; so we look at
* the AIL to determine how to set the tail_lsn.
*/
- xlog_assign_tail_lsn(log->l_mp);
+ xlog_assign_tail_lsn(mp);
/*
* Now that we've finished replaying all buffer and inode
* updates, re-read in the superblock and reverify it.
*/
- bp = xfs_getsb(log->l_mp, 0);
- XFS_BUF_UNDONE(bp);
- ASSERT(!(XFS_BUF_ISWRITE(bp)));
- XFS_BUF_READ(bp);
- XFS_BUF_UNASYNC(bp);
+ bp = xfs_getsb(mp, 0);
+ bp->b_flags &= ~(XBF_DONE | XBF_ASYNC);
+ ASSERT(!(bp->b_flags & XBF_WRITE));
+ bp->b_flags |= XBF_READ;
bp->b_ops = &xfs_sb_buf_ops;
error = xfs_buf_submit_wait(bp);
if (error) {
- if (!XFS_FORCED_SHUTDOWN(log->l_mp)) {
+ if (!XFS_FORCED_SHUTDOWN(mp)) {
xfs_buf_ioerror_alert(bp, __func__);
ASSERT(0);
}
@@ -4943,14 +5014,17 @@ xlog_do_recover(
}
/* Convert superblock from on-disk format */
- sbp = &log->l_mp->m_sb;
+ sbp = &mp->m_sb;
xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
- ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC);
- ASSERT(xfs_sb_good_version(sbp));
- xfs_reinit_percpu_counters(log->l_mp);
-
xfs_buf_relse(bp);
+ /* re-initialise in-core superblock and geometry structures */
+ xfs_reinit_percpu_counters(mp);
+ error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi);
+ if (error) {
+ xfs_warn(mp, "Failed post-recovery per-ag init: %d", error);
+ return error;
+ }
xlog_recover_check_summary(log);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index bb753b359bee..536a0ee9cd5a 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -185,9 +185,6 @@ xfs_initialize_perag(
xfs_agnumber_t index;
xfs_agnumber_t first_initialised = 0;
xfs_perag_t *pag;
- xfs_agino_t agino;
- xfs_ino_t ino;
- xfs_sb_t *sbp = &mp->m_sb;
int error = -ENOMEM;
/*
@@ -230,22 +227,7 @@ xfs_initialize_perag(
radix_tree_preload_end();
}
- /*
- * If we mount with the inode64 option, or no inode overflows
- * the legacy 32-bit address space clear the inode32 option.
- */
- agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0);
- ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino);
-
- if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) && ino > XFS_MAXINUMBER_32)
- mp->m_flags |= XFS_MOUNT_32BITINODES;
- else
- mp->m_flags &= ~XFS_MOUNT_32BITINODES;
-
- if (mp->m_flags & XFS_MOUNT_32BITINODES)
- index = xfs_set_inode32(mp, agcount);
- else
- index = xfs_set_inode64(mp, agcount);
+ index = xfs_set_inode_alloc(mp, agcount);
if (maxagi)
*maxagi = index;
@@ -865,7 +847,7 @@ xfs_mountfs(
ASSERT(rip != NULL);
- if (unlikely(!S_ISDIR(rip->i_d.di_mode))) {
+ if (unlikely(!S_ISDIR(VFS_I(rip)->i_mode))) {
xfs_warn(mp, "corrupted root inode %llu: not a directory",
(unsigned long long)rip->i_ino);
xfs_iunlock(rip, XFS_ILOCK_EXCL);
@@ -1284,7 +1266,7 @@ xfs_getsb(
}
xfs_buf_hold(bp);
- ASSERT(XFS_BUF_ISDONE(bp));
+ ASSERT(bp->b_flags & XBF_DONE);
return bp;
}
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index b57098481c10..bac6b3435591 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -147,6 +147,17 @@ typedef struct xfs_mount {
* to various other kinds of pain inflicted on the pNFS server.
*/
__uint32_t m_generation;
+
+#ifdef DEBUG
+ /*
+ * DEBUG mode instrumentation to test and/or trigger delayed allocation
+ * block killing in the event of failed writes. When enabled, all
+ * buffered writes are forced to fail. All delalloc blocks in the range
+ * of the write (including pre-existing delalloc blocks!) are tossed as
+ * part of the write failure error handling sequence.
+ */
+ bool m_fail_writes;
+#endif
} xfs_mount_t;
/*
@@ -166,9 +177,8 @@ typedef struct xfs_mount {
#define XFS_MOUNT_GRPID (1ULL << 9) /* group-ID assigned from directory */
#define XFS_MOUNT_NORECOVERY (1ULL << 10) /* no recovery - dirty fs */
#define XFS_MOUNT_DFLT_IOSIZE (1ULL << 12) /* set default i/o size */
-#define XFS_MOUNT_32BITINODES (1ULL << 14) /* do not create inodes above
- * 32 bits in size */
-#define XFS_MOUNT_SMALL_INUMS (1ULL << 15) /* users wants 32bit inodes */
+#define XFS_MOUNT_SMALL_INUMS (1ULL << 14) /* user wants 32bit inodes */
+#define XFS_MOUNT_32BITINODES (1ULL << 15) /* inode32 allocator active */
#define XFS_MOUNT_NOUUID (1ULL << 16) /* ignore uuid during mount */
#define XFS_MOUNT_BARRIER (1ULL << 17)
#define XFS_MOUNT_IKEEP (1ULL << 18) /* keep empty inode clusters*/
@@ -264,6 +274,20 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
return (xfs_agblock_t) do_div(ld, mp->m_sb.sb_agblocks);
}
+#ifdef DEBUG
+static inline bool
+xfs_mp_fail_writes(struct xfs_mount *mp)
+{
+ return mp->m_fail_writes;
+}
+#else
+static inline bool
+xfs_mp_fail_writes(struct xfs_mount *mp)
+{
+ return 0;
+}
+#endif
+
/*
* Per-ag incore structure, copies of information in agf and agi, to improve the
* performance of allocation group selection.
@@ -327,7 +351,6 @@ extern int xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta,
bool reserved);
extern int xfs_mod_frextents(struct xfs_mount *mp, int64_t delta);
-extern int xfs_mount_log_sb(xfs_mount_t *);
extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int);
extern int xfs_readsb(xfs_mount_t *, int);
extern void xfs_freesb(xfs_mount_t *);
diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h
new file mode 100644
index 000000000000..184c44effdd5
--- /dev/null
+++ b/fs/xfs/xfs_ondisk.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2016 Oracle.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_ONDISK_H
+#define __XFS_ONDISK_H
+
+#define XFS_CHECK_STRUCT_SIZE(structname, size) \
+ BUILD_BUG_ON_MSG(sizeof(structname) != (size), "XFS: sizeof(" \
+ #structname ") is wrong, expected " #size)
+
+static inline void __init
+xfs_check_ondisk_structs(void)
+{
+ /* ag/file structures */
+ XFS_CHECK_STRUCT_SIZE(struct xfs_acl, 4);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_acl_entry, 12);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_agf, 224);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_agfl, 36);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_agi, 336);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_key, 8);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_rec, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_bmdr_block, 4);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block, 72);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dinode, 176);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_disk_dquot, 104);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dqblk, 136);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dsb, 264);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dsymlink_hdr, 56);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_inobt_key, 4);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_inobt_rec, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_timestamp, 8);
+ XFS_CHECK_STRUCT_SIZE(xfs_alloc_key_t, 8);
+ XFS_CHECK_STRUCT_SIZE(xfs_alloc_ptr_t, 4);
+ XFS_CHECK_STRUCT_SIZE(xfs_alloc_rec_t, 8);
+ XFS_CHECK_STRUCT_SIZE(xfs_inobt_ptr_t, 4);
+
+ /* dir/attr trees */
+ XFS_CHECK_STRUCT_SIZE(struct xfs_attr3_leaf_hdr, 80);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_attr3_leafblock, 88);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_attr3_rmt_hdr, 56);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_da3_blkinfo, 56);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_da3_intnode, 64);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_da3_node_hdr, 64);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dir3_blk_hdr, 48);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dir3_data_hdr, 64);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dir3_free, 64);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dir3_free_hdr, 64);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dir3_leaf, 64);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dir3_leaf_hdr, 64);
+ XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_entry_t, 8);
+ XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_hdr_t, 32);
+ XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_map_t, 4);
+ XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_name_local_t, 4);
+
+ /*
+ * m68k has problems with xfs_attr_leaf_name_remote_t, but we pad it to
+ * 4 bytes anyway so it's not obviously a problem. Hence for the moment
+ * we don't check this structure. This can be re-instated when the attr
+ * definitions are updated to use c99 VLA definitions.
+ *
+ XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_name_remote_t, 12);
+ */
+
+ XFS_CHECK_STRUCT_SIZE(xfs_attr_leafblock_t, 40);
+ XFS_CHECK_STRUCT_SIZE(xfs_attr_shortform_t, 8);
+ XFS_CHECK_STRUCT_SIZE(xfs_da_blkinfo_t, 12);
+ XFS_CHECK_STRUCT_SIZE(xfs_da_intnode_t, 16);
+ XFS_CHECK_STRUCT_SIZE(xfs_da_node_entry_t, 8);
+ XFS_CHECK_STRUCT_SIZE(xfs_da_node_hdr_t, 16);
+ XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_free_t, 4);
+ XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_hdr_t, 16);
+ XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_unused_t, 6);
+ XFS_CHECK_STRUCT_SIZE(xfs_dir2_free_hdr_t, 16);
+ XFS_CHECK_STRUCT_SIZE(xfs_dir2_free_t, 16);
+ XFS_CHECK_STRUCT_SIZE(xfs_dir2_ino4_t, 4);
+ XFS_CHECK_STRUCT_SIZE(xfs_dir2_ino8_t, 8);
+ XFS_CHECK_STRUCT_SIZE(xfs_dir2_inou_t, 8);
+ XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_entry_t, 8);
+ XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_hdr_t, 16);
+ XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_t, 16);
+ XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_tail_t, 4);
+ XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_entry_t, 3);
+ XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_hdr_t, 10);
+ XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_off_t, 2);
+
+ /* log structures */
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dq_logformat, 24);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_efd_log_format_32, 28);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_efd_log_format_64, 32);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_efi_log_format_32, 28);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_efi_log_format_64, 32);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_extent_32, 12);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_extent_64, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_log_dinode, 176);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_icreate_log, 28);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_ictimestamp, 8);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format_32, 52);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format_64, 56);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_qoff_logformat, 20);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_trans_header, 16);
+}
+
+#endif /* __XFS_ONDISK_H */
diff --git a/fs/xfs/xfs_pnfs.h b/fs/xfs/xfs_pnfs.h
index 8147ac108820..93f74853961b 100644
--- a/fs/xfs/xfs_pnfs.h
+++ b/fs/xfs/xfs_pnfs.h
@@ -1,7 +1,7 @@
#ifndef _XFS_PNFS_H
#define _XFS_PNFS_H 1
-#ifdef CONFIG_NFSD_PNFS
+#if defined(CONFIG_NFSD_BLOCKLAYOUT) || defined(CONFIG_NFSD_SCSILAYOUT)
int xfs_fs_get_uuid(struct super_block *sb, u8 *buf, u32 *len, u64 *offset);
int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length,
struct iomap *iomap, bool write, u32 *device_generation);
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 532ab79d38fe..be125e1758c1 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -560,6 +560,37 @@ xfs_qm_shrink_count(
return list_lru_shrink_count(&qi->qi_lru, sc);
}
+STATIC void
+xfs_qm_set_defquota(
+ xfs_mount_t *mp,
+ uint type,
+ xfs_quotainfo_t *qinf)
+{
+ xfs_dquot_t *dqp;
+ struct xfs_def_quota *defq;
+ int error;
+
+ error = xfs_qm_dqread(mp, 0, type, XFS_QMOPT_DOWARN, &dqp);
+
+ if (!error) {
+ xfs_disk_dquot_t *ddqp = &dqp->q_core;
+
+ defq = xfs_get_defquota(dqp, qinf);
+
+ /*
+ * Timers and warnings have been already set, let's just set the
+ * default limits for this quota type
+ */
+ defq->bhardlimit = be64_to_cpu(ddqp->d_blk_hardlimit);
+ defq->bsoftlimit = be64_to_cpu(ddqp->d_blk_softlimit);
+ defq->ihardlimit = be64_to_cpu(ddqp->d_ino_hardlimit);
+ defq->isoftlimit = be64_to_cpu(ddqp->d_ino_softlimit);
+ defq->rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit);
+ defq->rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit);
+ xfs_qm_dqdestroy(dqp);
+ }
+}
+
/*
* This initializes all the quota information that's kept in the
* mount structure
@@ -606,19 +637,19 @@ xfs_qm_init_quotainfo(
* We try to get the limits from the superuser's limits fields.
* This is quite hacky, but it is standard quota practice.
*
- * We look at the USR dquot with id == 0 first, but if user quotas
- * are not enabled we goto the GRP dquot with id == 0.
- * We don't really care to keep separate default limits for user
- * and group quotas, at least not at this point.
- *
* Since we may not have done a quotacheck by this point, just read
* the dquot without attaching it to any hashtables or lists.
+ *
+ * Timers and warnings are globally set by the first timer found in
+ * user/group/proj quota types, otherwise a default value is used.
+ * This should be split into different fields per quota type.
*/
error = xfs_qm_dqread(mp, 0,
XFS_IS_UQUOTA_RUNNING(mp) ? XFS_DQ_USER :
(XFS_IS_GQUOTA_RUNNING(mp) ? XFS_DQ_GROUP :
XFS_DQ_PROJ),
XFS_QMOPT_DOWARN, &dqp);
+
if (!error) {
xfs_disk_dquot_t *ddqp = &dqp->q_core;
@@ -639,13 +670,6 @@ xfs_qm_init_quotainfo(
be16_to_cpu(ddqp->d_iwarns) : XFS_QM_IWARNLIMIT;
qinf->qi_rtbwarnlimit = ddqp->d_rtbwarns ?
be16_to_cpu(ddqp->d_rtbwarns) : XFS_QM_RTBWARNLIMIT;
- qinf->qi_bhardlimit = be64_to_cpu(ddqp->d_blk_hardlimit);
- qinf->qi_bsoftlimit = be64_to_cpu(ddqp->d_blk_softlimit);
- qinf->qi_ihardlimit = be64_to_cpu(ddqp->d_ino_hardlimit);
- qinf->qi_isoftlimit = be64_to_cpu(ddqp->d_ino_softlimit);
- qinf->qi_rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit);
- qinf->qi_rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit);
-
xfs_qm_dqdestroy(dqp);
} else {
qinf->qi_btimelimit = XFS_QM_BTIMELIMIT;
@@ -656,6 +680,13 @@ xfs_qm_init_quotainfo(
qinf->qi_rtbwarnlimit = XFS_QM_RTBWARNLIMIT;
}
+ if (XFS_IS_UQUOTA_RUNNING(mp))
+ xfs_qm_set_defquota(mp, XFS_DQ_USER, qinf);
+ if (XFS_IS_GQUOTA_RUNNING(mp))
+ xfs_qm_set_defquota(mp, XFS_DQ_GROUP, qinf);
+ if (XFS_IS_PQUOTA_RUNNING(mp))
+ xfs_qm_set_defquota(mp, XFS_DQ_PROJ, qinf);
+
qinf->qi_shrinker.count_objects = xfs_qm_shrink_count;
qinf->qi_shrinker.scan_objects = xfs_qm_shrink_scan;
qinf->qi_shrinker.seeks = DEFAULT_SEEKS;
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index 996a04064894..2975a822e9f0 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -53,6 +53,15 @@ extern struct kmem_zone *xfs_qm_dqtrxzone;
*/
#define XFS_DQUOT_CLUSTER_SIZE_FSB (xfs_filblks_t)1
+struct xfs_def_quota {
+ xfs_qcnt_t bhardlimit; /* default data blk hard limit */
+ xfs_qcnt_t bsoftlimit; /* default data blk soft limit */
+ xfs_qcnt_t ihardlimit; /* default inode count hard limit */
+ xfs_qcnt_t isoftlimit; /* default inode count soft limit */
+ xfs_qcnt_t rtbhardlimit; /* default realtime blk hard limit */
+ xfs_qcnt_t rtbsoftlimit; /* default realtime blk soft limit */
+};
+
/*
* Various quota information for individual filesystems.
* The mount structure keeps a pointer to this.
@@ -76,12 +85,9 @@ typedef struct xfs_quotainfo {
struct mutex qi_quotaofflock;/* to serialize quotaoff */
xfs_filblks_t qi_dqchunklen; /* # BBs in a chunk of dqs */
uint qi_dqperchunk; /* # ondisk dqs in above chunk */
- xfs_qcnt_t qi_bhardlimit; /* default data blk hard limit */
- xfs_qcnt_t qi_bsoftlimit; /* default data blk soft limit */
- xfs_qcnt_t qi_ihardlimit; /* default inode count hard limit */
- xfs_qcnt_t qi_isoftlimit; /* default inode count soft limit */
- xfs_qcnt_t qi_rtbhardlimit;/* default realtime blk hard limit */
- xfs_qcnt_t qi_rtbsoftlimit;/* default realtime blk soft limit */
+ struct xfs_def_quota qi_usr_default;
+ struct xfs_def_quota qi_grp_default;
+ struct xfs_def_quota qi_prj_default;
struct shrinker qi_shrinker;
} xfs_quotainfo_t;
@@ -104,15 +110,15 @@ xfs_dquot_tree(
}
static inline struct xfs_inode *
-xfs_dq_to_quota_inode(struct xfs_dquot *dqp)
+xfs_quota_inode(xfs_mount_t *mp, uint dq_flags)
{
- switch (dqp->dq_flags & XFS_DQ_ALLTYPES) {
+ switch (dq_flags & XFS_DQ_ALLTYPES) {
case XFS_DQ_USER:
- return dqp->q_mount->m_quotainfo->qi_uquotaip;
+ return mp->m_quotainfo->qi_uquotaip;
case XFS_DQ_GROUP:
- return dqp->q_mount->m_quotainfo->qi_gquotaip;
+ return mp->m_quotainfo->qi_gquotaip;
case XFS_DQ_PROJ:
- return dqp->q_mount->m_quotainfo->qi_pquotaip;
+ return mp->m_quotainfo->qi_pquotaip;
default:
ASSERT(0);
}
@@ -164,11 +170,27 @@ extern void xfs_qm_dqrele_all_inodes(struct xfs_mount *, uint);
/* quota ops */
extern int xfs_qm_scall_trunc_qfiles(struct xfs_mount *, uint);
-extern int xfs_qm_scall_getquota(struct xfs_mount *, xfs_dqid_t,
- uint, struct qc_dqblk *);
+extern int xfs_qm_scall_getquota(struct xfs_mount *, xfs_dqid_t *,
+ uint, struct qc_dqblk *, uint);
extern int xfs_qm_scall_setqlim(struct xfs_mount *, xfs_dqid_t, uint,
struct qc_dqblk *);
extern int xfs_qm_scall_quotaon(struct xfs_mount *, uint);
extern int xfs_qm_scall_quotaoff(struct xfs_mount *, uint);
+static inline struct xfs_def_quota *
+xfs_get_defquota(struct xfs_dquot *dqp, struct xfs_quotainfo *qi)
+{
+ struct xfs_def_quota *defq;
+
+ if (XFS_QM_ISUDQ(dqp))
+ defq = &qi->qi_usr_default;
+ else if (XFS_QM_ISGDQ(dqp))
+ defq = &qi->qi_grp_default;
+ else {
+ ASSERT(XFS_QM_ISPDQ(dqp));
+ defq = &qi->qi_prj_default;
+ }
+ return defq;
+}
+
#endif /* __XFS_QM_H__ */
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 3640c6e896af..f4d0e0a8f517 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -404,6 +404,7 @@ xfs_qm_scall_setqlim(
struct xfs_disk_dquot *ddq;
struct xfs_dquot *dqp;
struct xfs_trans *tp;
+ struct xfs_def_quota *defq;
int error;
xfs_qcnt_t hard, soft;
@@ -431,6 +432,8 @@ xfs_qm_scall_setqlim(
ASSERT(error != -ENOENT);
goto out_unlock;
}
+
+ defq = xfs_get_defquota(dqp, q);
xfs_dqunlock(dqp);
tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM);
@@ -458,8 +461,8 @@ xfs_qm_scall_setqlim(
ddq->d_blk_softlimit = cpu_to_be64(soft);
xfs_dquot_set_prealloc_limits(dqp);
if (id == 0) {
- q->qi_bhardlimit = hard;
- q->qi_bsoftlimit = soft;
+ defq->bhardlimit = hard;
+ defq->bsoftlimit = soft;
}
} else {
xfs_debug(mp, "blkhard %Ld < blksoft %Ld", hard, soft);
@@ -474,8 +477,8 @@ xfs_qm_scall_setqlim(
ddq->d_rtb_hardlimit = cpu_to_be64(hard);
ddq->d_rtb_softlimit = cpu_to_be64(soft);
if (id == 0) {
- q->qi_rtbhardlimit = hard;
- q->qi_rtbsoftlimit = soft;
+ defq->rtbhardlimit = hard;
+ defq->rtbsoftlimit = soft;
}
} else {
xfs_debug(mp, "rtbhard %Ld < rtbsoft %Ld", hard, soft);
@@ -491,8 +494,8 @@ xfs_qm_scall_setqlim(
ddq->d_ino_hardlimit = cpu_to_be64(hard);
ddq->d_ino_softlimit = cpu_to_be64(soft);
if (id == 0) {
- q->qi_ihardlimit = hard;
- q->qi_isoftlimit = soft;
+ defq->ihardlimit = hard;
+ defq->isoftlimit = soft;
}
} else {
xfs_debug(mp, "ihard %Ld < isoft %Ld", hard, soft);
@@ -635,9 +638,10 @@ out:
int
xfs_qm_scall_getquota(
struct xfs_mount *mp,
- xfs_dqid_t id,
+ xfs_dqid_t *id,
uint type,
- struct qc_dqblk *dst)
+ struct qc_dqblk *dst,
+ uint dqget_flags)
{
struct xfs_dquot *dqp;
int error;
@@ -647,7 +651,7 @@ xfs_qm_scall_getquota(
* we aren't passing the XFS_QMOPT_DOALLOC flag. If it doesn't
* exist, we'll get ENOENT back.
*/
- error = xfs_qm_dqget(mp, NULL, id, type, 0, &dqp);
+ error = xfs_qm_dqget(mp, NULL, *id, type, dqget_flags, &dqp);
if (error)
return error;
@@ -660,6 +664,9 @@ xfs_qm_scall_getquota(
goto out_put;
}
+ /* Fill in the ID we actually read from disk */
+ *id = be32_to_cpu(dqp->q_core.d_id);
+
memset(dst, 0, sizeof(*dst));
dst->d_spc_hardlimit =
XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_blk_hardlimit));
@@ -701,7 +708,7 @@ xfs_qm_scall_getquota(
if (((XFS_IS_UQUOTA_ENFORCED(mp) && type == XFS_DQ_USER) ||
(XFS_IS_GQUOTA_ENFORCED(mp) && type == XFS_DQ_GROUP) ||
(XFS_IS_PQUOTA_ENFORCED(mp) && type == XFS_DQ_PROJ)) &&
- id != 0) {
+ *id != 0) {
if ((dst->d_space > dst->d_spc_softlimit) &&
(dst->d_spc_softlimit > 0)) {
ASSERT(dst->d_spc_timer != 0);
diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c
index 7795e0d01382..f82d79a8c694 100644
--- a/fs/xfs/xfs_quotaops.c
+++ b/fs/xfs/xfs_quotaops.c
@@ -231,14 +231,45 @@ xfs_fs_get_dqblk(
struct qc_dqblk *qdq)
{
struct xfs_mount *mp = XFS_M(sb);
+ xfs_dqid_t id;
if (!XFS_IS_QUOTA_RUNNING(mp))
return -ENOSYS;
if (!XFS_IS_QUOTA_ON(mp))
return -ESRCH;
- return xfs_qm_scall_getquota(mp, from_kqid(&init_user_ns, qid),
- xfs_quota_type(qid.type), qdq);
+ id = from_kqid(&init_user_ns, qid);
+ return xfs_qm_scall_getquota(mp, &id,
+ xfs_quota_type(qid.type), qdq, 0);
+}
+
+/* Return quota info for active quota >= this qid */
+STATIC int
+xfs_fs_get_nextdqblk(
+ struct super_block *sb,
+ struct kqid *qid,
+ struct qc_dqblk *qdq)
+{
+ int ret;
+ struct xfs_mount *mp = XFS_M(sb);
+ xfs_dqid_t id;
+
+ if (!XFS_IS_QUOTA_RUNNING(mp))
+ return -ENOSYS;
+ if (!XFS_IS_QUOTA_ON(mp))
+ return -ESRCH;
+
+ id = from_kqid(&init_user_ns, *qid);
+ ret = xfs_qm_scall_getquota(mp, &id,
+ xfs_quota_type(qid->type), qdq,
+ XFS_QMOPT_DQNEXT);
+ if (ret)
+ return ret;
+
+ /* ID may be different, so convert back what we got */
+ *qid = make_kqid(current_user_ns(), qid->type, id);
+ return 0;
+
}
STATIC int
@@ -267,5 +298,6 @@ const struct quotactl_ops xfs_quotactl_operations = {
.quota_disable = xfs_quota_disable,
.rm_xquota = xfs_fs_rm_xquota,
.get_dqblk = xfs_fs_get_dqblk,
+ .get_nextdqblk = xfs_fs_get_nextdqblk,
.set_dqblk = xfs_fs_set_dqblk,
};
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index be02a68b2fe2..abf44435d04a 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -1272,7 +1272,7 @@ xfs_rtpick_extent(
ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
- seqp = (__uint64_t *)&mp->m_rbmip->i_d.di_atime;
+ seqp = (__uint64_t *)&VFS_I(mp->m_rbmip)->i_atime;
if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) {
mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM;
*seqp = 0;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 59c9b7bd958d..d760934109b5 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -45,6 +45,7 @@
#include "xfs_filestream.h"
#include "xfs_quota.h"
#include "xfs_sysfs.h"
+#include "xfs_ondisk.h"
#include <linux/namei.h>
#include <linux/init.h>
@@ -65,83 +66,85 @@ static struct kset *xfs_kset; /* top-level xfs sysfs dir */
static struct xfs_kobj xfs_dbg_kobj; /* global debug sysfs attrs */
#endif
-#define MNTOPT_LOGBUFS "logbufs" /* number of XFS log buffers */
-#define MNTOPT_LOGBSIZE "logbsize" /* size of XFS log buffers */
-#define MNTOPT_LOGDEV "logdev" /* log device */
-#define MNTOPT_RTDEV "rtdev" /* realtime I/O device */
-#define MNTOPT_BIOSIZE "biosize" /* log2 of preferred buffered io size */
-#define MNTOPT_WSYNC "wsync" /* safe-mode nfs compatible mount */
-#define MNTOPT_NOALIGN "noalign" /* turn off stripe alignment */
-#define MNTOPT_SWALLOC "swalloc" /* turn on stripe width allocation */
-#define MNTOPT_SUNIT "sunit" /* data volume stripe unit */
-#define MNTOPT_SWIDTH "swidth" /* data volume stripe width */
-#define MNTOPT_NOUUID "nouuid" /* ignore filesystem UUID */
-#define MNTOPT_MTPT "mtpt" /* filesystem mount point */
-#define MNTOPT_GRPID "grpid" /* group-ID from parent directory */
-#define MNTOPT_NOGRPID "nogrpid" /* group-ID from current process */
-#define MNTOPT_BSDGROUPS "bsdgroups" /* group-ID from parent directory */
-#define MNTOPT_SYSVGROUPS "sysvgroups" /* group-ID from current process */
-#define MNTOPT_ALLOCSIZE "allocsize" /* preferred allocation size */
-#define MNTOPT_NORECOVERY "norecovery" /* don't run XFS recovery */
-#define MNTOPT_BARRIER "barrier" /* use writer barriers for log write and
- * unwritten extent conversion */
-#define MNTOPT_NOBARRIER "nobarrier" /* .. disable */
-#define MNTOPT_64BITINODE "inode64" /* inodes can be allocated anywhere */
-#define MNTOPT_32BITINODE "inode32" /* inode allocation limited to
- * XFS_MAXINUMBER_32 */
-#define MNTOPT_IKEEP "ikeep" /* do not free empty inode clusters */
-#define MNTOPT_NOIKEEP "noikeep" /* free empty inode clusters */
-#define MNTOPT_LARGEIO "largeio" /* report large I/O sizes in stat() */
-#define MNTOPT_NOLARGEIO "nolargeio" /* do not report large I/O sizes
- * in stat(). */
-#define MNTOPT_ATTR2 "attr2" /* do use attr2 attribute format */
-#define MNTOPT_NOATTR2 "noattr2" /* do not use attr2 attribute format */
-#define MNTOPT_FILESTREAM "filestreams" /* use filestreams allocator */
-#define MNTOPT_QUOTA "quota" /* disk quotas (user) */
-#define MNTOPT_NOQUOTA "noquota" /* no quotas */
-#define MNTOPT_USRQUOTA "usrquota" /* user quota enabled */
-#define MNTOPT_GRPQUOTA "grpquota" /* group quota enabled */
-#define MNTOPT_PRJQUOTA "prjquota" /* project quota enabled */
-#define MNTOPT_UQUOTA "uquota" /* user quota (IRIX variant) */
-#define MNTOPT_GQUOTA "gquota" /* group quota (IRIX variant) */
-#define MNTOPT_PQUOTA "pquota" /* project quota (IRIX variant) */
-#define MNTOPT_UQUOTANOENF "uqnoenforce"/* user quota limit enforcement */
-#define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */
-#define MNTOPT_PQUOTANOENF "pqnoenforce"/* project quota limit enforcement */
-#define MNTOPT_QUOTANOENF "qnoenforce" /* same as uqnoenforce */
-#define MNTOPT_DISCARD "discard" /* Discard unused blocks */
-#define MNTOPT_NODISCARD "nodiscard" /* Do not discard unused blocks */
-
-#define MNTOPT_DAX "dax" /* Enable direct access to bdev pages */
-
/*
* Table driven mount option parser.
- *
- * Currently only used for remount, but it will be used for mount
- * in the future, too.
*/
enum {
- Opt_barrier,
- Opt_nobarrier,
- Opt_inode64,
- Opt_inode32,
- Opt_err
+ Opt_logbufs, Opt_logbsize, Opt_logdev, Opt_rtdev, Opt_biosize,
+ Opt_wsync, Opt_noalign, Opt_swalloc, Opt_sunit, Opt_swidth, Opt_nouuid,
+ Opt_mtpt, Opt_grpid, Opt_nogrpid, Opt_bsdgroups, Opt_sysvgroups,
+ Opt_allocsize, Opt_norecovery, Opt_barrier, Opt_nobarrier,
+ Opt_inode64, Opt_inode32, Opt_ikeep, Opt_noikeep,
+ Opt_largeio, Opt_nolargeio, Opt_attr2, Opt_noattr2, Opt_filestreams,
+ Opt_quota, Opt_noquota, Opt_usrquota, Opt_grpquota, Opt_prjquota,
+ Opt_uquota, Opt_gquota, Opt_pquota,
+ Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce,
+ Opt_discard, Opt_nodiscard, Opt_dax, Opt_err,
};
static const match_table_t tokens = {
- {Opt_barrier, "barrier"},
- {Opt_nobarrier, "nobarrier"},
- {Opt_inode64, "inode64"},
- {Opt_inode32, "inode32"},
- {Opt_err, NULL}
+ {Opt_logbufs, "logbufs=%u"}, /* number of XFS log buffers */
+ {Opt_logbsize, "logbsize=%s"}, /* size of XFS log buffers */
+ {Opt_logdev, "logdev=%s"}, /* log device */
+ {Opt_rtdev, "rtdev=%s"}, /* realtime I/O device */
+ {Opt_biosize, "biosize=%u"}, /* log2 of preferred buffered io size */
+ {Opt_wsync, "wsync"}, /* safe-mode nfs compatible mount */
+ {Opt_noalign, "noalign"}, /* turn off stripe alignment */
+ {Opt_swalloc, "swalloc"}, /* turn on stripe width allocation */
+ {Opt_sunit, "sunit=%u"}, /* data volume stripe unit */
+ {Opt_swidth, "swidth=%u"}, /* data volume stripe width */
+ {Opt_nouuid, "nouuid"}, /* ignore filesystem UUID */
+ {Opt_mtpt, "mtpt"}, /* filesystem mount point */
+ {Opt_grpid, "grpid"}, /* group-ID from parent directory */
+ {Opt_nogrpid, "nogrpid"}, /* group-ID from current process */
+ {Opt_bsdgroups, "bsdgroups"}, /* group-ID from parent directory */
+ {Opt_sysvgroups,"sysvgroups"}, /* group-ID from current process */
+ {Opt_allocsize, "allocsize=%s"},/* preferred allocation size */
+ {Opt_norecovery,"norecovery"}, /* don't run XFS recovery */
+ {Opt_barrier, "barrier"}, /* use writer barriers for log write and
+ * unwritten extent conversion */
+ {Opt_nobarrier, "nobarrier"}, /* .. disable */
+ {Opt_inode64, "inode64"}, /* inodes can be allocated anywhere */
+ {Opt_inode32, "inode32"}, /* inode allocation limited to
+ * XFS_MAXINUMBER_32 */
+ {Opt_ikeep, "ikeep"}, /* do not free empty inode clusters */
+ {Opt_noikeep, "noikeep"}, /* free empty inode clusters */
+ {Opt_largeio, "largeio"}, /* report large I/O sizes in stat() */
+ {Opt_nolargeio, "nolargeio"}, /* do not report large I/O sizes
+ * in stat(). */
+ {Opt_attr2, "attr2"}, /* do use attr2 attribute format */
+ {Opt_noattr2, "noattr2"}, /* do not use attr2 attribute format */
+ {Opt_filestreams,"filestreams"},/* use filestreams allocator */
+ {Opt_quota, "quota"}, /* disk quotas (user) */
+ {Opt_noquota, "noquota"}, /* no quotas */
+ {Opt_usrquota, "usrquota"}, /* user quota enabled */
+ {Opt_grpquota, "grpquota"}, /* group quota enabled */
+ {Opt_prjquota, "prjquota"}, /* project quota enabled */
+ {Opt_uquota, "uquota"}, /* user quota (IRIX variant) */
+ {Opt_gquota, "gquota"}, /* group quota (IRIX variant) */
+ {Opt_pquota, "pquota"}, /* project quota (IRIX variant) */
+ {Opt_uqnoenforce,"uqnoenforce"},/* user quota limit enforcement */
+ {Opt_gqnoenforce,"gqnoenforce"},/* group quota limit enforcement */
+ {Opt_pqnoenforce,"pqnoenforce"},/* project quota limit enforcement */
+ {Opt_qnoenforce, "qnoenforce"}, /* same as uqnoenforce */
+ {Opt_discard, "discard"}, /* Discard unused blocks */
+ {Opt_nodiscard, "nodiscard"}, /* Do not discard unused blocks */
+
+ {Opt_dax, "dax"}, /* Enable direct access to bdev pages */
+ {Opt_err, NULL},
};
STATIC int
-suffix_kstrtoint(char *s, unsigned int base, int *res)
+suffix_kstrtoint(const substring_t *s, unsigned int base, int *res)
{
int last, shift_left_factor = 0, _res;
- char *value = s;
+ char *value;
+ int ret = 0;
+
+ value = match_strdup(s);
+ if (!value)
+ return -ENOMEM;
last = strlen(value) - 1;
if (value[last] == 'K' || value[last] == 'k') {
@@ -157,10 +160,11 @@ suffix_kstrtoint(char *s, unsigned int base, int *res)
value[last] = '\0';
}
- if (kstrtoint(s, base, &_res))
- return -EINVAL;
+ if (kstrtoint(value, base, &_res))
+ ret = -EINVAL;
+ kfree(value);
*res = _res << shift_left_factor;
- return 0;
+ return ret;
}
/*
@@ -169,14 +173,19 @@ suffix_kstrtoint(char *s, unsigned int base, int *res)
*
* Note that this function leaks the various device name allocations on
* failure. The caller takes care of them.
+ *
+ * *sb is const because this is also used to test options on the remount
+ * path, and we don't want this to have any side effects at remount time.
+ * Today this function does not change *sb, but just to future-proof...
*/
STATIC int
xfs_parseargs(
struct xfs_mount *mp,
char *options)
{
- struct super_block *sb = mp->m_super;
- char *this_char, *value;
+ const struct super_block *sb = mp->m_super;
+ char *p;
+ substring_t args[MAX_OPT_ARGS];
int dsunit = 0;
int dswidth = 0;
int iosize = 0;
@@ -217,152 +226,152 @@ xfs_parseargs(
if (!options)
goto done;
- while ((this_char = strsep(&options, ",")) != NULL) {
- if (!*this_char)
+ while ((p = strsep(&options, ",")) != NULL) {
+ int token;
+
+ if (!*p)
continue;
- if ((value = strchr(this_char, '=')) != NULL)
- *value++ = 0;
- if (!strcmp(this_char, MNTOPT_LOGBUFS)) {
- if (!value || !*value) {
- xfs_warn(mp, "%s option requires an argument",
- this_char);
- return -EINVAL;
- }
- if (kstrtoint(value, 10, &mp->m_logbufs))
- return -EINVAL;
- } else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) {
- if (!value || !*value) {
- xfs_warn(mp, "%s option requires an argument",
- this_char);
- return -EINVAL;
- }
- if (suffix_kstrtoint(value, 10, &mp->m_logbsize))
+ token = match_token(p, tokens, args);
+ switch (token) {
+ case Opt_logbufs:
+ if (match_int(args, &mp->m_logbufs))
return -EINVAL;
- } else if (!strcmp(this_char, MNTOPT_LOGDEV)) {
- if (!value || !*value) {
- xfs_warn(mp, "%s option requires an argument",
- this_char);
+ break;
+ case Opt_logbsize:
+ if (suffix_kstrtoint(args, 10, &mp->m_logbsize))
return -EINVAL;
- }
- mp->m_logname = kstrndup(value, MAXNAMELEN, GFP_KERNEL);
+ break;
+ case Opt_logdev:
+ mp->m_logname = match_strdup(args);
if (!mp->m_logname)
return -ENOMEM;
- } else if (!strcmp(this_char, MNTOPT_MTPT)) {
- xfs_warn(mp, "%s option not allowed on this system",
- this_char);
+ break;
+ case Opt_mtpt:
+ xfs_warn(mp, "%s option not allowed on this system", p);
return -EINVAL;
- } else if (!strcmp(this_char, MNTOPT_RTDEV)) {
- if (!value || !*value) {
- xfs_warn(mp, "%s option requires an argument",
- this_char);
- return -EINVAL;
- }
- mp->m_rtname = kstrndup(value, MAXNAMELEN, GFP_KERNEL);
+ case Opt_rtdev:
+ mp->m_rtname = match_strdup(args);
if (!mp->m_rtname)
return -ENOMEM;
- } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE) ||
- !strcmp(this_char, MNTOPT_BIOSIZE)) {
- if (!value || !*value) {
- xfs_warn(mp, "%s option requires an argument",
- this_char);
- return -EINVAL;
- }
- if (suffix_kstrtoint(value, 10, &iosize))
+ break;
+ case Opt_allocsize:
+ case Opt_biosize:
+ if (suffix_kstrtoint(args, 10, &iosize))
return -EINVAL;
iosizelog = ffs(iosize) - 1;
- } else if (!strcmp(this_char, MNTOPT_GRPID) ||
- !strcmp(this_char, MNTOPT_BSDGROUPS)) {
+ break;
+ case Opt_grpid:
+ case Opt_bsdgroups:
mp->m_flags |= XFS_MOUNT_GRPID;
- } else if (!strcmp(this_char, MNTOPT_NOGRPID) ||
- !strcmp(this_char, MNTOPT_SYSVGROUPS)) {
+ break;
+ case Opt_nogrpid:
+ case Opt_sysvgroups:
mp->m_flags &= ~XFS_MOUNT_GRPID;
- } else if (!strcmp(this_char, MNTOPT_WSYNC)) {
+ break;
+ case Opt_wsync:
mp->m_flags |= XFS_MOUNT_WSYNC;
- } else if (!strcmp(this_char, MNTOPT_NORECOVERY)) {
+ break;
+ case Opt_norecovery:
mp->m_flags |= XFS_MOUNT_NORECOVERY;
- } else if (!strcmp(this_char, MNTOPT_NOALIGN)) {
+ break;
+ case Opt_noalign:
mp->m_flags |= XFS_MOUNT_NOALIGN;
- } else if (!strcmp(this_char, MNTOPT_SWALLOC)) {
+ break;
+ case Opt_swalloc:
mp->m_flags |= XFS_MOUNT_SWALLOC;
- } else if (!strcmp(this_char, MNTOPT_SUNIT)) {
- if (!value || !*value) {
- xfs_warn(mp, "%s option requires an argument",
- this_char);
- return -EINVAL;
- }
- if (kstrtoint(value, 10, &dsunit))
- return -EINVAL;
- } else if (!strcmp(this_char, MNTOPT_SWIDTH)) {
- if (!value || !*value) {
- xfs_warn(mp, "%s option requires an argument",
- this_char);
+ break;
+ case Opt_sunit:
+ if (match_int(args, &dsunit))
return -EINVAL;
- }
- if (kstrtoint(value, 10, &dswidth))
+ break;
+ case Opt_swidth:
+ if (match_int(args, &dswidth))
return -EINVAL;
- } else if (!strcmp(this_char, MNTOPT_32BITINODE)) {
+ break;
+ case Opt_inode32:
mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
- } else if (!strcmp(this_char, MNTOPT_64BITINODE)) {
+ break;
+ case Opt_inode64:
mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS;
- } else if (!strcmp(this_char, MNTOPT_NOUUID)) {
+ break;
+ case Opt_nouuid:
mp->m_flags |= XFS_MOUNT_NOUUID;
- } else if (!strcmp(this_char, MNTOPT_BARRIER)) {
+ break;
+ case Opt_barrier:
mp->m_flags |= XFS_MOUNT_BARRIER;
- } else if (!strcmp(this_char, MNTOPT_NOBARRIER)) {
+ break;
+ case Opt_nobarrier:
mp->m_flags &= ~XFS_MOUNT_BARRIER;
- } else if (!strcmp(this_char, MNTOPT_IKEEP)) {
+ break;
+ case Opt_ikeep:
mp->m_flags |= XFS_MOUNT_IKEEP;
- } else if (!strcmp(this_char, MNTOPT_NOIKEEP)) {
+ break;
+ case Opt_noikeep:
mp->m_flags &= ~XFS_MOUNT_IKEEP;
- } else if (!strcmp(this_char, MNTOPT_LARGEIO)) {
+ break;
+ case Opt_largeio:
mp->m_flags &= ~XFS_MOUNT_COMPAT_IOSIZE;
- } else if (!strcmp(this_char, MNTOPT_NOLARGEIO)) {
+ break;
+ case Opt_nolargeio:
mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE;
- } else if (!strcmp(this_char, MNTOPT_ATTR2)) {
+ break;
+ case Opt_attr2:
mp->m_flags |= XFS_MOUNT_ATTR2;
- } else if (!strcmp(this_char, MNTOPT_NOATTR2)) {
+ break;
+ case Opt_noattr2:
mp->m_flags &= ~XFS_MOUNT_ATTR2;
mp->m_flags |= XFS_MOUNT_NOATTR2;
- } else if (!strcmp(this_char, MNTOPT_FILESTREAM)) {
+ break;
+ case Opt_filestreams:
mp->m_flags |= XFS_MOUNT_FILESTREAMS;
- } else if (!strcmp(this_char, MNTOPT_NOQUOTA)) {
+ break;
+ case Opt_noquota:
mp->m_qflags &= ~XFS_ALL_QUOTA_ACCT;
mp->m_qflags &= ~XFS_ALL_QUOTA_ENFD;
mp->m_qflags &= ~XFS_ALL_QUOTA_ACTIVE;
- } else if (!strcmp(this_char, MNTOPT_QUOTA) ||
- !strcmp(this_char, MNTOPT_UQUOTA) ||
- !strcmp(this_char, MNTOPT_USRQUOTA)) {
+ break;
+ case Opt_quota:
+ case Opt_uquota:
+ case Opt_usrquota:
mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE |
XFS_UQUOTA_ENFD);
- } else if (!strcmp(this_char, MNTOPT_QUOTANOENF) ||
- !strcmp(this_char, MNTOPT_UQUOTANOENF)) {
+ break;
+ case Opt_qnoenforce:
+ case Opt_uqnoenforce:
mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE);
mp->m_qflags &= ~XFS_UQUOTA_ENFD;
- } else if (!strcmp(this_char, MNTOPT_PQUOTA) ||
- !strcmp(this_char, MNTOPT_PRJQUOTA)) {
+ break;
+ case Opt_pquota:
+ case Opt_prjquota:
mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE |
XFS_PQUOTA_ENFD);
- } else if (!strcmp(this_char, MNTOPT_PQUOTANOENF)) {
+ break;
+ case Opt_pqnoenforce:
mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE);
mp->m_qflags &= ~XFS_PQUOTA_ENFD;
- } else if (!strcmp(this_char, MNTOPT_GQUOTA) ||
- !strcmp(this_char, MNTOPT_GRPQUOTA)) {
+ case Opt_gquota:
+ case Opt_grpquota:
mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE |
XFS_GQUOTA_ENFD);
- } else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) {
+ break;
+ case Opt_gqnoenforce:
mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
mp->m_qflags &= ~XFS_GQUOTA_ENFD;
- } else if (!strcmp(this_char, MNTOPT_DISCARD)) {
+ break;
+ case Opt_discard:
mp->m_flags |= XFS_MOUNT_DISCARD;
- } else if (!strcmp(this_char, MNTOPT_NODISCARD)) {
+ break;
+ case Opt_nodiscard:
mp->m_flags &= ~XFS_MOUNT_DISCARD;
+ break;
#ifdef CONFIG_FS_DAX
- } else if (!strcmp(this_char, MNTOPT_DAX)) {
+ case Opt_dax:
mp->m_flags |= XFS_MOUNT_DAX;
+ break;
#endif
- } else {
- xfs_warn(mp, "unknown mount option [%s].", this_char);
+ default:
+ xfs_warn(mp, "unknown mount option [%s].", p);
return -EINVAL;
}
}
@@ -461,25 +470,25 @@ xfs_showargs(
{
static struct proc_xfs_info xfs_info_set[] = {
/* the few simple ones we can get from the mount struct */
- { XFS_MOUNT_IKEEP, "," MNTOPT_IKEEP },
- { XFS_MOUNT_WSYNC, "," MNTOPT_WSYNC },
- { XFS_MOUNT_NOALIGN, "," MNTOPT_NOALIGN },
- { XFS_MOUNT_SWALLOC, "," MNTOPT_SWALLOC },
- { XFS_MOUNT_NOUUID, "," MNTOPT_NOUUID },
- { XFS_MOUNT_NORECOVERY, "," MNTOPT_NORECOVERY },
- { XFS_MOUNT_ATTR2, "," MNTOPT_ATTR2 },
- { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM },
- { XFS_MOUNT_GRPID, "," MNTOPT_GRPID },
- { XFS_MOUNT_DISCARD, "," MNTOPT_DISCARD },
- { XFS_MOUNT_SMALL_INUMS, "," MNTOPT_32BITINODE },
- { XFS_MOUNT_DAX, "," MNTOPT_DAX },
+ { XFS_MOUNT_IKEEP, ",ikeep" },
+ { XFS_MOUNT_WSYNC, ",wsync" },
+ { XFS_MOUNT_NOALIGN, ",noalign" },
+ { XFS_MOUNT_SWALLOC, ",swalloc" },
+ { XFS_MOUNT_NOUUID, ",nouuid" },
+ { XFS_MOUNT_NORECOVERY, ",norecovery" },
+ { XFS_MOUNT_ATTR2, ",attr2" },
+ { XFS_MOUNT_FILESTREAMS, ",filestreams" },
+ { XFS_MOUNT_GRPID, ",grpid" },
+ { XFS_MOUNT_DISCARD, ",discard" },
+ { XFS_MOUNT_SMALL_INUMS, ",inode32" },
+ { XFS_MOUNT_DAX, ",dax" },
{ 0, NULL }
};
static struct proc_xfs_info xfs_info_unset[] = {
/* the few simple ones we can get from the mount struct */
- { XFS_MOUNT_COMPAT_IOSIZE, "," MNTOPT_LARGEIO },
- { XFS_MOUNT_BARRIER, "," MNTOPT_NOBARRIER },
- { XFS_MOUNT_SMALL_INUMS, "," MNTOPT_64BITINODE },
+ { XFS_MOUNT_COMPAT_IOSIZE, ",largeio" },
+ { XFS_MOUNT_BARRIER, ",nobarrier" },
+ { XFS_MOUNT_SMALL_INUMS, ",inode64" },
{ 0, NULL }
};
struct proc_xfs_info *xfs_infop;
@@ -494,46 +503,46 @@ xfs_showargs(
}
if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)
- seq_printf(m, "," MNTOPT_ALLOCSIZE "=%dk",
+ seq_printf(m, ",allocsize=%dk",
(int)(1 << mp->m_writeio_log) >> 10);
if (mp->m_logbufs > 0)
- seq_printf(m, "," MNTOPT_LOGBUFS "=%d", mp->m_logbufs);
+ seq_printf(m, ",logbufs=%d", mp->m_logbufs);
if (mp->m_logbsize > 0)
- seq_printf(m, "," MNTOPT_LOGBSIZE "=%dk", mp->m_logbsize >> 10);
+ seq_printf(m, ",logbsize=%dk", mp->m_logbsize >> 10);
if (mp->m_logname)
- seq_show_option(m, MNTOPT_LOGDEV, mp->m_logname);
+ seq_show_option(m, "logdev", mp->m_logname);
if (mp->m_rtname)
- seq_show_option(m, MNTOPT_RTDEV, mp->m_rtname);
+ seq_show_option(m, "rtdev", mp->m_rtname);
if (mp->m_dalign > 0)
- seq_printf(m, "," MNTOPT_SUNIT "=%d",
+ seq_printf(m, ",sunit=%d",
(int)XFS_FSB_TO_BB(mp, mp->m_dalign));
if (mp->m_swidth > 0)
- seq_printf(m, "," MNTOPT_SWIDTH "=%d",
+ seq_printf(m, ",swidth=%d",
(int)XFS_FSB_TO_BB(mp, mp->m_swidth));
if (mp->m_qflags & (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD))
- seq_puts(m, "," MNTOPT_USRQUOTA);
+ seq_puts(m, ",usrquota");
else if (mp->m_qflags & XFS_UQUOTA_ACCT)
- seq_puts(m, "," MNTOPT_UQUOTANOENF);
+ seq_puts(m, ",uqnoenforce");
if (mp->m_qflags & XFS_PQUOTA_ACCT) {
if (mp->m_qflags & XFS_PQUOTA_ENFD)
- seq_puts(m, "," MNTOPT_PRJQUOTA);
+ seq_puts(m, ",prjquota");
else
- seq_puts(m, "," MNTOPT_PQUOTANOENF);
+ seq_puts(m, ",pqnoenforce");
}
if (mp->m_qflags & XFS_GQUOTA_ACCT) {
if (mp->m_qflags & XFS_GQUOTA_ENFD)
- seq_puts(m, "," MNTOPT_GRPQUOTA);
+ seq_puts(m, ",grpquota");
else
- seq_puts(m, "," MNTOPT_GQUOTANOENF);
+ seq_puts(m, ",gqnoenforce");
}
if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT))
- seq_puts(m, "," MNTOPT_NOQUOTA);
+ seq_puts(m, ",noquota");
return 0;
}
@@ -572,23 +581,35 @@ xfs_max_file_offset(
}
/*
- * xfs_set_inode32() and xfs_set_inode64() are passed an agcount
- * because in the growfs case, mp->m_sb.sb_agcount is not updated
- * yet to the potentially higher ag count.
+ * Set parameters for inode allocation heuristics, taking into account
+ * filesystem size and inode32/inode64 mount options; i.e. specifically
+ * whether or not XFS_MOUNT_SMALL_INUMS is set.
+ *
+ * Inode allocation patterns are altered only if inode32 is requested
+ * (XFS_MOUNT_SMALL_INUMS), and the filesystem is sufficiently large.
+ * If altered, XFS_MOUNT_32BITINODES is set as well.
+ *
+ * An agcount independent of that in the mount structure is provided
+ * because in the growfs case, mp->m_sb.sb_agcount is not yet updated
+ * to the potentially higher ag count.
+ *
+ * Returns the maximum AG index which may contain inodes.
*/
xfs_agnumber_t
-xfs_set_inode32(struct xfs_mount *mp, xfs_agnumber_t agcount)
+xfs_set_inode_alloc(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agcount)
{
- xfs_agnumber_t index = 0;
+ xfs_agnumber_t index;
xfs_agnumber_t maxagi = 0;
xfs_sb_t *sbp = &mp->m_sb;
xfs_agnumber_t max_metadata;
xfs_agino_t agino;
xfs_ino_t ino;
- xfs_perag_t *pag;
- /* Calculate how much should be reserved for inodes to meet
- * the max inode percentage.
+ /*
+ * Calculate how much should be reserved for inodes to meet
+ * the max inode percentage. Used only for inode32.
*/
if (mp->m_maxicount) {
__uint64_t icount;
@@ -602,54 +623,48 @@ xfs_set_inode32(struct xfs_mount *mp, xfs_agnumber_t agcount)
max_metadata = agcount;
}
+ /* Get the last possible inode in the filesystem */
agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0);
+ ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino);
+
+ /*
+ * If user asked for no more than 32-bit inodes, and the fs is
+ * sufficiently large, set XFS_MOUNT_32BITINODES if we must alter
+ * the allocator to accommodate the request.
+ */
+ if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) && ino > XFS_MAXINUMBER_32)
+ mp->m_flags |= XFS_MOUNT_32BITINODES;
+ else
+ mp->m_flags &= ~XFS_MOUNT_32BITINODES;
for (index = 0; index < agcount; index++) {
- ino = XFS_AGINO_TO_INO(mp, index, agino);
+ struct xfs_perag *pag;
- if (ino > XFS_MAXINUMBER_32) {
- pag = xfs_perag_get(mp, index);
- pag->pagi_inodeok = 0;
- pag->pagf_metadata = 0;
- xfs_perag_put(pag);
- continue;
- }
+ ino = XFS_AGINO_TO_INO(mp, index, agino);
pag = xfs_perag_get(mp, index);
- pag->pagi_inodeok = 1;
- maxagi++;
- if (index < max_metadata)
- pag->pagf_metadata = 1;
- xfs_perag_put(pag);
- }
- mp->m_flags |= (XFS_MOUNT_32BITINODES |
- XFS_MOUNT_SMALL_INUMS);
- return maxagi;
-}
-
-xfs_agnumber_t
-xfs_set_inode64(struct xfs_mount *mp, xfs_agnumber_t agcount)
-{
- xfs_agnumber_t index = 0;
-
- for (index = 0; index < agcount; index++) {
- struct xfs_perag *pag;
+ if (mp->m_flags & XFS_MOUNT_32BITINODES) {
+ if (ino > XFS_MAXINUMBER_32) {
+ pag->pagi_inodeok = 0;
+ pag->pagf_metadata = 0;
+ } else {
+ pag->pagi_inodeok = 1;
+ maxagi++;
+ if (index < max_metadata)
+ pag->pagf_metadata = 1;
+ else
+ pag->pagf_metadata = 0;
+ }
+ } else {
+ pag->pagi_inodeok = 1;
+ pag->pagf_metadata = 0;
+ }
- pag = xfs_perag_get(mp, index);
- pag->pagi_inodeok = 1;
- pag->pagf_metadata = 0;
xfs_perag_put(pag);
}
- /* There is no need for lock protection on m_flags,
- * the rw_semaphore of the VFS superblock is locked
- * during mount/umount/remount operations, so this is
- * enough to avoid concurency on the m_flags field
- */
- mp->m_flags &= ~(XFS_MOUNT_32BITINODES |
- XFS_MOUNT_SMALL_INUMS);
- return index;
+ return (mp->m_flags & XFS_MOUNT_32BITINODES) ? maxagi : agcount;
}
STATIC int
@@ -1166,6 +1181,27 @@ xfs_quiesce_attr(
}
STATIC int
+xfs_test_remount_options(
+ struct super_block *sb,
+ struct xfs_mount *mp,
+ char *options)
+{
+ int error = 0;
+ struct xfs_mount *tmp_mp;
+
+ tmp_mp = kmem_zalloc(sizeof(*tmp_mp), KM_MAYFAIL);
+ if (!tmp_mp)
+ return -ENOMEM;
+
+ tmp_mp->m_super = sb;
+ error = xfs_parseargs(tmp_mp, options);
+ xfs_free_fsname(tmp_mp);
+ kfree(tmp_mp);
+
+ return error;
+}
+
+STATIC int
xfs_fs_remount(
struct super_block *sb,
int *flags,
@@ -1177,6 +1213,11 @@ xfs_fs_remount(
char *p;
int error;
+ /* First, check for complete junk; i.e. invalid options */
+ error = xfs_test_remount_options(sb, mp, options);
+ if (error)
+ return error;
+
sync_filesystem(sb);
while ((p = strsep(&options, ",")) != NULL) {
int token;
@@ -1193,10 +1234,12 @@ xfs_fs_remount(
mp->m_flags &= ~XFS_MOUNT_BARRIER;
break;
case Opt_inode64:
- mp->m_maxagi = xfs_set_inode64(mp, sbp->sb_agcount);
+ mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS;
+ mp->m_maxagi = xfs_set_inode_alloc(mp, sbp->sb_agcount);
break;
case Opt_inode32:
- mp->m_maxagi = xfs_set_inode32(mp, sbp->sb_agcount);
+ mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
+ mp->m_maxagi = xfs_set_inode_alloc(mp, sbp->sb_agcount);
break;
default:
/*
@@ -1344,9 +1387,8 @@ xfs_finish_flags(
*/
if (xfs_sb_version_hascrc(&mp->m_sb) &&
(mp->m_flags & XFS_MOUNT_NOATTR2)) {
- xfs_warn(mp,
-"Cannot mount a V5 filesystem as %s. %s is always enabled for V5 filesystems.",
- MNTOPT_NOATTR2, MNTOPT_ATTR2);
+ xfs_warn(mp, "Cannot mount a V5 filesystem as noattr2. "
+ "attr2 is always enabled for V5 filesystems.");
return -EINVAL;
}
@@ -1817,6 +1859,8 @@ init_xfs_fs(void)
{
int error;
+ xfs_check_ondisk_structs();
+
printk(KERN_INFO XFS_VERSION_STRING " with "
XFS_BUILD_OPTIONS " enabled\n");
diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h
index 499058fea303..2dfb1ce4585f 100644
--- a/fs/xfs/xfs_super.h
+++ b/fs/xfs/xfs_super.h
@@ -65,8 +65,8 @@ extern __uint64_t xfs_max_file_offset(unsigned int);
extern void xfs_flush_inodes(struct xfs_mount *mp);
extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
-extern xfs_agnumber_t xfs_set_inode32(struct xfs_mount *, xfs_agnumber_t agcount);
-extern xfs_agnumber_t xfs_set_inode64(struct xfs_mount *, xfs_agnumber_t agcount);
+extern xfs_agnumber_t xfs_set_inode_alloc(struct xfs_mount *,
+ xfs_agnumber_t agcount);
extern const struct export_operations xfs_export_operations;
extern const struct xattr_handler *xfs_xattr_handlers[];
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index 641d625eb334..6ced4f143494 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -18,10 +18,13 @@
#include "xfs.h"
#include "xfs_sysfs.h"
+#include "xfs_format.h"
#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_log.h"
#include "xfs_log_priv.h"
#include "xfs_stats.h"
+#include "xfs_mount.h"
struct xfs_sysfs_attr {
struct attribute attr;
@@ -45,16 +48,6 @@ to_attr(struct attribute *attr)
#define ATTR_LIST(name) &xfs_sysfs_attr_##name.attr
-/*
- * xfs_mount kobject. This currently has no attributes and thus no need for show
- * and store helpers. The mp kobject serves as the per-mount parent object that
- * is identified by the fsname under sysfs.
- */
-
-struct kobj_type xfs_mp_ktype = {
- .release = xfs_sysfs_release,
-};
-
STATIC ssize_t
xfs_sysfs_object_show(
struct kobject *kobject,
@@ -83,6 +76,71 @@ static const struct sysfs_ops xfs_sysfs_ops = {
.store = xfs_sysfs_object_store,
};
+/*
+ * xfs_mount kobject. The mp kobject also serves as the per-mount parent object
+ * that is identified by the fsname under sysfs.
+ */
+
+static inline struct xfs_mount *
+to_mp(struct kobject *kobject)
+{
+ struct xfs_kobj *kobj = to_kobj(kobject);
+
+ return container_of(kobj, struct xfs_mount, m_kobj);
+}
+
+#ifdef DEBUG
+
+STATIC ssize_t
+fail_writes_store(
+ struct kobject *kobject,
+ const char *buf,
+ size_t count)
+{
+ struct xfs_mount *mp = to_mp(kobject);
+ int ret;
+ int val;
+
+ ret = kstrtoint(buf, 0, &val);
+ if (ret)
+ return ret;
+
+ if (val == 1)
+ mp->m_fail_writes = true;
+ else if (val == 0)
+ mp->m_fail_writes = false;
+ else
+ return -EINVAL;
+
+ return count;
+}
+
+STATIC ssize_t
+fail_writes_show(
+ struct kobject *kobject,
+ char *buf)
+{
+ struct xfs_mount *mp = to_mp(kobject);
+
+ return snprintf(buf, PAGE_SIZE, "%d\n", mp->m_fail_writes ? 1 : 0);
+}
+XFS_SYSFS_ATTR_RW(fail_writes);
+
+#endif /* DEBUG */
+
+static struct attribute *xfs_mp_attrs[] = {
+#ifdef DEBUG
+ ATTR_LIST(fail_writes),
+#endif
+ NULL,
+};
+
+struct kobj_type xfs_mp_ktype = {
+ .release = xfs_sysfs_release,
+ .sysfs_ops = &xfs_sysfs_ops,
+ .default_attrs = xfs_mp_attrs,
+};
+
#ifdef DEBUG
/* debug */
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 391d797cb53f..c8d58426008e 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1296,11 +1296,7 @@ DEFINE_IOMAP_EVENT(xfs_map_blocks_found);
DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
DEFINE_IOMAP_EVENT(xfs_get_blocks_found);
DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
-DEFINE_IOMAP_EVENT(xfs_gbmap_direct);
-DEFINE_IOMAP_EVENT(xfs_gbmap_direct_new);
-DEFINE_IOMAP_EVENT(xfs_gbmap_direct_update);
-DEFINE_IOMAP_EVENT(xfs_gbmap_direct_none);
-DEFINE_IOMAP_EVENT(xfs_gbmap_direct_endio);
+DEFINE_IOMAP_EVENT(xfs_get_blocks_map_direct);
DECLARE_EVENT_CLASS(xfs_simple_io_class,
TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
@@ -1340,6 +1336,9 @@ DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert);
DEFINE_SIMPLE_IO_EVENT(xfs_get_blocks_notfound);
DEFINE_SIMPLE_IO_EVENT(xfs_setfilesize);
DEFINE_SIMPLE_IO_EVENT(xfs_zero_eof);
+DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write);
+DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_unwritten);
+DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_append);
DECLARE_EVENT_CLASS(xfs_itrunc_class,
TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size),
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 748b16aff45a..20c53666cb4b 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -1028,6 +1028,8 @@ __xfs_trans_roll(
struct xfs_trans_res tres;
int error;
+ *committed = 0;
+
/*
* Ensure that the inode is always logged.
*/
@@ -1082,6 +1084,6 @@ xfs_trans_roll(
struct xfs_trans **tpp,
struct xfs_inode *dp)
{
- int committed = 0;
+ int committed;
return __xfs_trans_roll(tpp, dp, &committed);
}
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 4643070d7cae..e7c49cf43fbc 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -133,7 +133,6 @@ typedef struct xfs_trans {
* XFS transaction mechanism exported interfaces that are
* actually macros.
*/
-#define xfs_trans_get_block_res(tp) ((tp)->t_blk_res)
#define xfs_trans_set_sync(tp) ((tp)->t_flags |= XFS_TRANS_SYNC)
#if defined(DEBUG) || defined(XFS_WARN)
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 4f18fd92ca13..d6c9c3e9e02b 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -497,6 +497,7 @@ xfsaild(
long tout = 0; /* milliseconds */
current->flags |= PF_MEMALLOC;
+ set_freezable();
while (!kthread_should_stop()) {
if (tout && tout <= 20)
@@ -519,14 +520,14 @@ xfsaild(
if (!xfs_ail_min(ailp) &&
ailp->xa_target == ailp->xa_target_prev) {
spin_unlock(&ailp->xa_lock);
- schedule();
+ freezable_schedule();
tout = 0;
continue;
}
spin_unlock(&ailp->xa_lock);
if (tout)
- schedule_timeout(msecs_to_jiffies(tout));
+ freezable_schedule_timeout(msecs_to_jiffies(tout));
__set_current_state(TASK_RUNNING);
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 75798412859a..8ee29ca132dc 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -155,7 +155,7 @@ xfs_trans_get_buf_map(
ASSERT(xfs_buf_islocked(bp));
if (XFS_FORCED_SHUTDOWN(tp->t_mountp)) {
xfs_buf_stale(bp);
- XFS_BUF_DONE(bp);
+ bp->b_flags |= XBF_DONE;
}
ASSERT(bp->b_transp == tp);
@@ -518,7 +518,7 @@ xfs_trans_log_buf(xfs_trans_t *tp,
* inside the b_bdstrat callback so that this won't get written to
* disk.
*/
- XFS_BUF_DONE(bp);
+ bp->b_flags |= XBF_DONE;
ASSERT(atomic_read(&bip->bli_refcount) > 0);
bp->b_iodone = xfs_buf_iodone_callbacks;
@@ -534,8 +534,8 @@ xfs_trans_log_buf(xfs_trans_t *tp,
*/
if (bip->bli_flags & XFS_BLI_STALE) {
bip->bli_flags &= ~XFS_BLI_STALE;
- ASSERT(XFS_BUF_ISSTALE(bp));
- XFS_BUF_UNSTALE(bp);
+ ASSERT(bp->b_flags & XBF_STALE);
+ bp->b_flags &= ~XBF_STALE;
bip->__bli_format.blf_flags &= ~XFS_BLF_CANCEL;
}
@@ -600,7 +600,7 @@ xfs_trans_binval(
* If the buffer is already invalidated, then
* just return.
*/
- ASSERT(XFS_BUF_ISSTALE(bp));
+ ASSERT(bp->b_flags & XBF_STALE);
ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY)));
ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_INODE_BUF));
ASSERT(!(bip->__bli_format.blf_flags & XFS_BLFT_MASK));
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 995170194df0..c3d547211d16 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -609,17 +609,20 @@ xfs_trans_dqresv(
xfs_qcnt_t total_count;
xfs_qcnt_t *resbcountp;
xfs_quotainfo_t *q = mp->m_quotainfo;
+ struct xfs_def_quota *defq;
xfs_dqlock(dqp);
+ defq = xfs_get_defquota(dqp, q);
+
if (flags & XFS_TRANS_DQ_RES_BLKS) {
hardlimit = be64_to_cpu(dqp->q_core.d_blk_hardlimit);
if (!hardlimit)
- hardlimit = q->qi_bhardlimit;
+ hardlimit = defq->bhardlimit;
softlimit = be64_to_cpu(dqp->q_core.d_blk_softlimit);
if (!softlimit)
- softlimit = q->qi_bsoftlimit;
+ softlimit = defq->bsoftlimit;
timer = be32_to_cpu(dqp->q_core.d_btimer);
warns = be16_to_cpu(dqp->q_core.d_bwarns);
warnlimit = dqp->q_mount->m_quotainfo->qi_bwarnlimit;
@@ -628,10 +631,10 @@ xfs_trans_dqresv(
ASSERT(flags & XFS_TRANS_DQ_RES_RTBLKS);
hardlimit = be64_to_cpu(dqp->q_core.d_rtb_hardlimit);
if (!hardlimit)
- hardlimit = q->qi_rtbhardlimit;
+ hardlimit = defq->rtbhardlimit;
softlimit = be64_to_cpu(dqp->q_core.d_rtb_softlimit);
if (!softlimit)
- softlimit = q->qi_rtbsoftlimit;
+ softlimit = defq->rtbsoftlimit;
timer = be32_to_cpu(dqp->q_core.d_rtbtimer);
warns = be16_to_cpu(dqp->q_core.d_rtbwarns);
warnlimit = dqp->q_mount->m_quotainfo->qi_rtbwarnlimit;
@@ -672,10 +675,10 @@ xfs_trans_dqresv(
warnlimit = dqp->q_mount->m_quotainfo->qi_iwarnlimit;
hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit);
if (!hardlimit)
- hardlimit = q->qi_ihardlimit;
+ hardlimit = defq->ihardlimit;
softlimit = be64_to_cpu(dqp->q_core.d_ino_softlimit);
if (!softlimit)
- softlimit = q->qi_isoftlimit;
+ softlimit = defq->isoftlimit;
if (hardlimit && total_count > hardlimit) {
xfs_quota_warn(mp, dqp, QUOTA_NL_IHARDWARN);
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index b97f1df910ab..11a3af08b5c7 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -75,18 +75,10 @@ xfs_trans_ichgtime(
tv = current_fs_time(inode->i_sb);
- if ((flags & XFS_ICHGTIME_MOD) &&
- !timespec_equal(&inode->i_mtime, &tv)) {
+ if (flags & XFS_ICHGTIME_MOD)
inode->i_mtime = tv;
- ip->i_d.di_mtime.t_sec = tv.tv_sec;
- ip->i_d.di_mtime.t_nsec = tv.tv_nsec;
- }
- if ((flags & XFS_ICHGTIME_CHG) &&
- !timespec_equal(&inode->i_ctime, &tv)) {
+ if (flags & XFS_ICHGTIME_CHG)
inode->i_ctime = tv;
- ip->i_d.di_ctime.t_sec = tv.tv_sec;
- ip->i_d.di_ctime.t_nsec = tv.tv_nsec;
- }
}
/*
@@ -125,7 +117,7 @@ xfs_trans_log_inode(
*/
if (!(ip->i_itemp->ili_item.li_desc->lid_flags & XFS_LID_DIRTY) &&
IS_I_VERSION(VFS_I(ip))) {
- ip->i_d.di_changecount = ++VFS_I(ip)->i_version;
+ VFS_I(ip)->i_version++;
flags |= XFS_ILOG_CORE;
}