Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/v9fs_vfs.h | 1
-rw-r--r--  fs/9p/vfs_dentry.c | 2
-rw-r--r--  fs/9p/vfs_inode.c | 5
-rw-r--r--  fs/9p/vfs_super.c | 8
-rw-r--r--  fs/Kconfig | 19
-rw-r--r--  fs/adfs/dir.c | 1
-rw-r--r--  fs/adfs/super.c | 4
-rw-r--r--  fs/affs/affs.h | 1
-rw-r--r--  fs/affs/namei.c | 3
-rw-r--r--  fs/affs/super.c | 6
-rw-r--r--  fs/afs/cmservice.c | 12
-rw-r--r--  fs/afs/dir.c | 5
-rw-r--r--  fs/afs/inode.c | 3
-rw-r--r--  fs/afs/internal.h | 3
-rw-r--r--  fs/afs/main.c | 13
-rw-r--r--  fs/afs/mntpt.c | 63
-rw-r--r--  fs/afs/rxrpc.c | 2
-rw-r--r--  fs/afs/server.c | 13
-rw-r--r--  fs/afs/super.c | 1
-rw-r--r--  fs/afs/vlocation.c | 14
-rw-r--r--  fs/aio.c | 31
-rw-r--r--  fs/anon_inodes.c | 29
-rw-r--r--  fs/autofs4/autofs_i.h | 113
-rw-r--r--  fs/autofs4/dev-ioctl.c | 2
-rw-r--r--  fs/autofs4/expire.c | 55
-rw-r--r--  fs/autofs4/inode.c | 114
-rw-r--r--  fs/autofs4/root.c | 743
-rw-r--r--  fs/autofs4/symlink.c | 3
-rw-r--r--  fs/autofs4/waitq.c | 17
-rw-r--r--  fs/befs/endian.h | 16
-rw-r--r--  fs/befs/linuxvfs.c | 2
-rw-r--r--  fs/binfmt_elf.c | 23
-rw-r--r--  fs/bio-integrity.c | 7
-rw-r--r--  fs/block_dev.c | 768
-rw-r--r--  fs/btrfs/Kconfig | 2
-rw-r--r--  fs/btrfs/Makefile | 2
-rw-r--r--  fs/btrfs/acl.c | 4
-rw-r--r--  fs/btrfs/btrfs_inode.h | 2
-rw-r--r--  fs/btrfs/compression.c | 329
-rw-r--r--  fs/btrfs/compression.h | 72
-rw-r--r--  fs/btrfs/ctree.c | 8
-rw-r--r--  fs/btrfs/ctree.h | 49
-rw-r--r--  fs/btrfs/disk-io.c | 412
-rw-r--r--  fs/btrfs/disk-io.h | 1
-rw-r--r--  fs/btrfs/export.c | 12
-rw-r--r--  fs/btrfs/extent-tree.c | 90
-rw-r--r--  fs/btrfs/extent_io.c | 7
-rw-r--r--  fs/btrfs/extent_io.h | 17
-rw-r--r--  fs/btrfs/extent_map.c | 2
-rw-r--r--  fs/btrfs/extent_map.h | 3
-rw-r--r--  fs/btrfs/file.c | 126
-rw-r--r--  fs/btrfs/inode.c | 199
-rw-r--r--  fs/btrfs/ioctl.c | 220
-rw-r--r--  fs/btrfs/ioctl.h | 12
-rw-r--r--  fs/btrfs/lzo.c | 420
-rw-r--r--  fs/btrfs/ordered-data.c | 18
-rw-r--r--  fs/btrfs/ordered-data.h | 8
-rw-r--r--  fs/btrfs/super.c | 282
-rw-r--r--  fs/btrfs/transaction.c | 11
-rw-r--r--  fs/btrfs/transaction.h | 1
-rw-r--r--  fs/btrfs/volumes.c | 654
-rw-r--r--  fs/btrfs/volumes.h | 29
-rw-r--r--  fs/btrfs/xattr.c | 18
-rw-r--r--  fs/btrfs/zlib.c | 369
-rw-r--r--  fs/ceph/Makefile | 23
-rw-r--r--  fs/ceph/debugfs.c | 9
-rw-r--r--  fs/ceph/dir.c | 20
-rw-r--r--  fs/ceph/export.c | 2
-rw-r--r--  fs/ceph/inode.c | 4
-rw-r--r--  fs/ceph/mds_client.c | 56
-rw-r--r--  fs/ceph/mds_client.h | 2
-rw-r--r--  fs/ceph/super.c | 13
-rw-r--r--  fs/ceph/super.h | 2
-rw-r--r--  fs/char_dev.c | 15
-rw-r--r--  fs/cifs/cifs_debug.c | 10
-rw-r--r--  fs/cifs/cifs_dfs_ref.c | 120
-rw-r--r--  fs/cifs/cifsacl.c | 13
-rw-r--r--  fs/cifs/cifsfs.c | 12
-rw-r--r--  fs/cifs/cifsfs.h | 8
-rw-r--r--  fs/cifs/cifsglob.h | 28
-rw-r--r--  fs/cifs/cifspdu.h | 15
-rw-r--r--  fs/cifs/cifsproto.h | 9
-rw-r--r--  fs/cifs/cifssmb.c | 59
-rw-r--r--  fs/cifs/connect.c | 151
-rw-r--r--  fs/cifs/dir.c | 27
-rw-r--r--  fs/cifs/file.c | 114
-rw-r--r--  fs/cifs/inode.c | 16
-rw-r--r--  fs/cifs/link.c | 4
-rw-r--r--  fs/cifs/misc.c | 2
-rw-r--r--  fs/cifs/netmisc.c | 4
-rw-r--r--  fs/cifs/readdir.c | 5
-rw-r--r--  fs/cifs/sess.c | 2
-rw-r--r--  fs/cifs/transport.c | 427
-rw-r--r--  fs/coda/cache.c | 5
-rw-r--r--  fs/coda/cnode.c | 3
-rw-r--r--  fs/coda/coda_cache.h | 22
-rw-r--r--  fs/coda/coda_fs_i.h | 58
-rw-r--r--  fs/coda/coda_linux.c | 3
-rw-r--r--  fs/coda/coda_linux.h | 101
-rw-r--r--  fs/coda/dir.c | 9
-rw-r--r--  fs/coda/file.c | 3
-rw-r--r--  fs/coda/inode.c | 8
-rw-r--r--  fs/coda/pioctl.c | 4
-rw-r--r--  fs/coda/psdev.c | 4
-rw-r--r--  fs/coda/symlink.c | 4
-rw-r--r--  fs/coda/upcall.c | 5
-rw-r--r--  fs/compat.c | 10
-rw-r--r--  fs/configfs/Kconfig | 4
-rw-r--r--  fs/configfs/configfs_internal.h | 1
-rw-r--r--  fs/configfs/dir.c | 6
-rw-r--r--  fs/configfs/mount.c | 1
-rw-r--r--  fs/cramfs/inode.c | 110
-rw-r--r--  fs/dcache.c | 18
-rw-r--r--  fs/direct-io.c | 10
-rw-r--r--  fs/dlm/Kconfig | 3
-rw-r--r--  fs/ecryptfs/crypto.c | 30
-rw-r--r--  fs/ecryptfs/ecryptfs_kernel.h | 2
-rw-r--r--  fs/ecryptfs/file.c | 28
-rw-r--r--  fs/ecryptfs/inode.c | 33
-rw-r--r--  fs/ecryptfs/keystore.c | 26
-rw-r--r--  fs/ecryptfs/main.c | 165
-rw-r--r--  fs/ecryptfs/mmap.c | 35
-rw-r--r--  fs/eventpoll.c | 20
-rw-r--r--  fs/ext3/super.c | 12
-rw-r--r--  fs/ext4/ext4.h | 4
-rw-r--r--  fs/ext4/extents.c | 17
-rw-r--r--  fs/ext4/file.c | 2
-rw-r--r--  fs/ext4/inode.c | 9
-rw-r--r--  fs/ext4/super.c | 12
-rw-r--r--  fs/fat/fat.h | 3
-rw-r--r--  fs/fat/inode.c | 13
-rw-r--r--  fs/fat/namei_msdos.c | 27
-rw-r--r--  fs/fat/namei_vfat.c | 27
-rw-r--r--  fs/file_table.c | 2
-rw-r--r--  fs/fs-writeback.c | 105
-rw-r--r--  fs/fs_struct.c | 35
-rw-r--r--  fs/fscache/operation.c | 2
-rw-r--r--  fs/fuse/dir.c | 1
-rw-r--r--  fs/fuse/inode.c | 10
-rw-r--r--  fs/gfs2/export.c | 13
-rw-r--r--  fs/gfs2/file.c | 258
-rw-r--r--  fs/gfs2/inode.c | 72
-rw-r--r--  fs/gfs2/inode.h | 1
-rw-r--r--  fs/gfs2/ops_fstype.c | 10
-rw-r--r--  fs/gfs2/ops_inode.c | 256
-rw-r--r--  fs/gfs2/super.c | 1
-rw-r--r--  fs/hfs/dir.c | 2
-rw-r--r--  fs/hfs/super.c | 3
-rw-r--r--  fs/hfsplus/dir.c | 1
-rw-r--r--  fs/hfsplus/super.c | 2
-rw-r--r--  fs/hostfs/hostfs_kern.c | 2
-rw-r--r--  fs/hpfs/dentry.c | 7
-rw-r--r--  fs/hpfs/dir.c | 1
-rw-r--r--  fs/hpfs/hpfs_fn.h | 2
-rw-r--r--  fs/hpfs/inode.c | 2
-rw-r--r--  fs/hpfs/super.c | 2
-rw-r--r--  fs/internal.h | 4
-rw-r--r--  fs/ioctl.c | 10
-rw-r--r--  fs/isofs/inode.c | 13
-rw-r--r--  fs/isofs/namei.c | 2
-rw-r--r--  fs/jbd/transaction.c | 2
-rw-r--r--  fs/jbd2/transaction.c | 2
-rw-r--r--  fs/jffs2/build.c | 5
-rw-r--r--  fs/jffs2/jffs2_fs_sb.h | 2
-rw-r--r--  fs/jffs2/xattr.c | 12
-rw-r--r--  fs/jfs/jfs_logmgr.c | 17
-rw-r--r--  fs/jfs/namei.c | 10
-rw-r--r--  fs/jfs/super.c | 6
-rw-r--r--  fs/libfs.c | 4
-rw-r--r--  fs/locks.c | 8
-rw-r--r--  fs/logfs/dev_bdev.c | 7
-rw-r--r--  fs/minix/namei.c | 2
-rw-r--r--  fs/mpage.c | 49
-rw-r--r--  fs/namei.c | 424
-rw-r--r--  fs/namespace.c | 223
-rw-r--r--  fs/ncpfs/dir.c | 19
-rw-r--r--  fs/ncpfs/file.c | 3
-rw-r--r--  fs/ncpfs/inode.c | 6
-rw-r--r--  fs/ncpfs/ioctl.c | 4
-rw-r--r--  fs/ncpfs/mmap.c | 4
-rw-r--r--  fs/ncpfs/ncp_fs.h | 98
-rw-r--r--  fs/ncpfs/ncp_fs_i.h | 29
-rw-r--r--  fs/ncpfs/ncp_fs_sb.h | 176
-rw-r--r--  fs/ncpfs/ncplib_kernel.c | 2
-rw-r--r--  fs/ncpfs/ncplib_kernel.h | 2
-rw-r--r--  fs/ncpfs/ncpsign_kernel.c | 1
-rw-r--r--  fs/ncpfs/ncpsign_kernel.h | 2
-rw-r--r--  fs/ncpfs/sock.c | 2
-rw-r--r--  fs/ncpfs/symlink.c | 4
-rw-r--r--  fs/nfs/dir.c | 20
-rw-r--r--  fs/nfs/getroot.c | 6
-rw-r--r--  fs/nfs/inode.c | 4
-rw-r--r--  fs/nfs/internal.h | 1
-rw-r--r--  fs/nfs/namespace.c | 77
-rw-r--r--  fs/nfs/super.c | 1
-rw-r--r--  fs/nfsd/acl.h | 59
-rw-r--r--  fs/nfsd/export.c | 4
-rw-r--r--  fs/nfsd/idmap.h | 62
-rw-r--r--  fs/nfsd/nfs3proc.c | 8
-rw-r--r--  fs/nfsd/nfs4acl.c | 2
-rw-r--r--  fs/nfsd/nfs4callback.c | 151
-rw-r--r--  fs/nfsd/nfs4idmap.c | 15
-rw-r--r--  fs/nfsd/nfs4proc.c | 59
-rw-r--r--  fs/nfsd/nfs4recover.c | 1
-rw-r--r--  fs/nfsd/nfs4state.c | 243
-rw-r--r--  fs/nfsd/nfs4xdr.c | 115
-rw-r--r--  fs/nfsd/nfsctl.c | 4
-rw-r--r--  fs/nfsd/nfsd.h | 1
-rw-r--r--  fs/nfsd/nfsproc.c | 6
-rw-r--r--  fs/nfsd/nfssvc.c | 2
-rw-r--r--  fs/nfsd/state.h | 16
-rw-r--r--  fs/nfsd/vfs.c | 95
-rw-r--r--  fs/nfsd/xdr4.h | 9
-rw-r--r--  fs/nilfs2/super.c | 8
-rw-r--r--  fs/notify/fanotify/Kconfig | 2
-rw-r--r--  fs/ntfs/Makefile | 2
-rw-r--r--  fs/ntfs/file.c | 35
-rw-r--r--  fs/ntfs/super.c | 6
-rw-r--r--  fs/ocfs2/Kconfig | 3
-rw-r--r--  fs/ocfs2/cluster/heartbeat.c | 2
-rw-r--r--  fs/ocfs2/export.c | 6
-rw-r--r--  fs/ocfs2/file.c | 18
-rw-r--r--  fs/ocfs2/inode.c | 2
-rw-r--r--  fs/ocfs2/namei.c | 5
-rw-r--r--  fs/ocfs2/suballoc.c | 2
-rw-r--r--  fs/ocfs2/super.c | 1
-rw-r--r--  fs/open.c | 11
-rw-r--r--  fs/partitions/check.c | 106
-rw-r--r--  fs/pipe.c | 16
-rw-r--r--  fs/proc/Kconfig | 6
-rw-r--r--  fs/proc/Makefile | 2
-rw-r--r--  fs/proc/array.c | 28
-rw-r--r--  fs/proc/base.c | 57
-rw-r--r--  fs/proc/consoles.c (renamed from fs/proc/proc_console.c) | 4
-rw-r--r--  fs/proc/devices.c | 4
-rw-r--r--  fs/proc/generic.c | 17
-rw-r--r--  fs/proc/inode.c | 7
-rw-r--r--  fs/proc/internal.h | 5
-rw-r--r--  fs/proc/kcore.c | 2
-rw-r--r--  fs/proc/meminfo.c | 14
-rw-r--r--  fs/proc/page.c | 16
-rw-r--r--  fs/proc/proc_tty.c | 26
-rw-r--r--  fs/proc/softirqs.c | 6
-rw-r--r--  fs/proc/stat.c | 2
-rw-r--r--  fs/proc/task_mmu.c | 12
-rw-r--r--  fs/proc/task_nommu.c | 7
-rw-r--r--  fs/read_write.c | 27
-rw-r--r--  fs/reiserfs/journal.c | 21
-rw-r--r--  fs/reiserfs/prints.c | 4
-rw-r--r--  fs/select.c | 2
-rw-r--r--  fs/splice.c | 43
-rw-r--r--  fs/squashfs/Kconfig | 18
-rw-r--r--  fs/squashfs/Makefile | 1
-rw-r--r--  fs/squashfs/block.c | 1
-rw-r--r--  fs/squashfs/cache.c | 1
-rw-r--r--  fs/squashfs/decompressor.c | 16
-rw-r--r--  fs/squashfs/decompressor.h | 9
-rw-r--r--  fs/squashfs/fragment.c | 1
-rw-r--r--  fs/squashfs/id.c | 1
-rw-r--r--  fs/squashfs/lzo_wrapper.c | 1
-rw-r--r--  fs/squashfs/squashfs.h | 8
-rw-r--r--  fs/squashfs/squashfs_fs.h | 1
-rw-r--r--  fs/squashfs/squashfs_fs_i.h | 6
-rw-r--r--  fs/squashfs/xattr_id.c | 1
-rw-r--r--  fs/squashfs/xz_wrapper.c | 153
-rw-r--r--  fs/squashfs/zlib_wrapper.c | 15
-rw-r--r--  fs/stat.c | 4
-rw-r--r--  fs/super.c | 21
-rw-r--r--  fs/sysfs/Kconfig | 2
-rw-r--r--  fs/sysv/namei.c | 1
-rw-r--r--  fs/sysv/super.c | 8
-rw-r--r--  fs/xfs/Makefile | 1
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.c | 7
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.h | 7
-rw-r--r--  fs/xfs/linux-2.6/xfs_discard.c | 191
-rw-r--r--  fs/xfs/linux-2.6/xfs_discard.h | 8
-rw-r--r--  fs/xfs/linux-2.6/xfs_file.c | 587
-rw-r--r--  fs/xfs/linux-2.6/xfs_ioctl.c | 3
-rw-r--r--  fs/xfs/linux-2.6/xfs_iops.c | 54
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.c | 9
-rw-r--r--  fs/xfs/linux-2.6/xfs_sync.c | 11
-rw-r--r--  fs/xfs/linux-2.6/xfs_sysctl.c | 23
-rw-r--r--  fs/xfs/linux-2.6/xfs_trace.h | 33
-rw-r--r--  fs/xfs/support/debug.c | 112
-rw-r--r--  fs/xfs/support/debug.h | 25
-rw-r--r--  fs/xfs/xfs_alloc.c | 10
-rw-r--r--  fs/xfs/xfs_alloc.h | 25
-rw-r--r--  fs/xfs/xfs_buf_item.c | 151
-rw-r--r--  fs/xfs/xfs_error.c | 31
-rw-r--r--  fs/xfs/xfs_error.h | 18
-rw-r--r--  fs/xfs/xfs_fsops.c | 10
-rw-r--r--  fs/xfs/xfs_fsops.h | 2
-rw-r--r--  fs/xfs/xfs_log.c | 2
-rw-r--r--  fs/xfs/xfs_log_recover.c | 2
-rw-r--r--  fs/xfs/xfs_trans.c | 2
295 files changed, 7689 insertions, 4856 deletions
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index bab0eac873f4..b789f8e597ec 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -59,7 +59,6 @@ void v9fs_stat2inode_dotl(struct p9_stat_dotl *, struct inode *);
int v9fs_dir_release(struct inode *inode, struct file *filp);
int v9fs_file_open(struct inode *inode, struct file *file);
void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat);
-void v9fs_dentry_release(struct dentry *);
int v9fs_uflags2omode(int uflags, int extended);
ssize_t v9fs_file_readn(struct file *, char *, char __user *, u32, u64);
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index 466d2a4fc5cb..233b7d4ffe5e 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -86,7 +86,7 @@ static int v9fs_cached_dentry_delete(const struct dentry *dentry)
*
*/
-void v9fs_dentry_release(struct dentry *dentry)
+static void v9fs_dentry_release(struct dentry *dentry)
{
struct v9fs_dentry *dent;
struct p9_fid *temp, *current_fid;
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 5076eeb95502..b76a40bdf4c2 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -699,11 +699,6 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
goto error_iput;
inst_out:
- if (v9ses->cache)
- d_set_d_op(dentry, &v9fs_cached_dentry_operations);
- else
- d_set_d_op(dentry, &v9fs_dentry_operations);
-
d_add(dentry, inode);
return NULL;
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index c55c614500ad..dbaabe3b8131 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -141,6 +141,11 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
}
v9fs_fill_super(sb, v9ses, flags, data);
+ if (v9ses->cache)
+ sb->s_d_op = &v9fs_cached_dentry_operations;
+ else
+ sb->s_d_op = &v9fs_dentry_operations;
+
inode = v9fs_get_inode(sb, S_IFDIR | mode);
if (IS_ERR(inode)) {
retval = PTR_ERR(inode);
@@ -217,9 +222,6 @@ static void v9fs_kill_super(struct super_block *s)
P9_DPRINTK(P9_DEBUG_VFS, " %p\n", s);
- if (s->s_root)
- v9fs_dentry_release(s->s_root); /* clunk root */
-
kill_anon_super(s);
v9fs_session_cancel(v9ses);
diff --git a/fs/Kconfig b/fs/Kconfig
index 771f457402d4..3db9caa57edc 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -30,15 +30,6 @@ config FS_MBCACHE
source "fs/reiserfs/Kconfig"
source "fs/jfs/Kconfig"
-config FS_POSIX_ACL
-# Posix ACL utility routines (for now, only ext2/ext3/jfs/reiserfs/nfs4)
-#
-# NOTE: you can implement Posix ACLs without these helpers (XFS does).
-# Never use this symbol for ifdefs.
-#
- bool
- default n
-
source "fs/xfs/Kconfig"
source "fs/gfs2/Kconfig"
source "fs/ocfs2/Kconfig"
@@ -47,11 +38,19 @@ source "fs/nilfs2/Kconfig"
endif # BLOCK
+# Posix ACL utility routines
+#
+# Note: Posix ACLs can be implemented without these helpers. Never use
+# this symbol for ifdefs in core code.
+#
+config FS_POSIX_ACL
+ def_bool n
+
config EXPORTFS
tristate
config FILE_LOCKING
- bool "Enable POSIX file locking API" if EMBEDDED
+ bool "Enable POSIX file locking API" if EXPERT
default y
help
This option enables standard file locking support, required
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index bf7693c384f9..3b4a764ed780 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -276,7 +276,6 @@ adfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
struct object_info obj;
int error;
- d_set_d_op(dentry, &adfs_dentry_operations);
lock_kernel();
error = adfs_dir_lookup_byname(dir, &dentry->d_name, &obj);
if (error == 0) {
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index a4041b52fbca..2d7954049fbe 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -473,6 +473,7 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
asb->s_namelen = ADFS_F_NAME_LEN;
}
+ sb->s_d_op = &adfs_dentry_operations;
root = adfs_iget(sb, &root_obj);
sb->s_root = d_alloc_root(root);
if (!sb->s_root) {
@@ -483,8 +484,7 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
kfree(asb->s_map);
adfs_error(sb, "get root inode failed\n");
goto error;
- } else
- d_set_d_op(sb->s_root, &adfs_dentry_operations);
+ }
unlock_kernel();
return 0;
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index a8cbdeb34025..0e95f73a7023 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -201,6 +201,7 @@ extern const struct address_space_operations affs_aops;
extern const struct address_space_operations affs_aops_ofs;
extern const struct dentry_operations affs_dentry_operations;
+extern const struct dentry_operations affs_intl_dentry_operations;
static inline void
affs_set_blocksize(struct super_block *sb, int size)
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index 944a4042fb65..e3e9efc1fdd8 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -32,7 +32,7 @@ const struct dentry_operations affs_dentry_operations = {
.d_compare = affs_compare_dentry,
};
-static const struct dentry_operations affs_intl_dentry_operations = {
+const struct dentry_operations affs_intl_dentry_operations = {
.d_hash = affs_intl_hash_dentry,
.d_compare = affs_intl_compare_dentry,
};
@@ -240,7 +240,6 @@ affs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
if (IS_ERR(inode))
return ERR_CAST(inode);
}
- d_set_d_op(dentry, AFFS_SB(sb)->s_flags & SF_INTL ? &affs_intl_dentry_operations : &affs_dentry_operations);
d_add(dentry, inode);
return NULL;
}
diff --git a/fs/affs/super.c b/fs/affs/super.c
index d39081bbe7ce..b31507d0f9b9 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -477,12 +477,16 @@ got_root:
goto out_error_noinode;
}
+ if (AFFS_SB(sb)->s_flags & SF_INTL)
+ sb->s_d_op = &affs_intl_dentry_operations;
+ else
+ sb->s_d_op = &affs_dentry_operations;
+
sb->s_root = d_alloc_root(root_inode);
if (!sb->s_root) {
printk(KERN_ERR "AFFS: Get root inode failed\n");
goto out_error;
}
- d_set_d_op(sb->s_root, &affs_dentry_operations);
pr_debug("AFFS: s_flags=%lX\n",sb->s_flags);
return 0;
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index a3bcec75c54a..1c8c6cc6de30 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -289,7 +289,7 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb,
call->server = server;
INIT_WORK(&call->work, SRXAFSCB_CallBack);
- schedule_work(&call->work);
+ queue_work(afs_wq, &call->work);
return 0;
}
@@ -336,7 +336,7 @@ static int afs_deliver_cb_init_call_back_state(struct afs_call *call,
call->server = server;
INIT_WORK(&call->work, SRXAFSCB_InitCallBackState);
- schedule_work(&call->work);
+ queue_work(afs_wq, &call->work);
return 0;
}
@@ -367,7 +367,7 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call,
call->server = server;
INIT_WORK(&call->work, SRXAFSCB_InitCallBackState);
- schedule_work(&call->work);
+ queue_work(afs_wq, &call->work);
return 0;
}
@@ -400,7 +400,7 @@ static int afs_deliver_cb_probe(struct afs_call *call, struct sk_buff *skb,
call->state = AFS_CALL_REPLYING;
INIT_WORK(&call->work, SRXAFSCB_Probe);
- schedule_work(&call->work);
+ queue_work(afs_wq, &call->work);
return 0;
}
@@ -496,7 +496,7 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call, struct sk_buff *skb,
call->state = AFS_CALL_REPLYING;
INIT_WORK(&call->work, SRXAFSCB_ProbeUuid);
- schedule_work(&call->work);
+ queue_work(afs_wq, &call->work);
return 0;
}
@@ -580,6 +580,6 @@ static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *call,
call->state = AFS_CALL_REPLYING;
INIT_WORK(&call->work, SRXAFSCB_TellMeAboutYourself);
- schedule_work(&call->work);
+ queue_work(afs_wq, &call->work);
return 0;
}
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 34a3263d60a4..20c106f24927 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -62,10 +62,11 @@ const struct inode_operations afs_dir_inode_operations = {
.setattr = afs_setattr,
};
-static const struct dentry_operations afs_fs_dentry_operations = {
+const struct dentry_operations afs_fs_dentry_operations = {
.d_revalidate = afs_d_revalidate,
.d_delete = afs_d_delete,
.d_release = afs_d_release,
+ .d_automount = afs_d_automount,
};
#define AFS_DIR_HASHTBL_SIZE 128
@@ -582,8 +583,6 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
}
success:
- d_set_d_op(dentry, &afs_fs_dentry_operations);
-
d_add(dentry, inode);
_leave(" = 0 { vn=%u u=%u } -> { ino=%lu v=%llu }",
fid.vnode,
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 0747339011c3..db66c5201474 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -184,7 +184,8 @@ struct inode *afs_iget_autocell(struct inode *dir, const char *dev_name,
inode->i_generation = 0;
set_bit(AFS_VNODE_PSEUDODIR, &vnode->flags);
- inode->i_flags |= S_NOATIME;
+ set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags);
+ inode->i_flags |= S_AUTOMOUNT | S_NOATIME;
unlock_new_inode(inode);
_leave(" = %p", inode);
return inode;
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 6d4bc1c8ff60..5a9b6843bac1 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -486,6 +486,7 @@ extern bool afs_cm_incoming_call(struct afs_call *);
* dir.c
*/
extern const struct inode_operations afs_dir_inode_operations;
+extern const struct dentry_operations afs_fs_dentry_operations;
extern const struct file_operations afs_dir_file_operations;
/*
@@ -576,6 +577,7 @@ extern int afs_drop_inode(struct inode *);
/*
* main.c
*/
+extern struct workqueue_struct *afs_wq;
extern struct afs_uuid afs_uuid;
/*
@@ -590,6 +592,7 @@ extern const struct inode_operations afs_mntpt_inode_operations;
extern const struct inode_operations afs_autocell_inode_operations;
extern const struct file_operations afs_mntpt_file_operations;
+extern struct vfsmount *afs_d_automount(struct path *);
extern int afs_mntpt_check_symlink(struct afs_vnode *, struct key *);
extern void afs_mntpt_kill_timer(void);
diff --git a/fs/afs/main.c b/fs/afs/main.c
index cfd1cbe25b22..42dd2e499ed8 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -30,6 +30,7 @@ module_param(rootcell, charp, 0);
MODULE_PARM_DESC(rootcell, "root AFS cell name and VL server IP addr list");
struct afs_uuid afs_uuid;
+struct workqueue_struct *afs_wq;
/*
* get a client UUID
@@ -87,10 +88,16 @@ static int __init afs_init(void)
if (ret < 0)
return ret;
+ /* create workqueue */
+ ret = -ENOMEM;
+ afs_wq = alloc_workqueue("afs", 0, 0);
+ if (!afs_wq)
+ return ret;
+
/* register the /proc stuff */
ret = afs_proc_init();
if (ret < 0)
- return ret;
+ goto error_proc;
#ifdef CONFIG_AFS_FSCACHE
/* we want to be able to cache */
@@ -140,6 +147,8 @@ error_cell_init:
error_cache:
#endif
afs_proc_cleanup();
+error_proc:
+ destroy_workqueue(afs_wq);
rcu_barrier();
printk(KERN_ERR "kAFS: failed to register: %d\n", ret);
return ret;
@@ -163,7 +172,7 @@ static void __exit afs_exit(void)
afs_purge_servers();
afs_callback_update_kill();
afs_vlocation_purge();
- flush_scheduled_work();
+ destroy_workqueue(afs_wq);
afs_cell_purge();
#ifdef CONFIG_AFS_FSCACHE
fscache_unregister_netfs(&afs_cache_netfs);
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 6153417caf57..aa59184151d0 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -24,7 +24,6 @@ static struct dentry *afs_mntpt_lookup(struct inode *dir,
struct dentry *dentry,
struct nameidata *nd);
static int afs_mntpt_open(struct inode *inode, struct file *file);
-static void *afs_mntpt_follow_link(struct dentry *dentry, struct nameidata *nd);
static void afs_mntpt_expiry_timed_out(struct work_struct *work);
const struct file_operations afs_mntpt_file_operations = {
@@ -34,13 +33,11 @@ const struct file_operations afs_mntpt_file_operations = {
const struct inode_operations afs_mntpt_inode_operations = {
.lookup = afs_mntpt_lookup,
- .follow_link = afs_mntpt_follow_link,
.readlink = page_readlink,
.getattr = afs_getattr,
};
const struct inode_operations afs_autocell_inode_operations = {
- .follow_link = afs_mntpt_follow_link,
.getattr = afs_getattr,
};
@@ -88,6 +85,7 @@ int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key)
_debug("symlink is a mountpoint");
spin_lock(&vnode->lock);
set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags);
+ vnode->vfs_inode.i_flags |= S_AUTOMOUNT;
spin_unlock(&vnode->lock);
}
@@ -238,52 +236,24 @@ error_no_devname:
}
/*
- * follow a link from a mountpoint directory, thus causing it to be mounted
+ * handle an automount point
*/
-static void *afs_mntpt_follow_link(struct dentry *dentry, struct nameidata *nd)
+struct vfsmount *afs_d_automount(struct path *path)
{
struct vfsmount *newmnt;
- int err;
- _enter("%p{%s},{%s:%p{%s},}",
- dentry,
- dentry->d_name.name,
- nd->path.mnt->mnt_devname,
- dentry,
- nd->path.dentry->d_name.name);
-
- dput(nd->path.dentry);
- nd->path.dentry = dget(dentry);
+ _enter("{%s,%s}", path->mnt->mnt_devname, path->dentry->d_name.name);
- newmnt = afs_mntpt_do_automount(nd->path.dentry);
- if (IS_ERR(newmnt)) {
- path_put(&nd->path);
- return (void *)newmnt;
- }
-
- mntget(newmnt);
- err = do_add_mount(newmnt, &nd->path, MNT_SHRINKABLE, &afs_vfsmounts);
- switch (err) {
- case 0:
- path_put(&nd->path);
- nd->path.mnt = newmnt;
- nd->path.dentry = dget(newmnt->mnt_root);
- schedule_delayed_work(&afs_mntpt_expiry_timer,
- afs_mntpt_expiry_timeout * HZ);
- break;
- case -EBUSY:
- /* someone else made a mount here whilst we were busy */
- while (d_mountpoint(nd->path.dentry) &&
- follow_down(&nd->path))
- ;
- err = 0;
- default:
- mntput(newmnt);
- break;
- }
+ newmnt = afs_mntpt_do_automount(path->dentry);
+ if (IS_ERR(newmnt))
+ return newmnt;
- _leave(" = %d", err);
- return ERR_PTR(err);
+ mntget(newmnt); /* prevent immediate expiration */
+ mnt_set_expiry(newmnt, &afs_vfsmounts);
+ queue_delayed_work(afs_wq, &afs_mntpt_expiry_timer,
+ afs_mntpt_expiry_timeout * HZ);
+ _leave(" = %p {%s}", newmnt, newmnt->mnt_devname);
+ return newmnt;
}
/*
@@ -295,8 +265,8 @@ static void afs_mntpt_expiry_timed_out(struct work_struct *work)
if (!list_empty(&afs_vfsmounts)) {
mark_mounts_for_expiry(&afs_vfsmounts);
- schedule_delayed_work(&afs_mntpt_expiry_timer,
- afs_mntpt_expiry_timeout * HZ);
+ queue_delayed_work(afs_wq, &afs_mntpt_expiry_timer,
+ afs_mntpt_expiry_timeout * HZ);
}
_leave("");
@@ -310,6 +280,5 @@ void afs_mntpt_kill_timer(void)
_enter("");
ASSERT(list_empty(&afs_vfsmounts));
- cancel_delayed_work(&afs_mntpt_expiry_timer);
- flush_scheduled_work();
+ cancel_delayed_work_sync(&afs_mntpt_expiry_timer);
}
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 654d8fdbf01f..e45a323aebb4 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -410,7 +410,7 @@ static void afs_rx_interceptor(struct sock *sk, unsigned long user_call_ID,
if (!call) {
/* its an incoming call for our callback service */
skb_queue_tail(&afs_incoming_calls, skb);
- schedule_work(&afs_collect_incoming_call_work);
+ queue_work(afs_wq, &afs_collect_incoming_call_work);
} else {
/* route the messages directly to the appropriate call */
skb_queue_tail(&call->rx_queue, skb);
diff --git a/fs/afs/server.c b/fs/afs/server.c
index 9fdc7fe3a7bc..d59b7516e943 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -238,8 +238,8 @@ void afs_put_server(struct afs_server *server)
if (atomic_read(&server->usage) == 0) {
list_move_tail(&server->grave, &afs_server_graveyard);
server->time_of_death = get_seconds();
- schedule_delayed_work(&afs_server_reaper,
- afs_server_timeout * HZ);
+ queue_delayed_work(afs_wq, &afs_server_reaper,
+ afs_server_timeout * HZ);
}
spin_unlock(&afs_server_graveyard_lock);
_leave(" [dead]");
@@ -285,10 +285,11 @@ static void afs_reap_server(struct work_struct *work)
expiry = server->time_of_death + afs_server_timeout;
if (expiry > now) {
delay = (expiry - now) * HZ;
- if (!schedule_delayed_work(&afs_server_reaper, delay)) {
+ if (!queue_delayed_work(afs_wq, &afs_server_reaper,
+ delay)) {
cancel_delayed_work(&afs_server_reaper);
- schedule_delayed_work(&afs_server_reaper,
- delay);
+ queue_delayed_work(afs_wq, &afs_server_reaper,
+ delay);
}
break;
}
@@ -323,5 +324,5 @@ void __exit afs_purge_servers(void)
{
afs_server_timeout = 0;
cancel_delayed_work(&afs_server_reaper);
- schedule_delayed_work(&afs_server_reaper, 0);
+ queue_delayed_work(afs_wq, &afs_server_reaper, 0);
}
diff --git a/fs/afs/super.c b/fs/afs/super.c
index f901a9d7c111..fb240e8766d6 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -336,6 +336,7 @@ static int afs_fill_super(struct super_block *sb, void *data)
if (!root)
goto error;
+ sb->s_d_op = &afs_fs_dentry_operations;
sb->s_root = root;
_leave(" = 0");
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
index 9ac260d1361d..431984d2e372 100644
--- a/fs/afs/vlocation.c
+++ b/fs/afs/vlocation.c
@@ -507,8 +507,8 @@ void afs_put_vlocation(struct afs_vlocation *vl)
_debug("buried");
list_move_tail(&vl->grave, &afs_vlocation_graveyard);
vl->time_of_death = get_seconds();
- schedule_delayed_work(&afs_vlocation_reap,
- afs_vlocation_timeout * HZ);
+ queue_delayed_work(afs_wq, &afs_vlocation_reap,
+ afs_vlocation_timeout * HZ);
/* suspend updates on this record */
if (!list_empty(&vl->update)) {
@@ -561,11 +561,11 @@ static void afs_vlocation_reaper(struct work_struct *work)
if (expiry > now) {
delay = (expiry - now) * HZ;
_debug("delay %lu", delay);
- if (!schedule_delayed_work(&afs_vlocation_reap,
- delay)) {
+ if (!queue_delayed_work(afs_wq, &afs_vlocation_reap,
+ delay)) {
cancel_delayed_work(&afs_vlocation_reap);
- schedule_delayed_work(&afs_vlocation_reap,
- delay);
+ queue_delayed_work(afs_wq, &afs_vlocation_reap,
+ delay);
}
break;
}
@@ -620,7 +620,7 @@ void afs_vlocation_purge(void)
destroy_workqueue(afs_vlocation_update_worker);
cancel_delayed_work(&afs_vlocation_reap);
- schedule_delayed_work(&afs_vlocation_reap, 0);
+ queue_delayed_work(afs_wq, &afs_vlocation_reap, 0);
}
/*
diff --git a/fs/aio.c b/fs/aio.c
index 8c8f6c5b6d79..fc557a3be0a9 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -87,7 +87,7 @@ static int __init aio_setup(void)
aio_wq = create_workqueue("aio");
abe_pool = mempool_create_kmalloc_pool(1, sizeof(struct aio_batch_entry));
- BUG_ON(!abe_pool);
+ BUG_ON(!aio_wq || !abe_pool);
pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page));
@@ -798,29 +798,12 @@ static void aio_queue_work(struct kioctx * ctx)
queue_delayed_work(aio_wq, &ctx->wq, timeout);
}
-
-/*
- * aio_run_iocbs:
- * Process all pending retries queued on the ioctx
- * run list.
- * Assumes it is operating within the aio issuer's mm
- * context.
- */
-static inline void aio_run_iocbs(struct kioctx *ctx)
-{
- int requeue;
-
- spin_lock_irq(&ctx->ctx_lock);
-
- requeue = __aio_run_iocbs(ctx);
- spin_unlock_irq(&ctx->ctx_lock);
- if (requeue)
- aio_queue_work(ctx);
-}
-
/*
- * just like aio_run_iocbs, but keeps running them until
- * the list stays empty
+ * aio_run_all_iocbs:
+ * Process all pending retries queued on the ioctx
+ * run list, and keep running them until the list
+ * stays empty.
+ * Assumes it is operating within the aio issuer's mm context.
*/
static inline void aio_run_all_iocbs(struct kioctx *ctx)
{
@@ -1839,7 +1822,7 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
long ret = -EINVAL;
if (likely(ioctx)) {
- if (likely(min_nr <= nr && min_nr >= 0 && nr >= 0))
+ if (likely(min_nr <= nr && min_nr >= 0))
ret = read_events(ioctx, min_nr, nr, events, timeout);
put_ioctx(ioctx);
}
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 5fd38112a6ca..c5567cb78432 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -26,12 +26,6 @@ static struct vfsmount *anon_inode_mnt __read_mostly;
static struct inode *anon_inode_inode;
static const struct file_operations anon_inode_fops;
-static struct dentry *anon_inodefs_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
-{
- return mount_pseudo(fs_type, "anon_inode:", NULL, ANON_INODE_FS_MAGIC);
-}
-
/*
* anon_inodefs_dname() is called from d_path().
*/
@@ -41,14 +35,22 @@ static char *anon_inodefs_dname(struct dentry *dentry, char *buffer, int buflen)
dentry->d_name.name);
}
+static const struct dentry_operations anon_inodefs_dentry_operations = {
+ .d_dname = anon_inodefs_dname,
+};
+
+static struct dentry *anon_inodefs_mount(struct file_system_type *fs_type,
+ int flags, const char *dev_name, void *data)
+{
+ return mount_pseudo(fs_type, "anon_inode:", NULL,
+ &anon_inodefs_dentry_operations, ANON_INODE_FS_MAGIC);
+}
+
static struct file_system_type anon_inode_fs_type = {
.name = "anon_inodefs",
.mount = anon_inodefs_mount,
.kill_sb = kill_anon_super,
};
-static const struct dentry_operations anon_inodefs_dentry_operations = {
- .d_dname = anon_inodefs_dname,
-};
/*
* nop .set_page_dirty method so that people can use .page_mkwrite on
@@ -64,9 +66,9 @@ static const struct address_space_operations anon_aops = {
};
/**
- * anon_inode_getfd - creates a new file instance by hooking it up to an
- * anonymous inode, and a dentry that describe the "class"
- * of the file
+ * anon_inode_getfile - creates a new file instance by hooking it up to an
+ * anonymous inode, and a dentry that describes the "class"
+ * of the file
*
* @name: [in] name of the "class" of the new file
* @fops: [in] file operations for the new file
@@ -113,7 +115,6 @@ struct file *anon_inode_getfile(const char *name,
*/
ihold(anon_inode_inode);
- d_set_d_op(path.dentry, &anon_inodefs_dentry_operations);
d_instantiate(path.dentry, anon_inode_inode);
error = -ENFILE;
@@ -232,7 +233,7 @@ static int __init anon_inode_init(void)
return 0;
err_mntput:
- mntput_long(anon_inode_mnt);
+ mntput(anon_inode_mnt);
err_unregister_filesystem:
unregister_filesystem(&anon_inode_fs_type);
err_exit:
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 0fffe1c24cec..54f923792728 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -88,18 +88,9 @@ struct autofs_info {
uid_t uid;
gid_t gid;
-
- mode_t mode;
- size_t size;
-
- void (*free)(struct autofs_info *);
- union {
- const char *symlink;
- } u;
};
#define AUTOFS_INF_EXPIRING (1<<0) /* dentry is in the process of expiring */
-#define AUTOFS_INF_MOUNTPOINT (1<<1) /* mountpoint status for direct expire */
#define AUTOFS_INF_PENDING (1<<2) /* dentry pending mount */
struct autofs_wait_queue {
@@ -176,14 +167,7 @@ static inline int autofs4_ispending(struct dentry *dentry)
return 0;
}
-static inline void autofs4_copy_atime(struct file *src, struct file *dst)
-{
- dst->f_path.dentry->d_inode->i_atime =
- src->f_path.dentry->d_inode->i_atime;
- return;
-}
-
-struct inode *autofs4_get_inode(struct super_block *, struct autofs_info *);
+struct inode *autofs4_get_inode(struct super_block *, mode_t);
void autofs4_free_ino(struct autofs_info *);
/* Expiration */
@@ -212,16 +196,89 @@ void autofs_dev_ioctl_exit(void);
extern const struct inode_operations autofs4_symlink_inode_operations;
extern const struct inode_operations autofs4_dir_inode_operations;
-extern const struct inode_operations autofs4_root_inode_operations;
-extern const struct inode_operations autofs4_indirect_root_inode_operations;
-extern const struct inode_operations autofs4_direct_root_inode_operations;
extern const struct file_operations autofs4_dir_operations;
extern const struct file_operations autofs4_root_operations;
+extern const struct dentry_operations autofs4_dentry_operations;
+
+/* VFS automount flags management functions */
+
+static inline void __managed_dentry_set_automount(struct dentry *dentry)
+{
+ dentry->d_flags |= DCACHE_NEED_AUTOMOUNT;
+}
+
+static inline void managed_dentry_set_automount(struct dentry *dentry)
+{
+ spin_lock(&dentry->d_lock);
+ __managed_dentry_set_automount(dentry);
+ spin_unlock(&dentry->d_lock);
+}
+
+static inline void __managed_dentry_clear_automount(struct dentry *dentry)
+{
+ dentry->d_flags &= ~DCACHE_NEED_AUTOMOUNT;
+}
+
+static inline void managed_dentry_clear_automount(struct dentry *dentry)
+{
+ spin_lock(&dentry->d_lock);
+ __managed_dentry_clear_automount(dentry);
+ spin_unlock(&dentry->d_lock);
+}
+
+static inline void __managed_dentry_set_transit(struct dentry *dentry)
+{
+ dentry->d_flags |= DCACHE_MANAGE_TRANSIT;
+}
+
+static inline void managed_dentry_set_transit(struct dentry *dentry)
+{
+ spin_lock(&dentry->d_lock);
+ __managed_dentry_set_transit(dentry);
+ spin_unlock(&dentry->d_lock);
+}
+
+static inline void __managed_dentry_clear_transit(struct dentry *dentry)
+{
+ dentry->d_flags &= ~DCACHE_MANAGE_TRANSIT;
+}
+
+static inline void managed_dentry_clear_transit(struct dentry *dentry)
+{
+ spin_lock(&dentry->d_lock);
+ __managed_dentry_clear_transit(dentry);
+ spin_unlock(&dentry->d_lock);
+}
+
+static inline void __managed_dentry_set_managed(struct dentry *dentry)
+{
+ dentry->d_flags |= (DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT);
+}
+
+static inline void managed_dentry_set_managed(struct dentry *dentry)
+{
+ spin_lock(&dentry->d_lock);
+ __managed_dentry_set_managed(dentry);
+ spin_unlock(&dentry->d_lock);
+}
+
+static inline void __managed_dentry_clear_managed(struct dentry *dentry)
+{
+ dentry->d_flags &= ~(DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT);
+}
+
+static inline void managed_dentry_clear_managed(struct dentry *dentry)
+{
+ spin_lock(&dentry->d_lock);
+ __managed_dentry_clear_managed(dentry);
+ spin_unlock(&dentry->d_lock);
+}
/* Initializing function */
int autofs4_fill_super(struct super_block *, void *, int);
-struct autofs_info *autofs4_init_ino(struct autofs_info *, struct autofs_sb_info *sbi, mode_t mode);
+struct autofs_info *autofs4_new_ino(struct autofs_sb_info *);
+void autofs4_clean_ino(struct autofs_info *);
/* Queue management functions */
@@ -229,19 +286,6 @@ int autofs4_wait(struct autofs_sb_info *,struct dentry *, enum autofs_notify);
int autofs4_wait_release(struct autofs_sb_info *,autofs_wqt_t,int);
void autofs4_catatonic_mode(struct autofs_sb_info *);
-static inline int autofs4_follow_mount(struct path *path)
-{
- int res = 0;
-
- while (d_mountpoint(path->dentry)) {
- int followed = follow_down(path);
- if (!followed)
- break;
- res = 1;
- }
- return res;
-}
-
static inline u32 autofs4_get_dev(struct autofs_sb_info *sbi)
{
return new_encode_dev(sbi->sb->s_dev);
@@ -294,5 +338,4 @@ static inline void autofs4_del_expiring(struct dentry *dentry)
return;
}
-void autofs4_dentry_release(struct dentry *);
extern void autofs4_kill_sb(struct super_block *);
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index eff9a419469a..1442da4860e5 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -551,7 +551,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
err = have_submounts(path.dentry);
- if (follow_down(&path))
+ if (follow_down_one(&path))
magic = path.mnt->mnt_sb->s_magic;
}
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index cc1d01365905..f43100b9662b 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -26,10 +26,6 @@ static inline int autofs4_can_expire(struct dentry *dentry,
if (ino == NULL)
return 0;
- /* No point expiring a pending mount */
- if (ino->flags & AUTOFS_INF_PENDING)
- return 0;
-
if (!do_now) {
/* Too young to die */
if (!timeout || time_after(ino->last_used + timeout, now))
@@ -56,7 +52,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
path_get(&path);
- if (!follow_down(&path))
+ if (!follow_down_one(&path))
goto done;
if (is_autofs4_dentry(path.dentry)) {
@@ -100,7 +96,7 @@ static struct dentry *get_next_positive_dentry(struct dentry *prev,
struct dentry *p, *ret;
if (prev == NULL)
- return dget(prev);
+ return dget(root);
spin_lock(&autofs4_lock);
relock:
@@ -137,7 +133,7 @@ again:
spin_lock_nested(&ret->d_lock, DENTRY_D_LOCK_NESTED);
/* Negative dentry - try next */
if (!simple_positive(ret)) {
- spin_unlock(&ret->d_lock);
+ spin_unlock(&p->d_lock);
p = ret;
goto again;
}
@@ -283,6 +279,7 @@ struct dentry *autofs4_expire_direct(struct super_block *sb,
unsigned long timeout;
struct dentry *root = dget(sb->s_root);
int do_now = how & AUTOFS_EXP_IMMEDIATE;
+ struct autofs_info *ino;
if (!root)
return NULL;
@@ -291,19 +288,21 @@ struct dentry *autofs4_expire_direct(struct super_block *sb,
timeout = sbi->exp_timeout;
spin_lock(&sbi->fs_lock);
+ ino = autofs4_dentry_ino(root);
+ /* No point expiring a pending mount */
+ if (ino->flags & AUTOFS_INF_PENDING) {
+ spin_unlock(&sbi->fs_lock);
+ return NULL;
+ }
+ managed_dentry_set_transit(root);
if (!autofs4_direct_busy(mnt, root, timeout, do_now)) {
struct autofs_info *ino = autofs4_dentry_ino(root);
- if (d_mountpoint(root)) {
- ino->flags |= AUTOFS_INF_MOUNTPOINT;
- spin_lock(&root->d_lock);
- root->d_flags &= ~DCACHE_MOUNTED;
- spin_unlock(&root->d_lock);
- }
ino->flags |= AUTOFS_INF_EXPIRING;
init_completion(&ino->expire_complete);
spin_unlock(&sbi->fs_lock);
return root;
}
+ managed_dentry_clear_transit(root);
spin_unlock(&sbi->fs_lock);
dput(root);
@@ -340,6 +339,10 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
while ((dentry = get_next_positive_dentry(dentry, root))) {
spin_lock(&sbi->fs_lock);
ino = autofs4_dentry_ino(dentry);
+ /* No point expiring a pending mount */
+ if (ino->flags & AUTOFS_INF_PENDING)
+ goto cont;
+ managed_dentry_set_transit(dentry);
/*
* Case 1: (i) indirect mount or top level pseudo direct mount
@@ -399,6 +402,8 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
}
}
next:
+ managed_dentry_clear_transit(dentry);
+cont:
spin_unlock(&sbi->fs_lock);
}
return NULL;
@@ -479,6 +484,8 @@ int autofs4_expire_run(struct super_block *sb,
spin_lock(&sbi->fs_lock);
ino = autofs4_dentry_ino(dentry);
ino->flags &= ~AUTOFS_INF_EXPIRING;
+ if (!d_unhashed(dentry))
+ managed_dentry_clear_transit(dentry);
complete_all(&ino->expire_complete);
spin_unlock(&sbi->fs_lock);
@@ -504,18 +511,18 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
ret = autofs4_wait(sbi, dentry, NFY_EXPIRE);
spin_lock(&sbi->fs_lock);
- if (ino->flags & AUTOFS_INF_MOUNTPOINT) {
- spin_lock(&sb->s_root->d_lock);
- /*
- * If we haven't been expired away, then reset
- * mounted status.
- */
- if (mnt->mnt_parent != mnt)
- sb->s_root->d_flags |= DCACHE_MOUNTED;
- spin_unlock(&sb->s_root->d_lock);
- ino->flags &= ~AUTOFS_INF_MOUNTPOINT;
- }
ino->flags &= ~AUTOFS_INF_EXPIRING;
+ spin_lock(&dentry->d_lock);
+ if (ret)
+ __managed_dentry_clear_transit(dentry);
+ else {
+ if ((IS_ROOT(dentry) ||
+ (autofs_type_indirect(sbi->type) &&
+ IS_ROOT(dentry->d_parent))) &&
+ !(dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
+ __managed_dentry_set_automount(dentry);
+ }
+ spin_unlock(&dentry->d_lock);
complete_all(&ino->expire_complete);
spin_unlock(&sbi->fs_lock);
dput(dentry);
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index a7bdb9dcac84..180fa2425e49 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -22,77 +22,27 @@
#include "autofs_i.h"
#include <linux/module.h>
-static void ino_lnkfree(struct autofs_info *ino)
+struct autofs_info *autofs4_new_ino(struct autofs_sb_info *sbi)
{
- if (ino->u.symlink) {
- kfree(ino->u.symlink);
- ino->u.symlink = NULL;
- }
-}
-
-struct autofs_info *autofs4_init_ino(struct autofs_info *ino,
- struct autofs_sb_info *sbi, mode_t mode)
-{
- int reinit = 1;
-
- if (ino == NULL) {
- reinit = 0;
- ino = kmalloc(sizeof(*ino), GFP_KERNEL);
- }
-
- if (ino == NULL)
- return NULL;
-
- if (!reinit) {
- ino->flags = 0;
- ino->inode = NULL;
- ino->dentry = NULL;
- ino->size = 0;
+ struct autofs_info *ino = kzalloc(sizeof(*ino), GFP_KERNEL);
+ if (ino) {
INIT_LIST_HEAD(&ino->active);
- ino->active_count = 0;
INIT_LIST_HEAD(&ino->expiring);
- atomic_set(&ino->count, 0);
+ ino->last_used = jiffies;
+ ino->sbi = sbi;
}
+ return ino;
+}
+void autofs4_clean_ino(struct autofs_info *ino)
+{
ino->uid = 0;
ino->gid = 0;
- ino->mode = mode;
ino->last_used = jiffies;
-
- ino->sbi = sbi;
-
- if (reinit && ino->free)
- (ino->free)(ino);
-
- memset(&ino->u, 0, sizeof(ino->u));
-
- ino->free = NULL;
-
- if (S_ISLNK(mode))
- ino->free = ino_lnkfree;
-
- return ino;
}
void autofs4_free_ino(struct autofs_info *ino)
{
- struct autofs_info *p_ino;
-
- if (ino->dentry) {
- ino->dentry->d_fsdata = NULL;
- if (ino->dentry->d_inode) {
- struct dentry *parent = ino->dentry->d_parent;
- if (atomic_dec_and_test(&ino->count)) {
- p_ino = autofs4_dentry_ino(parent);
- if (p_ino && parent != ino->dentry)
- atomic_dec(&p_ino->count);
- }
- dput(ino->dentry);
- }
- ino->dentry = NULL;
- }
- if (ino->free)
- (ino->free)(ino);
kfree(ino);
}
@@ -148,9 +98,16 @@ static int autofs4_show_options(struct seq_file *m, struct vfsmount *mnt)
return 0;
}
+static void autofs4_evict_inode(struct inode *inode)
+{
+ end_writeback(inode);
+ kfree(inode->i_private);
+}
+
static const struct super_operations autofs4_sops = {
.statfs = simple_statfs,
.show_options = autofs4_show_options,
+ .evict_inode = autofs4_evict_inode,
};
enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto,
@@ -240,21 +197,6 @@ static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid,
return (*pipefd < 0);
}
-static struct autofs_info *autofs4_mkroot(struct autofs_sb_info *sbi)
-{
- struct autofs_info *ino;
-
- ino = autofs4_init_ino(NULL, sbi, S_IFDIR | 0755);
- if (!ino)
- return NULL;
-
- return ino;
-}
-
-static const struct dentry_operations autofs4_sb_dentry_operations = {
- .d_release = autofs4_dentry_release,
-};
-
int autofs4_fill_super(struct super_block *s, void *data, int silent)
{
struct inode * root_inode;
@@ -292,15 +234,16 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
s->s_blocksize_bits = 10;
s->s_magic = AUTOFS_SUPER_MAGIC;
s->s_op = &autofs4_sops;
+ s->s_d_op = &autofs4_dentry_operations;
s->s_time_gran = 1;
/*
* Get the root inode and dentry, but defer checking for errors.
*/
- ino = autofs4_mkroot(sbi);
+ ino = autofs4_new_ino(sbi);
if (!ino)
goto fail_free;
- root_inode = autofs4_get_inode(s, ino);
+ root_inode = autofs4_get_inode(s, S_IFDIR | 0755);
if (!root_inode)
goto fail_ino;
@@ -309,7 +252,6 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
goto fail_iput;
pipe = NULL;
- d_set_d_op(root, &autofs4_sb_dentry_operations);
root->d_fsdata = ino;
/* Can this call block? */
@@ -320,10 +262,11 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
goto fail_dput;
}
+ if (autofs_type_trigger(sbi->type))
+ __managed_dentry_set_managed(root);
+
root_inode->i_fop = &autofs4_root_operations;
- root_inode->i_op = autofs_type_trigger(sbi->type) ?
- &autofs4_direct_root_inode_operations :
- &autofs4_indirect_root_inode_operations;
+ root_inode->i_op = &autofs4_dir_inode_operations;
/* Couldn't this be tested earlier? */
if (sbi->max_proto < AUTOFS_MIN_PROTO_VERSION ||
@@ -383,16 +326,14 @@ fail_unlock:
return -EINVAL;
}
-struct inode *autofs4_get_inode(struct super_block *sb,
- struct autofs_info *inf)
+struct inode *autofs4_get_inode(struct super_block *sb, mode_t mode)
{
struct inode *inode = new_inode(sb);
if (inode == NULL)
return NULL;
- inf->inode = inode;
- inode->i_mode = inf->mode;
+ inode->i_mode = mode;
if (sb->s_root) {
inode->i_uid = sb->s_root->d_inode->i_uid;
inode->i_gid = sb->s_root->d_inode->i_gid;
@@ -400,12 +341,11 @@ struct inode *autofs4_get_inode(struct super_block *sb,
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
inode->i_ino = get_next_ino();
- if (S_ISDIR(inf->mode)) {
+ if (S_ISDIR(mode)) {
inode->i_nlink = 2;
inode->i_op = &autofs4_dir_inode_operations;
inode->i_fop = &autofs4_dir_operations;
- } else if (S_ISLNK(inf->mode)) {
- inode->i_size = inf->size;
+ } else if (S_ISLNK(mode)) {
inode->i_op = &autofs4_symlink_inode_operations;
}
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 651e4ef563b1..014e7aba3b08 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -35,10 +35,9 @@ static long autofs4_root_compat_ioctl(struct file *,unsigned int,unsigned long);
#endif
static int autofs4_dir_open(struct inode *inode, struct file *file);
static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *);
-static void *autofs4_follow_link(struct dentry *, struct nameidata *);
-
-#define TRIGGER_FLAGS (LOOKUP_CONTINUE | LOOKUP_DIRECTORY)
-#define TRIGGER_INTENTS (LOOKUP_OPEN | LOOKUP_CREATE)
+static struct vfsmount *autofs4_d_automount(struct path *);
+static int autofs4_d_manage(struct dentry *, bool, bool);
+static void autofs4_dentry_release(struct dentry *);
const struct file_operations autofs4_root_operations = {
.open = dcache_dir_open,
@@ -60,7 +59,7 @@ const struct file_operations autofs4_dir_operations = {
.llseek = dcache_dir_lseek,
};
-const struct inode_operations autofs4_indirect_root_inode_operations = {
+const struct inode_operations autofs4_dir_inode_operations = {
.lookup = autofs4_lookup,
.unlink = autofs4_dir_unlink,
.symlink = autofs4_dir_symlink,
@@ -68,20 +67,10 @@ const struct inode_operations autofs4_indirect_root_inode_operations = {
.rmdir = autofs4_dir_rmdir,
};
-const struct inode_operations autofs4_direct_root_inode_operations = {
- .lookup = autofs4_lookup,
- .unlink = autofs4_dir_unlink,
- .mkdir = autofs4_dir_mkdir,
- .rmdir = autofs4_dir_rmdir,
- .follow_link = autofs4_follow_link,
-};
-
-const struct inode_operations autofs4_dir_inode_operations = {
- .lookup = autofs4_lookup,
- .unlink = autofs4_dir_unlink,
- .symlink = autofs4_dir_symlink,
- .mkdir = autofs4_dir_mkdir,
- .rmdir = autofs4_dir_rmdir,
+const struct dentry_operations autofs4_dentry_operations = {
+ .d_automount = autofs4_d_automount,
+ .d_manage = autofs4_d_manage,
+ .d_release = autofs4_dentry_release,
};
static void autofs4_add_active(struct dentry *dentry)
@@ -116,14 +105,6 @@ static void autofs4_del_active(struct dentry *dentry)
return;
}
-static unsigned int autofs4_need_mount(unsigned int flags)
-{
- unsigned int res = 0;
- if (flags & (TRIGGER_FLAGS | TRIGGER_INTENTS))
- res = 1;
- return res;
-}
-
static int autofs4_dir_open(struct inode *inode, struct file *file)
{
struct dentry *dentry = file->f_path.dentry;
@@ -158,278 +139,27 @@ out:
return dcache_dir_open(inode, file);
}
-static int try_to_fill_dentry(struct dentry *dentry, int flags)
-{
- struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
- struct autofs_info *ino = autofs4_dentry_ino(dentry);
- int status;
-
- DPRINTK("dentry=%p %.*s ino=%p",
- dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
-
- /*
- * Wait for a pending mount, triggering one if there
- * isn't one already
- */
- if (dentry->d_inode == NULL) {
- DPRINTK("waiting for mount name=%.*s",
- dentry->d_name.len, dentry->d_name.name);
-
- status = autofs4_wait(sbi, dentry, NFY_MOUNT);
-
- DPRINTK("mount done status=%d", status);
-
- /* Turn this into a real negative dentry? */
- if (status == -ENOENT) {
- spin_lock(&sbi->fs_lock);
- ino->flags &= ~AUTOFS_INF_PENDING;
- spin_unlock(&sbi->fs_lock);
- return status;
- } else if (status) {
- /* Return a negative dentry, but leave it "pending" */
- return status;
- }
- /* Trigger mount for path component or follow link */
- } else if (ino->flags & AUTOFS_INF_PENDING ||
- autofs4_need_mount(flags)) {
- DPRINTK("waiting for mount name=%.*s",
- dentry->d_name.len, dentry->d_name.name);
-
- spin_lock(&sbi->fs_lock);
- ino->flags |= AUTOFS_INF_PENDING;
- spin_unlock(&sbi->fs_lock);
- status = autofs4_wait(sbi, dentry, NFY_MOUNT);
-
- DPRINTK("mount done status=%d", status);
-
- if (status) {
- spin_lock(&sbi->fs_lock);
- ino->flags &= ~AUTOFS_INF_PENDING;
- spin_unlock(&sbi->fs_lock);
- return status;
- }
- }
-
- /* Initialize expiry counter after successful mount */
- ino->last_used = jiffies;
-
- spin_lock(&sbi->fs_lock);
- ino->flags &= ~AUTOFS_INF_PENDING;
- spin_unlock(&sbi->fs_lock);
-
- return 0;
-}
-
-/* For autofs direct mounts the follow link triggers the mount */
-static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
- struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
- struct autofs_info *ino = autofs4_dentry_ino(dentry);
- int oz_mode = autofs4_oz_mode(sbi);
- unsigned int lookup_type;
- int status;
-
- DPRINTK("dentry=%p %.*s oz_mode=%d nd->flags=%d",
- dentry, dentry->d_name.len, dentry->d_name.name, oz_mode,
- nd->flags);
- /*
- * For an expire of a covered direct or offset mount we need
- * to break out of follow_down() at the autofs mount trigger
- * (d_mounted--), so we can see the expiring flag, and manage
- * the blocking and following here until the expire is completed.
- */
- if (oz_mode) {
- spin_lock(&sbi->fs_lock);
- if (ino->flags & AUTOFS_INF_EXPIRING) {
- spin_unlock(&sbi->fs_lock);
- /* Follow down to our covering mount. */
- if (!follow_down(&nd->path))
- goto done;
- goto follow;
- }
- spin_unlock(&sbi->fs_lock);
- goto done;
- }
-
- /* If an expire request is pending everyone must wait. */
- autofs4_expire_wait(dentry);
-
- /* We trigger a mount for almost all flags */
- lookup_type = autofs4_need_mount(nd->flags);
- spin_lock(&sbi->fs_lock);
- spin_lock(&autofs4_lock);
- spin_lock(&dentry->d_lock);
- if (!(lookup_type || ino->flags & AUTOFS_INF_PENDING)) {
- spin_unlock(&dentry->d_lock);
- spin_unlock(&autofs4_lock);
- spin_unlock(&sbi->fs_lock);
- goto follow;
- }
-
- /*
- * If the dentry contains directories then it is an autofs
- * multi-mount with no root mount offset. So don't try to
- * mount it again.
- */
- if (ino->flags & AUTOFS_INF_PENDING ||
- (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs))) {
- spin_unlock(&dentry->d_lock);
- spin_unlock(&autofs4_lock);
- spin_unlock(&sbi->fs_lock);
-
- status = try_to_fill_dentry(dentry, nd->flags);
- if (status)
- goto out_error;
-
- goto follow;
- }
- spin_unlock(&dentry->d_lock);
- spin_unlock(&autofs4_lock);
- spin_unlock(&sbi->fs_lock);
-follow:
- /*
- * If there is no root mount it must be an autofs
- * multi-mount with no root offset so we don't need
- * to follow it.
- */
- if (d_mountpoint(dentry)) {
- if (!autofs4_follow_mount(&nd->path)) {
- status = -ENOENT;
- goto out_error;
- }
- }
-
-done:
- return NULL;
-
-out_error:
- path_put(&nd->path);
- return ERR_PTR(status);
-}
-
-/*
- * Revalidate is called on every cache lookup. Some of those
- * cache lookups may actually happen while the dentry is not
- * yet completely filled in, and revalidate has to delay such
- * lookups..
- */
-static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd)
+static void autofs4_dentry_release(struct dentry *de)
{
- struct inode *dir;
- struct autofs_sb_info *sbi;
- int oz_mode;
- int flags = nd ? nd->flags : 0;
- int status = 1;
-
- if (flags & LOOKUP_RCU)
- return -ECHILD;
-
- dir = dentry->d_parent->d_inode;
- sbi = autofs4_sbi(dir->i_sb);
- oz_mode = autofs4_oz_mode(sbi);
-
- /* Pending dentry */
- spin_lock(&sbi->fs_lock);
- if (autofs4_ispending(dentry)) {
- /* The daemon never causes a mount to trigger */
- spin_unlock(&sbi->fs_lock);
-
- if (oz_mode)
- return 1;
-
- /*
- * If the directory has gone away due to an expire
- * we have been called as ->d_revalidate() and so
- * we need to return false and proceed to ->lookup().
- */
- if (autofs4_expire_wait(dentry) == -EAGAIN)
- return 0;
-
- /*
- * A zero status is success otherwise we have a
- * negative error code.
- */
- status = try_to_fill_dentry(dentry, flags);
- if (status == 0)
- return 1;
-
- return status;
- }
- spin_unlock(&sbi->fs_lock);
-
- /* Negative dentry.. invalidate if "old" */
- if (dentry->d_inode == NULL)
- return 0;
-
- /* Check for a non-mountpoint directory with no contents */
- spin_lock(&autofs4_lock);
- spin_lock(&dentry->d_lock);
- if (S_ISDIR(dentry->d_inode->i_mode) &&
- !d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
- DPRINTK("dentry=%p %.*s, emptydir",
- dentry, dentry->d_name.len, dentry->d_name.name);
- spin_unlock(&dentry->d_lock);
- spin_unlock(&autofs4_lock);
-
- /* The daemon never causes a mount to trigger */
- if (oz_mode)
- return 1;
-
- /*
- * A zero status is success otherwise we have a
- * negative error code.
- */
- status = try_to_fill_dentry(dentry, flags);
- if (status == 0)
- return 1;
-
- return status;
- }
- spin_unlock(&dentry->d_lock);
- spin_unlock(&autofs4_lock);
-
- return 1;
-}
-
-void autofs4_dentry_release(struct dentry *de)
-{
- struct autofs_info *inf;
+ struct autofs_info *ino = autofs4_dentry_ino(de);
+ struct autofs_sb_info *sbi = autofs4_sbi(de->d_sb);
DPRINTK("releasing %p", de);
- inf = autofs4_dentry_ino(de);
- de->d_fsdata = NULL;
-
- if (inf) {
- struct autofs_sb_info *sbi = autofs4_sbi(de->d_sb);
-
- if (sbi) {
- spin_lock(&sbi->lookup_lock);
- if (!list_empty(&inf->active))
- list_del(&inf->active);
- if (!list_empty(&inf->expiring))
- list_del(&inf->expiring);
- spin_unlock(&sbi->lookup_lock);
- }
-
- inf->dentry = NULL;
- inf->inode = NULL;
+ if (!ino)
+ return;
- autofs4_free_ino(inf);
+ if (sbi) {
+ spin_lock(&sbi->lookup_lock);
+ if (!list_empty(&ino->active))
+ list_del(&ino->active);
+ if (!list_empty(&ino->expiring))
+ list_del(&ino->expiring);
+ spin_unlock(&sbi->lookup_lock);
}
-}
-/* For dentries of directories in the root dir */
-static const struct dentry_operations autofs4_root_dentry_operations = {
- .d_revalidate = autofs4_revalidate,
- .d_release = autofs4_dentry_release,
-};
-
-/* For other dentries */
-static const struct dentry_operations autofs4_dentry_operations = {
- .d_revalidate = autofs4_revalidate,
- .d_release = autofs4_dentry_release,
-};
+ autofs4_free_ino(ino);
+}
static struct dentry *autofs4_lookup_active(struct dentry *dentry)
{
@@ -541,51 +271,246 @@ next:
return NULL;
}
+static int autofs4_mount_wait(struct dentry *dentry)
+{
+ struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+ struct autofs_info *ino = autofs4_dentry_ino(dentry);
+ int status;
+
+ if (ino->flags & AUTOFS_INF_PENDING) {
+ DPRINTK("waiting for mount name=%.*s",
+ dentry->d_name.len, dentry->d_name.name);
+ status = autofs4_wait(sbi, dentry, NFY_MOUNT);
+ DPRINTK("mount wait done status=%d", status);
+ ino->last_used = jiffies;
+ return status;
+ }
+ return 0;
+}
+
+static int do_expire_wait(struct dentry *dentry)
+{
+ struct dentry *expiring;
+
+ expiring = autofs4_lookup_expiring(dentry);
+ if (!expiring)
+ return autofs4_expire_wait(dentry);
+ else {
+ /*
+ * If we are racing with expire the request might not
+ * be quite complete, but the directory has been removed
+ * so it must have been successful, just wait for it.
+ */
+ autofs4_expire_wait(expiring);
+ autofs4_del_expiring(expiring);
+ dput(expiring);
+ }
+ return 0;
+}
+
+static struct dentry *autofs4_mountpoint_changed(struct path *path)
+{
+ struct dentry *dentry = path->dentry;
+ struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+
+ /*
+ * If this is an indirect mount the dentry could have gone away
+ * as a result of an expire and a new one created.
+ */
+ if (autofs_type_indirect(sbi->type) && d_unhashed(dentry)) {
+ struct dentry *parent = dentry->d_parent;
+ struct dentry *new = d_lookup(parent, &dentry->d_name);
+ if (!new)
+ return NULL;
+ dput(path->dentry);
+ path->dentry = new;
+ }
+ return path->dentry;
+}
+
+static struct vfsmount *autofs4_d_automount(struct path *path)
+{
+ struct dentry *dentry = path->dentry;
+ struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+ struct autofs_info *ino = autofs4_dentry_ino(dentry);
+ int status;
+
+ DPRINTK("dentry=%p %.*s",
+ dentry, dentry->d_name.len, dentry->d_name.name);
+
+ /*
+ * Someone may have manually umounted this or it was a submount
+ * that has gone away.
+ */
+ spin_lock(&dentry->d_lock);
+ if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
+ if (!(dentry->d_flags & DCACHE_MANAGE_TRANSIT) &&
+ (dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
+ __managed_dentry_set_transit(path->dentry);
+ }
+ spin_unlock(&dentry->d_lock);
+
+ /* The daemon never triggers a mount. */
+ if (autofs4_oz_mode(sbi))
+ return NULL;
+
+ /*
+ * If an expire request is pending everyone must wait.
+ * If the expire fails we're still mounted so continue
+ * the follow and return. A return of -EAGAIN (which only
+ * happens with indirect mounts) means the expire completed
+ * and the directory was removed, so just go ahead and try
+ * the mount.
+ */
+ status = do_expire_wait(dentry);
+ if (status && status != -EAGAIN)
+ return NULL;
+
+ /* Callback to the daemon to perform the mount or wait */
+ spin_lock(&sbi->fs_lock);
+ if (ino->flags & AUTOFS_INF_PENDING) {
+ spin_unlock(&sbi->fs_lock);
+ status = autofs4_mount_wait(dentry);
+ if (status)
+ return ERR_PTR(status);
+ spin_lock(&sbi->fs_lock);
+ goto done;
+ }
+
+ /*
+ * If the dentry is a symlink it's equivalent to a directory
+ * having d_mountpoint() true, so there's no need to call back
+ * to the daemon.
+ */
+ if (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode))
+ goto done;
+ if (!d_mountpoint(dentry)) {
+ /*
+ * It's possible that user space hasn't removed directories
+ * after umounting a rootless multi-mount, although it
+ * should. For v5 have_submounts() is sufficient to handle
+ * this because the leaves of the directory tree under the
+ * mount never trigger mounts themselves (they have an autofs
+ * trigger mount mounted on them). But v4 pseudo direct mounts
+	 * do need the leaves to trigger mounts. In this case we
+	 * have no choice but to use the list_empty() check and
+	 * require user space to behave.
+ */
+ if (sbi->version > 4) {
+ if (have_submounts(dentry))
+ goto done;
+ } else {
+ spin_lock(&dentry->d_lock);
+ if (!list_empty(&dentry->d_subdirs)) {
+ spin_unlock(&dentry->d_lock);
+ goto done;
+ }
+ spin_unlock(&dentry->d_lock);
+ }
+ ino->flags |= AUTOFS_INF_PENDING;
+ spin_unlock(&sbi->fs_lock);
+ status = autofs4_mount_wait(dentry);
+ if (status)
+ return ERR_PTR(status);
+ spin_lock(&sbi->fs_lock);
+ ino->flags &= ~AUTOFS_INF_PENDING;
+ }
+done:
+ if (!(ino->flags & AUTOFS_INF_EXPIRING)) {
+ /*
+ * Any needed mounting has been completed and the path updated
+ * so turn this into a normal dentry so we don't continually
+ * call ->d_automount() and ->d_manage().
+ */
+ spin_lock(&dentry->d_lock);
+ __managed_dentry_clear_transit(dentry);
+ /*
+ * Only clear DMANAGED_AUTOMOUNT for rootless multi-mounts and
+ * symlinks as in all other cases the dentry will be covered by
+ * an actual mount so ->d_automount() won't be called during
+ * the follow.
+ */
+ if ((!d_mountpoint(dentry) &&
+ !list_empty(&dentry->d_subdirs)) ||
+ (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)))
+ __managed_dentry_clear_automount(dentry);
+ spin_unlock(&dentry->d_lock);
+ }
+ spin_unlock(&sbi->fs_lock);
+
+ /* Mount succeeded, check if we ended up with a new dentry */
+ dentry = autofs4_mountpoint_changed(path);
+ if (!dentry)
+ return ERR_PTR(-ENOENT);
+
+ return NULL;
+}
+
+int autofs4_d_manage(struct dentry *dentry, bool mounting_here, bool rcu_walk)
+{
+ struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+
+ DPRINTK("dentry=%p %.*s",
+ dentry, dentry->d_name.len, dentry->d_name.name);
+
+ /* The daemon never waits. */
+ if (autofs4_oz_mode(sbi) || mounting_here) {
+ if (!d_mountpoint(dentry))
+ return -EISDIR;
+ return 0;
+ }
+
+ /* We need to sleep, so we need pathwalk to be in ref-mode */
+ if (rcu_walk)
+ return -ECHILD;
+
+ /* Wait for pending expires */
+ do_expire_wait(dentry);
+
+ /*
+ * This dentry may be under construction so wait on mount
+ * completion.
+ */
+ return autofs4_mount_wait(dentry);
+}
+
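The pair above is the heart of the conversion: path walk calls ->d_manage() to decide whether to wait and ->d_automount() to trigger the mount. A minimal sketch, assuming the dentry_operations wiring this series performs elsewhere (the example_ name is illustrative, not from the patch):

static const struct dentry_operations example_dentry_operations = {
	.d_automount	= autofs4_d_automount,	/* run on DCACHE_NEED_AUTOMOUNT */
	.d_manage	= autofs4_d_manage,	/* run on DCACHE_MANAGE_TRANSIT */
};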
/* Lookups in the root directory */
static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
{
struct autofs_sb_info *sbi;
struct autofs_info *ino;
- struct dentry *expiring, *active;
- int oz_mode;
+ struct dentry *active;
- DPRINTK("name = %.*s",
- dentry->d_name.len, dentry->d_name.name);
+ DPRINTK("name = %.*s", dentry->d_name.len, dentry->d_name.name);
/* File name too long to exist */
if (dentry->d_name.len > NAME_MAX)
return ERR_PTR(-ENAMETOOLONG);
sbi = autofs4_sbi(dir->i_sb);
- oz_mode = autofs4_oz_mode(sbi);
DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d",
- current->pid, task_pgrp_nr(current), sbi->catatonic, oz_mode);
+ current->pid, task_pgrp_nr(current), sbi->catatonic,
+ autofs4_oz_mode(sbi));
active = autofs4_lookup_active(dentry);
if (active) {
- dentry = active;
- ino = autofs4_dentry_ino(dentry);
+ return active;
} else {
/*
- * Mark the dentry incomplete but don't hash it. We do this
- * to serialize our inode creation operations (symlink and
- * mkdir) which prevents deadlock during the callback to
- * the daemon. Subsequent user space lookups for the same
- * dentry are placed on the wait queue while the daemon
- * itself is allowed passage unresticted so the create
- * operation itself can then hash the dentry. Finally,
- * we check for the hashed dentry and return the newly
- * hashed dentry.
+ * A dentry that is not within the root can never trigger a
+ * mount operation, unless the directory already exists, so we
+	 * can fail immediately. The daemon, however, does need
+ * to create directories within the file system.
*/
- d_set_d_op(dentry, &autofs4_root_dentry_operations);
+ if (!autofs4_oz_mode(sbi) && !IS_ROOT(dentry->d_parent))
+ return ERR_PTR(-ENOENT);
- /*
- * And we need to ensure that the same dentry is used for
- * all following lookup calls until it is hashed so that
- * the dentry flags are persistent throughout the request.
- */
- ino = autofs4_init_ino(NULL, sbi, 0555);
+ /* Mark entries in the root as mount triggers */
+ if (autofs_type_indirect(sbi->type) && IS_ROOT(dentry->d_parent))
+ __managed_dentry_set_managed(dentry);
+
+ ino = autofs4_new_ino(sbi);
if (!ino)
return ERR_PTR(-ENOMEM);
@@ -596,82 +521,6 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
d_instantiate(dentry, NULL);
}
-
- if (!oz_mode) {
- mutex_unlock(&dir->i_mutex);
- expiring = autofs4_lookup_expiring(dentry);
- if (expiring) {
- /*
- * If we are racing with expire the request might not
- * be quite complete but the directory has been removed
- * so it must have been successful, so just wait for it.
- */
- autofs4_expire_wait(expiring);
- autofs4_del_expiring(expiring);
- dput(expiring);
- }
-
- spin_lock(&sbi->fs_lock);
- ino->flags |= AUTOFS_INF_PENDING;
- spin_unlock(&sbi->fs_lock);
- if (dentry->d_op && dentry->d_op->d_revalidate)
- (dentry->d_op->d_revalidate)(dentry, nd);
- mutex_lock(&dir->i_mutex);
- }
-
- /*
- * If we are still pending, check if we had to handle
- * a signal. If so we can force a restart..
- */
- if (ino->flags & AUTOFS_INF_PENDING) {
- /* See if we were interrupted */
- if (signal_pending(current)) {
- sigset_t *sigset = &current->pending.signal;
- if (sigismember (sigset, SIGKILL) ||
- sigismember (sigset, SIGQUIT) ||
- sigismember (sigset, SIGINT)) {
- if (active)
- dput(active);
- return ERR_PTR(-ERESTARTNOINTR);
- }
- }
- if (!oz_mode) {
- spin_lock(&sbi->fs_lock);
- ino->flags &= ~AUTOFS_INF_PENDING;
- spin_unlock(&sbi->fs_lock);
- }
- }
-
- /*
- * If this dentry is unhashed, then we shouldn't honour this
- * lookup. Returning ENOENT here doesn't do the right thing
- * for all system calls, but it should be OK for the operations
- * we permit from an autofs.
- */
- if (!oz_mode && d_unhashed(dentry)) {
- /*
- * A user space application can (and has done in the past)
- * remove and re-create this directory during the callback.
- * This can leave us with an unhashed dentry, but a
- * successful mount! So we need to perform another
- * cached lookup in case the dentry now exists.
- */
- struct dentry *parent = dentry->d_parent;
- struct dentry *new = d_lookup(parent, &dentry->d_name);
- if (new != NULL)
- dentry = new;
- else
- dentry = ERR_PTR(-ENOENT);
-
- if (active)
- dput(active);
-
- return dentry;
- }
-
- if (active)
- return active;
-
return NULL;
}
@@ -683,6 +532,7 @@ static int autofs4_dir_symlink(struct inode *dir,
struct autofs_info *ino = autofs4_dentry_ino(dentry);
struct autofs_info *p_ino;
struct inode *inode;
+ size_t size = strlen(symname);
char *cp;
DPRINTK("%s <- %.*s", symname,
@@ -691,45 +541,35 @@ static int autofs4_dir_symlink(struct inode *dir,
if (!autofs4_oz_mode(sbi))
return -EACCES;
- ino = autofs4_init_ino(ino, sbi, S_IFLNK | 0555);
- if (!ino)
- return -ENOMEM;
+ BUG_ON(!ino);
+
+ autofs4_clean_ino(ino);
autofs4_del_active(dentry);
- ino->size = strlen(symname);
- cp = kmalloc(ino->size + 1, GFP_KERNEL);
- if (!cp) {
- if (!dentry->d_fsdata)
- kfree(ino);
+ cp = kmalloc(size + 1, GFP_KERNEL);
+ if (!cp)
return -ENOMEM;
- }
strcpy(cp, symname);
- inode = autofs4_get_inode(dir->i_sb, ino);
+ inode = autofs4_get_inode(dir->i_sb, S_IFLNK | 0555);
if (!inode) {
kfree(cp);
if (!dentry->d_fsdata)
kfree(ino);
return -ENOMEM;
}
+ inode->i_private = cp;
+ inode->i_size = size;
d_add(dentry, inode);
- if (dir == dir->i_sb->s_root->d_inode)
- d_set_d_op(dentry, &autofs4_root_dentry_operations);
- else
- d_set_d_op(dentry, &autofs4_dentry_operations);
-
- dentry->d_fsdata = ino;
- ino->dentry = dget(dentry);
+ dget(dentry);
atomic_inc(&ino->count);
p_ino = autofs4_dentry_ino(dentry->d_parent);
if (p_ino && dentry->d_parent != dentry)
atomic_inc(&p_ino->count);
- ino->inode = inode;
- ino->u.symlink = cp;
dir->i_mtime = CURRENT_TIME;
return 0;
@@ -782,6 +622,58 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
return 0;
}
+/*
+ * Version 4 of autofs provides a pseudo direct mount implementation
+ * that relies on directories at the leaves of a directory tree under
+ * an indirect mount to trigger mounts. To allow for this we need to
+ * set the DMANAGED_AUTOMOUNT and DMANAGED_TRANSIT flags on the leaves
+ * of the directory tree. There is no need to clear the automount flag
+ * following a mount or restore it after an expire because these mounts
+ * are always covered. However, it is necessary to ensure that these
+ * flags are clear on non-empty directories to avoid unnecessary calls
+ * during path walks.
+ */
+static void autofs_set_leaf_automount_flags(struct dentry *dentry)
+{
+ struct dentry *parent;
+
+	/* root and dentries in the root are already handled */
+ if (IS_ROOT(dentry->d_parent))
+ return;
+
+ managed_dentry_set_managed(dentry);
+
+ parent = dentry->d_parent;
+	/* only consider parents below dentries in the root */
+ if (IS_ROOT(parent->d_parent))
+ return;
+ managed_dentry_clear_managed(parent);
+ return;
+}
+
+static void autofs_clear_leaf_automount_flags(struct dentry *dentry)
+{
+ struct list_head *d_child;
+ struct dentry *parent;
+
+	/* flags for dentries in the root are handled elsewhere */
+ if (IS_ROOT(dentry->d_parent))
+ return;
+
+ managed_dentry_clear_managed(dentry);
+
+ parent = dentry->d_parent;
+	/* only consider parents below dentries in the root */
+ if (IS_ROOT(parent->d_parent))
+ return;
+ d_child = &dentry->d_u.d_child;
+ /* Set parent managed if it's becoming empty */
+ if (d_child->next == &parent->d_subdirs &&
+ d_child->prev == &parent->d_subdirs)
+ managed_dentry_set_managed(parent);
+ return;
+}
+
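The d_child/d_subdirs comparison above is just an "only child" test: the parent is about to become empty once this dentry goes away. A standalone sketch of the same condition (the helper name is hypothetical; the kernel's list_is_singular() on the list head expresses it equivalently):

struct list_head {
	struct list_head *next, *prev;
};

/* true when @entry is the only node on the list headed by @head */
static int only_entry_on_list(const struct list_head *entry,
			      const struct list_head *head)
{
	return entry->next == head && entry->prev == head;
}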
static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
{
struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb);
@@ -809,6 +701,9 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
spin_unlock(&dentry->d_lock);
spin_unlock(&autofs4_lock);
+ if (sbi->version < 5)
+ autofs_clear_leaf_automount_flags(dentry);
+
if (atomic_dec_and_test(&ino->count)) {
p_ino = autofs4_dentry_ino(dentry->d_parent);
if (p_ino && dentry->d_parent != dentry)
@@ -837,32 +732,25 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, int mode)
DPRINTK("dentry %p, creating %.*s",
dentry, dentry->d_name.len, dentry->d_name.name);
- ino = autofs4_init_ino(ino, sbi, S_IFDIR | 0555);
- if (!ino)
- return -ENOMEM;
+ BUG_ON(!ino);
+
+ autofs4_clean_ino(ino);
autofs4_del_active(dentry);
- inode = autofs4_get_inode(dir->i_sb, ino);
- if (!inode) {
- if (!dentry->d_fsdata)
- kfree(ino);
+ inode = autofs4_get_inode(dir->i_sb, S_IFDIR | 0555);
+ if (!inode)
return -ENOMEM;
- }
d_add(dentry, inode);
- if (dir == dir->i_sb->s_root->d_inode)
- d_set_d_op(dentry, &autofs4_root_dentry_operations);
- else
- d_set_d_op(dentry, &autofs4_dentry_operations);
+ if (sbi->version < 5)
+ autofs_set_leaf_automount_flags(dentry);
- dentry->d_fsdata = ino;
- ino->dentry = dget(dentry);
+ dget(dentry);
atomic_inc(&ino->count);
p_ino = autofs4_dentry_ino(dentry->d_parent);
if (p_ino && dentry->d_parent != dentry)
atomic_inc(&p_ino->count);
- ino->inode = inode;
inc_nlink(dir);
dir->i_mtime = CURRENT_TIME;
@@ -944,8 +832,7 @@ static inline int autofs4_ask_umount(struct vfsmount *mnt, int __user *p)
int is_autofs4_dentry(struct dentry *dentry)
{
return dentry && dentry->d_inode &&
- (dentry->d_op == &autofs4_root_dentry_operations ||
- dentry->d_op == &autofs4_dentry_operations) &&
+ dentry->d_op == &autofs4_dentry_operations &&
dentry->d_fsdata != NULL;
}
diff --git a/fs/autofs4/symlink.c b/fs/autofs4/symlink.c
index b4ea82934d2e..f27c094a1919 100644
--- a/fs/autofs4/symlink.c
+++ b/fs/autofs4/symlink.c
@@ -14,8 +14,7 @@
static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
{
- struct autofs_info *ino = autofs4_dentry_ino(dentry);
- nd_set_link(nd, (char *)ino->u.symlink);
+ nd_set_link(nd, dentry->d_inode->i_private);
return NULL;
}
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index c5f8459c905e..56010056b2e6 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -309,6 +309,9 @@ static int validate_request(struct autofs_wait_queue **wait,
* completed while we waited on the mutex ...
*/
if (notify == NFY_MOUNT) {
+ struct dentry *new = NULL;
+ int valid = 1;
+
/*
* If the dentry was successfully mounted while we slept
* on the wait queue mutex we can return success. If it
@@ -316,8 +319,20 @@ static int validate_request(struct autofs_wait_queue **wait,
 * a multi-mount with no mount at its base) we can
* continue on and create a new request.
*/
+ if (!IS_ROOT(dentry)) {
+ if (dentry->d_inode && d_unhashed(dentry)) {
+ struct dentry *parent = dentry->d_parent;
+ new = d_lookup(parent, &dentry->d_name);
+ if (new)
+ dentry = new;
+ }
+ }
if (have_submounts(dentry))
- return 0;
+ valid = 0;
+
+ if (new)
+ dput(new);
+ return valid;
}
return 1;
diff --git a/fs/befs/endian.h b/fs/befs/endian.h
index 6cb84d896d05..27223878ba9f 100644
--- a/fs/befs/endian.h
+++ b/fs/befs/endian.h
@@ -102,22 +102,22 @@ cpu_to_fsrun(const struct super_block *sb, befs_block_run n)
}
static inline befs_data_stream
-fsds_to_cpu(const struct super_block *sb, befs_disk_data_stream n)
+fsds_to_cpu(const struct super_block *sb, const befs_disk_data_stream *n)
{
befs_data_stream data;
int i;
for (i = 0; i < BEFS_NUM_DIRECT_BLOCKS; ++i)
- data.direct[i] = fsrun_to_cpu(sb, n.direct[i]);
+ data.direct[i] = fsrun_to_cpu(sb, n->direct[i]);
- data.max_direct_range = fs64_to_cpu(sb, n.max_direct_range);
- data.indirect = fsrun_to_cpu(sb, n.indirect);
- data.max_indirect_range = fs64_to_cpu(sb, n.max_indirect_range);
- data.double_indirect = fsrun_to_cpu(sb, n.double_indirect);
+ data.max_direct_range = fs64_to_cpu(sb, n->max_direct_range);
+ data.indirect = fsrun_to_cpu(sb, n->indirect);
+ data.max_indirect_range = fs64_to_cpu(sb, n->max_indirect_range);
+ data.double_indirect = fsrun_to_cpu(sb, n->double_indirect);
data.max_double_indirect_range = fs64_to_cpu(sb,
- n.
+ n->
max_double_indirect_range);
- data.size = fs64_to_cpu(sb, n.size);
+ data.size = fs64_to_cpu(sb, n->size);
return data;
}
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index de93581b79a2..b1d0c794747b 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -390,7 +390,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
int num_blks;
befs_ino->i_data.ds =
- fsds_to_cpu(sb, raw_inode->data.datastream);
+ fsds_to_cpu(sb, &raw_inode->data.datastream);
num_blks = befs_count_blocks(sb, &befs_ino->i_data.ds);
inode->i_blocks =
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 6884e198e0c7..d5b640ba6cb1 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -66,12 +66,11 @@ static int elf_core_dump(struct coredump_params *cprm);
#define ELF_PAGEALIGN(_v) (((_v) + ELF_MIN_ALIGN - 1) & ~(ELF_MIN_ALIGN - 1))
static struct linux_binfmt elf_format = {
- .module = THIS_MODULE,
- .load_binary = load_elf_binary,
- .load_shlib = load_elf_library,
- .core_dump = elf_core_dump,
- .min_coredump = ELF_EXEC_PAGESIZE,
- .hasvdso = 1
+ .module = THIS_MODULE,
+ .load_binary = load_elf_binary,
+ .load_shlib = load_elf_library,
+ .core_dump = elf_core_dump,
+ .min_coredump = ELF_EXEC_PAGESIZE,
};
#define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE)
@@ -316,8 +315,6 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
return 0;
}
-#ifndef elf_map
-
static unsigned long elf_map(struct file *filep, unsigned long addr,
struct elf_phdr *eppnt, int prot, int type,
unsigned long total_size)
@@ -354,8 +351,6 @@ static unsigned long elf_map(struct file *filep, unsigned long addr,
return(map_addr);
}
-#endif /* !elf_map */
-
static unsigned long total_mapping_size(struct elf_phdr *cmds, int nr)
{
int i, first_idx = -1, last_idx = -1;
@@ -421,7 +416,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
goto out;
retval = kernel_read(interpreter, interp_elf_ex->e_phoff,
- (char *)elf_phdata,size);
+ (char *)elf_phdata, size);
error = -EIO;
if (retval != size) {
if (retval < 0)
@@ -601,7 +596,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
goto out;
if (!elf_check_arch(&loc->elf_ex))
goto out;
- if (!bprm->file->f_op||!bprm->file->f_op->mmap)
+ if (!bprm->file->f_op || !bprm->file->f_op->mmap)
goto out;
/* Now read in all of the header information */
@@ -761,8 +756,8 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
/* There was a PT_LOAD segment with p_memsz > p_filesz
before this one. Map anonymous pages, if needed,
and clear the area. */
- retval = set_brk (elf_bss + load_bias,
- elf_brk + load_bias);
+ retval = set_brk(elf_bss + load_bias,
+ elf_brk + load_bias);
if (retval) {
send_sig(SIGKILL, current, 0);
goto out_free_dentry;
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 4d0ff5ee27b8..e49cce234c65 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -782,7 +782,12 @@ void __init bio_integrity_init(void)
{
unsigned int i;
- kintegrityd_wq = create_workqueue("kintegrityd");
+ /*
+ * kintegrityd won't block much but may burn a lot of CPU cycles.
+	 * Make it a highpri, CPU-intensive wq with max concurrency of 1.
+ */
+ kintegrityd_wq = alloc_workqueue("kintegrityd", WQ_MEM_RECLAIM |
+ WQ_HIGHPRI | WQ_CPU_INTENSIVE, 1);
if (!kintegrityd_wq)
panic("Failed to create kintegrityd\n");
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 771f23527010..333a7bb4cb9c 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -433,7 +433,7 @@ static void init_once(void *foo)
INIT_LIST_HEAD(&bdev->bd_inodes);
INIT_LIST_HEAD(&bdev->bd_list);
#ifdef CONFIG_SYSFS
- INIT_LIST_HEAD(&bdev->bd_holder_list);
+ INIT_LIST_HEAD(&bdev->bd_holder_disks);
#endif
inode_init_once(&ei->vfs_inode);
/* Initialize mutex for freeze. */
@@ -473,7 +473,7 @@ static const struct super_operations bdev_sops = {
static struct dentry *bd_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
- return mount_pseudo(fs_type, "bdev:", &bdev_sops, 0x62646576);
+ return mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, 0x62646576);
}
static struct file_system_type bd_type = {
@@ -669,7 +669,7 @@ static bool bd_may_claim(struct block_device *bdev, struct block_device *whole,
else if (bdev->bd_contains == bdev)
return true; /* is a whole device which isn't held */
- else if (whole->bd_holder == bd_claim)
+ else if (whole->bd_holder == bd_may_claim)
return true; /* is a partition of a device that is being partitioned */
else if (whole->bd_holder != NULL)
return false; /* is a partition of a held device */
@@ -781,439 +781,142 @@ static struct block_device *bd_start_claiming(struct block_device *bdev,
}
}
-/* releases bdev_lock */
-static void __bd_abort_claiming(struct block_device *whole, void *holder)
-{
- BUG_ON(whole->bd_claiming != holder);
- whole->bd_claiming = NULL;
- wake_up_bit(&whole->bd_claiming, 0);
-
- spin_unlock(&bdev_lock);
- bdput(whole);
-}
-
-/**
- * bd_abort_claiming - abort claiming a block device
- * @whole: whole block device returned by bd_start_claiming()
- * @holder: holder trying to claim @bdev
- *
- * Abort a claiming block started by bd_start_claiming(). Note that
- * @whole is not the block device to be claimed but the whole device
- * returned by bd_start_claiming().
- *
- * CONTEXT:
- * Grabs and releases bdev_lock.
- */
-static void bd_abort_claiming(struct block_device *whole, void *holder)
-{
- spin_lock(&bdev_lock);
- __bd_abort_claiming(whole, holder); /* releases bdev_lock */
-}
-
-/* increment holders when we have a legitimate claim. requires bdev_lock */
-static void __bd_claim(struct block_device *bdev, struct block_device *whole,
- void *holder)
-{
- /* note that for a whole device bd_holders
- * will be incremented twice, and bd_holder will
- * be set to bd_claim before being set to holder
- */
- whole->bd_holders++;
- whole->bd_holder = bd_claim;
- bdev->bd_holders++;
- bdev->bd_holder = holder;
-}
-
-/**
- * bd_finish_claiming - finish claiming a block device
- * @bdev: block device of interest (passed to bd_start_claiming())
- * @whole: whole block device returned by bd_start_claiming()
- * @holder: holder trying to claim @bdev
- *
- * Finish a claiming block started by bd_start_claiming().
- *
- * CONTEXT:
- * Grabs and releases bdev_lock.
- */
-static void bd_finish_claiming(struct block_device *bdev,
- struct block_device *whole, void *holder)
-{
- spin_lock(&bdev_lock);
- BUG_ON(!bd_may_claim(bdev, whole, holder));
- __bd_claim(bdev, whole, holder);
- __bd_abort_claiming(whole, holder); /* not actually an abort */
-}
+#ifdef CONFIG_SYSFS
+struct bd_holder_disk {
+ struct list_head list;
+ struct gendisk *disk;
+ int refcnt;
+};
-/**
- * bd_claim - claim a block device
- * @bdev: block device to claim
- * @holder: holder trying to claim @bdev
- *
- * Try to claim @bdev which must have been opened successfully.
- *
- * CONTEXT:
- * Might sleep.
- *
- * RETURNS:
- * 0 if successful, -EBUSY if @bdev is already claimed.
- */
-int bd_claim(struct block_device *bdev, void *holder)
+static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev,
+ struct gendisk *disk)
{
- struct block_device *whole = bdev->bd_contains;
- int res;
+ struct bd_holder_disk *holder;
- might_sleep();
-
- spin_lock(&bdev_lock);
- res = bd_prepare_to_claim(bdev, whole, holder);
- if (res == 0)
- __bd_claim(bdev, whole, holder);
- spin_unlock(&bdev_lock);
-
- return res;
-}
-EXPORT_SYMBOL(bd_claim);
-
-void bd_release(struct block_device *bdev)
-{
- spin_lock(&bdev_lock);
- if (!--bdev->bd_contains->bd_holders)
- bdev->bd_contains->bd_holder = NULL;
- if (!--bdev->bd_holders)
- bdev->bd_holder = NULL;
- spin_unlock(&bdev_lock);
+ list_for_each_entry(holder, &bdev->bd_holder_disks, list)
+ if (holder->disk == disk)
+ return holder;
+ return NULL;
}
-EXPORT_SYMBOL(bd_release);
-
-#ifdef CONFIG_SYSFS
-/*
- * Functions for bd_claim_by_kobject / bd_release_from_kobject
- *
- * If a kobject is passed to bd_claim_by_kobject()
- * and the kobject has a parent directory,
- * following symlinks are created:
- * o from the kobject to the claimed bdev
- * o from "holders" directory of the bdev to the parent of the kobject
- * bd_release_from_kobject() removes these symlinks.
- *
- * Example:
- * If /dev/dm-0 maps to /dev/sda, kobject corresponding to
- * /sys/block/dm-0/slaves is passed to bd_claim_by_kobject(), then:
- * /sys/block/dm-0/slaves/sda --> /sys/block/sda
- * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0
- */
-
static int add_symlink(struct kobject *from, struct kobject *to)
{
- if (!from || !to)
- return 0;
return sysfs_create_link(from, to, kobject_name(to));
}
static void del_symlink(struct kobject *from, struct kobject *to)
{
- if (!from || !to)
- return;
sysfs_remove_link(from, kobject_name(to));
}
-/*
- * 'struct bd_holder' contains pointers to kobjects symlinked by
- * bd_claim_by_kobject.
- * It's connected to bd_holder_list which is protected by bdev->bd_sem.
- */
-struct bd_holder {
- struct list_head list; /* chain of holders of the bdev */
- int count; /* references from the holder */
- struct kobject *sdir; /* holder object, e.g. "/block/dm-0/slaves" */
- struct kobject *hdev; /* e.g. "/block/dm-0" */
- struct kobject *hdir; /* e.g. "/block/sda/holders" */
- struct kobject *sdev; /* e.g. "/block/sda" */
-};
-
-/*
- * Get references of related kobjects at once.
- * Returns 1 on success. 0 on failure.
- *
- * Should call bd_holder_release_dirs() after successful use.
- */
-static int bd_holder_grab_dirs(struct block_device *bdev,
- struct bd_holder *bo)
-{
- if (!bdev || !bo)
- return 0;
-
- bo->sdir = kobject_get(bo->sdir);
- if (!bo->sdir)
- return 0;
-
- bo->hdev = kobject_get(bo->sdir->parent);
- if (!bo->hdev)
- goto fail_put_sdir;
-
- bo->sdev = kobject_get(&part_to_dev(bdev->bd_part)->kobj);
- if (!bo->sdev)
- goto fail_put_hdev;
-
- bo->hdir = kobject_get(bdev->bd_part->holder_dir);
- if (!bo->hdir)
- goto fail_put_sdev;
-
- return 1;
-
-fail_put_sdev:
- kobject_put(bo->sdev);
-fail_put_hdev:
- kobject_put(bo->hdev);
-fail_put_sdir:
- kobject_put(bo->sdir);
-
- return 0;
-}
-
-/* Put references of related kobjects at once. */
-static void bd_holder_release_dirs(struct bd_holder *bo)
-{
- kobject_put(bo->hdir);
- kobject_put(bo->sdev);
- kobject_put(bo->hdev);
- kobject_put(bo->sdir);
-}
-
-static struct bd_holder *alloc_bd_holder(struct kobject *kobj)
-{
- struct bd_holder *bo;
-
- bo = kzalloc(sizeof(*bo), GFP_KERNEL);
- if (!bo)
- return NULL;
-
- bo->count = 1;
- bo->sdir = kobj;
-
- return bo;
-}
-
-static void free_bd_holder(struct bd_holder *bo)
-{
- kfree(bo);
-}
-
/**
- * find_bd_holder - find matching struct bd_holder from the block device
+ * bd_link_disk_holder - create symlinks between holding disk and slave bdev
+ * @bdev: the claimed slave bdev
+ * @disk: the holding disk
*
- * @bdev: struct block device to be searched
- * @bo: target struct bd_holder
+ * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT.
*
- * Returns matching entry with @bo in @bdev->bd_holder_list.
- * If found, increment the reference count and return the pointer.
- * If not found, returns NULL.
- */
-static struct bd_holder *find_bd_holder(struct block_device *bdev,
- struct bd_holder *bo)
-{
- struct bd_holder *tmp;
-
- list_for_each_entry(tmp, &bdev->bd_holder_list, list)
- if (tmp->sdir == bo->sdir) {
- tmp->count++;
- return tmp;
- }
-
- return NULL;
-}
-
-/**
- * add_bd_holder - create sysfs symlinks for bd_claim() relationship
+ * This function creates the following sysfs symlinks.
+ *
+ * - from "slaves" directory of the holder @disk to the claimed @bdev
+ * - from "holders" directory of the @bdev to the holder @disk
+ *
+ * For example, if /dev/dm-0 maps to /dev/sda and disk for dm-0 is
+ * passed to bd_link_disk_holder(), then:
*
- * @bdev: block device to be bd_claimed
- * @bo: preallocated and initialized by alloc_bd_holder()
+ * /sys/block/dm-0/slaves/sda --> /sys/block/sda
+ * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0
*
- * Add @bo to @bdev->bd_holder_list, create symlinks.
+ * The caller must have claimed @bdev before calling this function and
+ * ensure that both @bdev and @disk are valid during the creation and
+ * lifetime of these symlinks.
*
- * Returns 0 if symlinks are created.
- * Returns -ve if something fails.
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
*/
-static int add_bd_holder(struct block_device *bdev, struct bd_holder *bo)
+int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
{
- int err;
+ struct bd_holder_disk *holder;
+ int ret = 0;
- if (!bo)
- return -EINVAL;
+ mutex_lock(&bdev->bd_mutex);
- if (!bd_holder_grab_dirs(bdev, bo))
- return -EBUSY;
+ WARN_ON_ONCE(!bdev->bd_holder);
- err = add_symlink(bo->sdir, bo->sdev);
- if (err)
- return err;
+ /* FIXME: remove the following once add_disk() handles errors */
+ if (WARN_ON(!disk->slave_dir || !bdev->bd_part->holder_dir))
+ goto out_unlock;
- err = add_symlink(bo->hdir, bo->hdev);
- if (err) {
- del_symlink(bo->sdir, bo->sdev);
- return err;
+ holder = bd_find_holder_disk(bdev, disk);
+ if (holder) {
+ holder->refcnt++;
+ goto out_unlock;
}
- list_add_tail(&bo->list, &bdev->bd_holder_list);
- return 0;
-}
-
-/**
- * del_bd_holder - delete sysfs symlinks for bd_claim() relationship
- *
- * @bdev: block device to be bd_claimed
- * @kobj: holder's kobject
- *
- * If there is matching entry with @kobj in @bdev->bd_holder_list
- * and no other bd_claim() from the same kobject,
- * remove the struct bd_holder from the list, delete symlinks for it.
- *
- * Returns a pointer to the struct bd_holder when it's removed from the list
- * and ready to be freed.
- * Returns NULL if matching claim isn't found or there is other bd_claim()
- * by the same kobject.
- */
-static struct bd_holder *del_bd_holder(struct block_device *bdev,
- struct kobject *kobj)
-{
- struct bd_holder *bo;
-
- list_for_each_entry(bo, &bdev->bd_holder_list, list) {
- if (bo->sdir == kobj) {
- bo->count--;
- BUG_ON(bo->count < 0);
- if (!bo->count) {
- list_del(&bo->list);
- del_symlink(bo->sdir, bo->sdev);
- del_symlink(bo->hdir, bo->hdev);
- bd_holder_release_dirs(bo);
- return bo;
- }
- break;
- }
+ holder = kzalloc(sizeof(*holder), GFP_KERNEL);
+ if (!holder) {
+ ret = -ENOMEM;
+ goto out_unlock;
}
- return NULL;
-}
-
-/**
- * bd_claim_by_kobject - bd_claim() with additional kobject signature
- *
- * @bdev: block device to be claimed
- * @holder: holder's signature
- * @kobj: holder's kobject
- *
- * Do bd_claim() and if it succeeds, create sysfs symlinks between
- * the bdev and the holder's kobject.
- * Use bd_release_from_kobject() when relesing the claimed bdev.
- *
- * Returns 0 on success. (same as bd_claim())
- * Returns errno on failure.
- */
-static int bd_claim_by_kobject(struct block_device *bdev, void *holder,
- struct kobject *kobj)
-{
- int err;
- struct bd_holder *bo, *found;
-
- if (!kobj)
- return -EINVAL;
-
- bo = alloc_bd_holder(kobj);
- if (!bo)
- return -ENOMEM;
+ INIT_LIST_HEAD(&holder->list);
+ holder->disk = disk;
+ holder->refcnt = 1;
- mutex_lock(&bdev->bd_mutex);
+ ret = add_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
+ if (ret)
+ goto out_free;
- err = bd_claim(bdev, holder);
- if (err)
- goto fail;
+ ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj);
+ if (ret)
+ goto out_del;
- found = find_bd_holder(bdev, bo);
- if (found)
- goto fail;
+ list_add(&holder->list, &bdev->bd_holder_disks);
+ goto out_unlock;
- err = add_bd_holder(bdev, bo);
- if (err)
- bd_release(bdev);
- else
- bo = NULL;
-fail:
+out_del:
+ del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
+out_free:
+ kfree(holder);
+out_unlock:
mutex_unlock(&bdev->bd_mutex);
- free_bd_holder(bo);
- return err;
+ return ret;
}
+EXPORT_SYMBOL_GPL(bd_link_disk_holder);
/**
- * bd_release_from_kobject - bd_release() with additional kobject signature
+ * bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder()
+ * @bdev: the claimed slave bdev
+ * @disk: the holding disk
*
- * @bdev: block device to be released
- * @kobj: holder's kobject
+ * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT.
*
- * Do bd_release() and remove sysfs symlinks created by bd_claim_by_kobject().
+ * CONTEXT:
+ * Might sleep.
*/
-static void bd_release_from_kobject(struct block_device *bdev,
- struct kobject *kobj)
+void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
{
- if (!kobj)
- return;
+ struct bd_holder_disk *holder;
mutex_lock(&bdev->bd_mutex);
- bd_release(bdev);
- free_bd_holder(del_bd_holder(bdev, kobj));
- mutex_unlock(&bdev->bd_mutex);
-}
-/**
- * bd_claim_by_disk - wrapper function for bd_claim_by_kobject()
- *
- * @bdev: block device to be claimed
- * @holder: holder's signature
- * @disk: holder's gendisk
- *
- * Call bd_claim_by_kobject() with getting @disk->slave_dir.
- */
-int bd_claim_by_disk(struct block_device *bdev, void *holder,
- struct gendisk *disk)
-{
- return bd_claim_by_kobject(bdev, holder, kobject_get(disk->slave_dir));
-}
-EXPORT_SYMBOL_GPL(bd_claim_by_disk);
+ holder = bd_find_holder_disk(bdev, disk);
-/**
- * bd_release_from_disk - wrapper function for bd_release_from_kobject()
- *
- * @bdev: block device to be claimed
- * @disk: holder's gendisk
- *
- * Call bd_release_from_kobject() and put @disk->slave_dir.
- */
-void bd_release_from_disk(struct block_device *bdev, struct gendisk *disk)
-{
- bd_release_from_kobject(bdev, disk->slave_dir);
- kobject_put(disk->slave_dir);
-}
-EXPORT_SYMBOL_GPL(bd_release_from_disk);
-#endif
+ if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) {
+ del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
+ del_symlink(bdev->bd_part->holder_dir,
+ &disk_to_dev(disk)->kobj);
+ list_del_init(&holder->list);
+ kfree(holder);
+ }
-/*
- * Tries to open block device by device number. Use it ONLY if you
- * really do not have anything better - i.e. when you are behind a
- * truly sucky interface and all you are given is a device number. _Never_
- * to be used for internal purposes. If you ever need it - reconsider
- * your API.
- */
-struct block_device *open_by_devnum(dev_t dev, fmode_t mode)
-{
- struct block_device *bdev = bdget(dev);
- int err = -ENOMEM;
- if (bdev)
- err = blkdev_get(bdev, mode);
- return err ? ERR_PTR(err) : bdev;
+ mutex_unlock(&bdev->bd_mutex);
}
-
-EXPORT_SYMBOL(open_by_devnum);
+EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
+#endif
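A hedged usage sketch for the new holder API: a stacking driver (md/dm are the intended callers) links each claimed slave bdev to its disk and unlinks it on teardown. The example_ names are made up and error handling is minimal:

static int example_attach_slave(struct gendisk *holder_disk,
				struct block_device *slave)
{
	/* @slave must already be opened/claimed with FMODE_EXCL */
	return bd_link_disk_holder(slave, holder_disk);
}

static void example_detach_slave(struct gendisk *holder_disk,
				 struct block_device *slave)
{
	bd_unlink_disk_holder(slave, holder_disk);
}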
/**
* flush_disk - invalidates all buffer-cache entries on a disk
@@ -1309,10 +1012,11 @@ int check_disk_change(struct block_device *bdev)
{
struct gendisk *disk = bdev->bd_disk;
const struct block_device_operations *bdops = disk->fops;
+ unsigned int events;
- if (!bdops->media_changed)
- return 0;
- if (!bdops->media_changed(bdev->bd_disk))
+ events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE |
+ DISK_EVENT_EJECT_REQUEST);
+ if (!(events & DISK_EVENT_MEDIA_CHANGE))
return 0;
flush_disk(bdev);
@@ -1475,17 +1179,171 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
return ret;
}
-int blkdev_get(struct block_device *bdev, fmode_t mode)
+/**
+ * blkdev_get - open a block device
+ * @bdev: block_device to open
+ * @mode: FMODE_* mask
+ * @holder: exclusive holder identifier
+ *
+ * Open @bdev with @mode. If @mode includes %FMODE_EXCL, @bdev is
+ * opened with exclusive access. Specifying %FMODE_EXCL with %NULL
+ * @holder is invalid. Exclusive opens may nest for the same @holder.
+ *
+ * On success, the reference count of @bdev is unchanged. On failure,
+ * @bdev is put.
+ *
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
{
- return __blkdev_get(bdev, mode, 0);
+ struct block_device *whole = NULL;
+ int res;
+
+ WARN_ON_ONCE((mode & FMODE_EXCL) && !holder);
+
+ if ((mode & FMODE_EXCL) && holder) {
+ whole = bd_start_claiming(bdev, holder);
+ if (IS_ERR(whole)) {
+ bdput(bdev);
+ return PTR_ERR(whole);
+ }
+ }
+
+ res = __blkdev_get(bdev, mode, 0);
+
+ /* __blkdev_get() may alter read only status, check it afterwards */
+ if (!res && (mode & FMODE_WRITE) && bdev_read_only(bdev)) {
+ __blkdev_put(bdev, mode, 0);
+ res = -EACCES;
+ }
+
+ if (whole) {
+ /* finish claiming */
+ mutex_lock(&bdev->bd_mutex);
+ spin_lock(&bdev_lock);
+
+ if (!res) {
+ BUG_ON(!bd_may_claim(bdev, whole, holder));
+ /*
+ * Note that for a whole device bd_holders
+ * will be incremented twice, and bd_holder
+ * will be set to bd_may_claim before being
+ * set to holder
+ */
+ whole->bd_holders++;
+ whole->bd_holder = bd_may_claim;
+ bdev->bd_holders++;
+ bdev->bd_holder = holder;
+ }
+
+ /* tell others that we're done */
+ BUG_ON(whole->bd_claiming != holder);
+ whole->bd_claiming = NULL;
+ wake_up_bit(&whole->bd_claiming, 0);
+
+ spin_unlock(&bdev_lock);
+
+ /*
+ * Block event polling for write claims. Any write
+ * holder makes the write_holder state stick until all
+ * are released. This is good enough and tracking
+		 * individual writeable references is too fragile given
+ * the way @mode is used in blkdev_get/put().
+ */
+ if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder) {
+ bdev->bd_write_holder = true;
+ disk_block_events(bdev->bd_disk);
+ }
+
+ mutex_unlock(&bdev->bd_mutex);
+ bdput(whole);
+ }
+
+ return res;
}
EXPORT_SYMBOL(blkdev_get);
+/**
+ * blkdev_get_by_path - open a block device by name
+ * @path: path to the block device to open
+ * @mode: FMODE_* mask
+ * @holder: exclusive holder identifier
+ *
+ * Open the blockdevice described by the device file at @path. @mode
+ * and @holder are identical to blkdev_get().
+ *
+ * On success, the returned block_device has reference count of one.
+ *
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * Pointer to block_device on success, ERR_PTR(-errno) on failure.
+ */
+struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
+ void *holder)
+{
+ struct block_device *bdev;
+ int err;
+
+ bdev = lookup_bdev(path);
+ if (IS_ERR(bdev))
+ return bdev;
+
+ err = blkdev_get(bdev, mode, holder);
+ if (err)
+ return ERR_PTR(err);
+
+ return bdev;
+}
+EXPORT_SYMBOL(blkdev_get_by_path);
+
+/**
+ * blkdev_get_by_dev - open a block device by device number
+ * @dev: device number of block device to open
+ * @mode: FMODE_* mask
+ * @holder: exclusive holder identifier
+ *
+ * Open the blockdevice described by device number @dev. @mode and
+ * @holder are identical to blkdev_get().
+ *
+ * Use it ONLY if you really do not have anything better - i.e. when
+ * you are behind a truly sucky interface and all you are given is a
+ * device number. _Never_ to be used for internal purposes. If you
+ * ever need it - reconsider your API.
+ *
+ * On success, the returned block_device has reference count of one.
+ *
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * Pointer to block_device on success, ERR_PTR(-errno) on failure.
+ */
+struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
+{
+ struct block_device *bdev;
+ int err;
+
+ bdev = bdget(dev);
+ if (!bdev)
+ return ERR_PTR(-ENOMEM);
+
+ err = blkdev_get(bdev, mode, holder);
+ if (err)
+ return ERR_PTR(err);
+
+ return bdev;
+}
+EXPORT_SYMBOL(blkdev_get_by_dev);
+
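Tying the new open helpers together, a hedged sketch of an exclusive open by path. Any stable address serves as the holder cookie, and passing the same mode (including FMODE_EXCL) to blkdev_put() is what drops the claim; the function name is hypothetical:

static int example_use_bdev(const char *path)
{
	const fmode_t mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL;
	static char holder;		/* stable address used as claim token */
	struct block_device *bdev;

	bdev = blkdev_get_by_path(path, mode, &holder);
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);
	/* ... submit I/O against bdev ... */
	blkdev_put(bdev, mode);		/* FMODE_EXCL here releases the claim */
	return 0;
}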
static int blkdev_open(struct inode * inode, struct file * filp)
{
- struct block_device *whole = NULL;
struct block_device *bdev;
- int res;
/*
* Preserve backwards compatibility and allow large file access
@@ -1506,26 +1364,9 @@ static int blkdev_open(struct inode * inode, struct file * filp)
if (bdev == NULL)
return -ENOMEM;
- if (filp->f_mode & FMODE_EXCL) {
- whole = bd_start_claiming(bdev, filp);
- if (IS_ERR(whole)) {
- bdput(bdev);
- return PTR_ERR(whole);
- }
- }
-
filp->f_mapping = bdev->bd_inode->i_mapping;
- res = blkdev_get(bdev, filp->f_mode);
-
- if (whole) {
- if (res == 0)
- bd_finish_claiming(bdev, whole, filp);
- else
- bd_abort_claiming(whole, filp);
- }
-
- return res;
+ return blkdev_get(bdev, filp->f_mode, filp);
}
static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
@@ -1539,6 +1380,7 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
bdev->bd_part_count--;
if (!--bdev->bd_openers) {
+ WARN_ON_ONCE(bdev->bd_holders);
sync_blockdev(bdev);
kill_bdev(bdev);
}
@@ -1569,6 +1411,44 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
int blkdev_put(struct block_device *bdev, fmode_t mode)
{
+ if (mode & FMODE_EXCL) {
+ bool bdev_free;
+
+ /*
+ * Release a claim on the device. The holder fields
+ * are protected with bdev_lock. bd_mutex is to
+ * synchronize disk_holder unlinking.
+ */
+ mutex_lock(&bdev->bd_mutex);
+ spin_lock(&bdev_lock);
+
+ WARN_ON_ONCE(--bdev->bd_holders < 0);
+ WARN_ON_ONCE(--bdev->bd_contains->bd_holders < 0);
+
+ /* bd_contains might point to self, check in a separate step */
+ if ((bdev_free = !bdev->bd_holders))
+ bdev->bd_holder = NULL;
+ if (!bdev->bd_contains->bd_holders)
+ bdev->bd_contains->bd_holder = NULL;
+
+ spin_unlock(&bdev_lock);
+
+ /*
+ * If this was the last claim, remove holder link and
+		 * unblock event polling if it was a write holder.
+ */
+ if (bdev_free) {
+ if (bdev->bd_write_holder) {
+ disk_unblock_events(bdev->bd_disk);
+ bdev->bd_write_holder = false;
+ } else
+ disk_check_events(bdev->bd_disk);
+ }
+
+ mutex_unlock(&bdev->bd_mutex);
+ } else
+ disk_check_events(bdev->bd_disk);
+
return __blkdev_put(bdev, mode, 0);
}
EXPORT_SYMBOL(blkdev_put);
@@ -1576,8 +1456,7 @@ EXPORT_SYMBOL(blkdev_put);
static int blkdev_close(struct inode * inode, struct file * filp)
{
struct block_device *bdev = I_BDEV(filp->f_mapping->host);
- if (bdev->bd_holder == filp)
- bd_release(bdev);
+
return blkdev_put(bdev, filp->f_mode);
}
@@ -1722,67 +1601,6 @@ fail:
}
EXPORT_SYMBOL(lookup_bdev);
-/**
- * open_bdev_exclusive - open a block device by name and set it up for use
- *
- * @path: special file representing the block device
- * @mode: FMODE_... combination to pass be used
- * @holder: owner for exclusion
- *
- * Open the blockdevice described by the special file at @path, claim it
- * for the @holder.
- */
-struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder)
-{
- struct block_device *bdev, *whole;
- int error;
-
- bdev = lookup_bdev(path);
- if (IS_ERR(bdev))
- return bdev;
-
- whole = bd_start_claiming(bdev, holder);
- if (IS_ERR(whole)) {
- bdput(bdev);
- return whole;
- }
-
- error = blkdev_get(bdev, mode);
- if (error)
- goto out_abort_claiming;
-
- error = -EACCES;
- if ((mode & FMODE_WRITE) && bdev_read_only(bdev))
- goto out_blkdev_put;
-
- bd_finish_claiming(bdev, whole, holder);
- return bdev;
-
-out_blkdev_put:
- blkdev_put(bdev, mode);
-out_abort_claiming:
- bd_abort_claiming(whole, holder);
- return ERR_PTR(error);
-}
-
-EXPORT_SYMBOL(open_bdev_exclusive);
-
-/**
- * close_bdev_exclusive - close a blockdevice opened by open_bdev_exclusive()
- *
- * @bdev: blockdevice to close
- * @mode: mode, must match that used to open.
- *
- * This is the counterpart to open_bdev_exclusive().
- */
-void close_bdev_exclusive(struct block_device *bdev, fmode_t mode)
-{
- bd_release(bdev);
- blkdev_put(bdev, mode);
-}
-
-EXPORT_SYMBOL(close_bdev_exclusive);
-
int __invalidate_device(struct block_device *bdev)
{
struct super_block *sb = get_super(bdev);
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 7bb3c020e570..ecb9fd3be143 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -4,6 +4,8 @@ config BTRFS_FS
select LIBCRC32C
select ZLIB_INFLATE
select ZLIB_DEFLATE
+ select LZO_COMPRESS
+ select LZO_DECOMPRESS
help
Btrfs is a new filesystem with extents, writable snapshotting,
support for multiple devices and many more features.
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index a35eb36b32fd..31610ea73aec 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -6,5 +6,5 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
transaction.o inode.o file.o tree-defrag.o \
extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
- export.o tree-log.o acl.o free-space-cache.o zlib.o \
+ export.o tree-log.o acl.o free-space-cache.o zlib.o lzo.o \
compression.o delayed-ref.o relocation.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 6ae2c8cac9d5..15b5ca2a2606 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -60,8 +60,10 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
size = __btrfs_getxattr(inode, name, value, size);
if (size > 0) {
acl = posix_acl_from_xattr(value, size);
- if (IS_ERR(acl))
+ if (IS_ERR(acl)) {
+ kfree(value);
return acl;
+ }
set_cached_acl(inode, type, acl);
}
kfree(value);
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 6ad63f17eca0..ccc991c542df 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -157,7 +157,7 @@ struct btrfs_inode {
/*
* always compress this one file
*/
- unsigned force_compress:1;
+ unsigned force_compress:4;
struct inode vfs_inode;
};
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index b50bc4bd5c56..f745287fbf2e 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -62,6 +62,9 @@ struct compressed_bio {
/* number of bytes on disk */
unsigned long compressed_len;
+ /* the compression algorithm for this bio */
+ int compress_type;
+
/* number of compressed pages in the array */
unsigned long nr_pages;
@@ -173,11 +176,12 @@ static void end_compressed_bio_read(struct bio *bio, int err)
/* ok, we're the last bio for this extent, lets start
* the decompression.
*/
- ret = btrfs_zlib_decompress_biovec(cb->compressed_pages,
- cb->start,
- cb->orig_bio->bi_io_vec,
- cb->orig_bio->bi_vcnt,
- cb->compressed_len);
+ ret = btrfs_decompress_biovec(cb->compress_type,
+ cb->compressed_pages,
+ cb->start,
+ cb->orig_bio->bi_io_vec,
+ cb->orig_bio->bi_vcnt,
+ cb->compressed_len);
csum_failed:
if (ret)
cb->errors = 1;
@@ -588,6 +592,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
cb->len = uncompressed_len;
cb->compressed_len = compressed_len;
+ cb->compress_type = extent_compress_type(bio_flags);
cb->orig_bio = bio;
nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) /
@@ -677,3 +682,317 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
bio_put(comp_bio);
return 0;
}
+
+static struct list_head comp_idle_workspace[BTRFS_COMPRESS_TYPES];
+static spinlock_t comp_workspace_lock[BTRFS_COMPRESS_TYPES];
+static int comp_num_workspace[BTRFS_COMPRESS_TYPES];
+static atomic_t comp_alloc_workspace[BTRFS_COMPRESS_TYPES];
+static wait_queue_head_t comp_workspace_wait[BTRFS_COMPRESS_TYPES];
+
+struct btrfs_compress_op *btrfs_compress_op[] = {
+ &btrfs_zlib_compress,
+ &btrfs_lzo_compress,
+};
+
+int __init btrfs_init_compress(void)
+{
+ int i;
+
+ for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
+ INIT_LIST_HEAD(&comp_idle_workspace[i]);
+ spin_lock_init(&comp_workspace_lock[i]);
+ atomic_set(&comp_alloc_workspace[i], 0);
+ init_waitqueue_head(&comp_workspace_wait[i]);
+ }
+ return 0;
+}
+
+/*
+ * this finds an available workspace or allocates a new one
+ * ERR_PTR is returned if things go bad.
+ */
+static struct list_head *find_workspace(int type)
+{
+ struct list_head *workspace;
+ int cpus = num_online_cpus();
+ int idx = type - 1;
+
+ struct list_head *idle_workspace = &comp_idle_workspace[idx];
+ spinlock_t *workspace_lock = &comp_workspace_lock[idx];
+ atomic_t *alloc_workspace = &comp_alloc_workspace[idx];
+ wait_queue_head_t *workspace_wait = &comp_workspace_wait[idx];
+ int *num_workspace = &comp_num_workspace[idx];
+again:
+ spin_lock(workspace_lock);
+ if (!list_empty(idle_workspace)) {
+ workspace = idle_workspace->next;
+ list_del(workspace);
+ (*num_workspace)--;
+ spin_unlock(workspace_lock);
+ return workspace;
+
+ }
+ if (atomic_read(alloc_workspace) > cpus) {
+ DEFINE_WAIT(wait);
+
+ spin_unlock(workspace_lock);
+ prepare_to_wait(workspace_wait, &wait, TASK_UNINTERRUPTIBLE);
+ if (atomic_read(alloc_workspace) > cpus && !*num_workspace)
+ schedule();
+ finish_wait(workspace_wait, &wait);
+ goto again;
+ }
+ atomic_inc(alloc_workspace);
+ spin_unlock(workspace_lock);
+
+ workspace = btrfs_compress_op[idx]->alloc_workspace();
+ if (IS_ERR(workspace)) {
+ atomic_dec(alloc_workspace);
+ wake_up(workspace_wait);
+ }
+ return workspace;
+}
+
+/*
+ * put a workspace struct back on the list or free it if we have enough
+ * idle ones sitting around
+ */
+static void free_workspace(int type, struct list_head *workspace)
+{
+ int idx = type - 1;
+ struct list_head *idle_workspace = &comp_idle_workspace[idx];
+ spinlock_t *workspace_lock = &comp_workspace_lock[idx];
+ atomic_t *alloc_workspace = &comp_alloc_workspace[idx];
+ wait_queue_head_t *workspace_wait = &comp_workspace_wait[idx];
+ int *num_workspace = &comp_num_workspace[idx];
+
+ spin_lock(workspace_lock);
+ if (*num_workspace < num_online_cpus()) {
+ list_add_tail(workspace, idle_workspace);
+ (*num_workspace)++;
+ spin_unlock(workspace_lock);
+ goto wake;
+ }
+ spin_unlock(workspace_lock);
+
+ btrfs_compress_op[idx]->free_workspace(workspace);
+ atomic_dec(alloc_workspace);
+wake:
+ if (waitqueue_active(workspace_wait))
+ wake_up(workspace_wait);
+}
+
+/*
+ * cleanup function for module exit
+ */
+static void free_workspaces(void)
+{
+ struct list_head *workspace;
+ int i;
+
+ for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
+ while (!list_empty(&comp_idle_workspace[i])) {
+ workspace = comp_idle_workspace[i].next;
+ list_del(workspace);
+ btrfs_compress_op[i]->free_workspace(workspace);
+ atomic_dec(&comp_alloc_workspace[i]);
+ }
+ }
+}
+
+/*
+ * given an address space and start/len, compress the bytes.
+ *
+ * pages are allocated to hold the compressed result and stored
+ * in 'pages'
+ *
+ * out_pages is used to return the number of pages allocated. There
+ * may be pages allocated even if we return an error
+ *
+ * total_in is used to return the number of bytes actually read. It
+ * may be smaller than len if we had to exit early because we
+ * ran out of room in the pages array or because we cross the
+ * max_out threshold.
+ *
+ * total_out is used to return the total number of compressed bytes
+ *
+ * max_out tells us the max number of bytes that we're allowed to
+ * stuff into pages
+ */
+int btrfs_compress_pages(int type, struct address_space *mapping,
+ u64 start, unsigned long len,
+ struct page **pages,
+ unsigned long nr_dest_pages,
+ unsigned long *out_pages,
+ unsigned long *total_in,
+ unsigned long *total_out,
+ unsigned long max_out)
+{
+ struct list_head *workspace;
+ int ret;
+
+ workspace = find_workspace(type);
+ if (IS_ERR(workspace))
+ return -1;
+
+ ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping,
+ start, len, pages,
+ nr_dest_pages, out_pages,
+ total_in, total_out,
+ max_out);
+ free_workspace(type, workspace);
+ return ret;
+}
+
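A hedged caller sketch for the dispatcher above, compressing one page-sized range; the wrapper name is hypothetical and page cleanup on failure is elided:

static int example_compress_range(struct address_space *mapping, u64 start)
{
	struct page *pages[1];
	unsigned long nr_pages = 0, total_in = 0, total_out = 0;

	/* on success, nr_pages entries of pages[] hold total_out bytes */
	return btrfs_compress_pages(BTRFS_COMPRESS_LZO, mapping, start,
				    PAGE_CACHE_SIZE, pages, 1, &nr_pages,
				    &total_in, &total_out, PAGE_CACHE_SIZE);
}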
+/*
+ * pages_in is an array of pages with compressed data.
+ *
+ * disk_start is the starting logical offset of this array in the file
+ *
+ * bvec is a bio_vec of pages from the file that we want to decompress into
+ *
+ * vcnt is the count of pages in the biovec
+ *
+ * srclen is the number of bytes in pages_in
+ *
+ * The basic idea is that we have a bio that was created by readpages.
+ * The pages in the bio are for the uncompressed data, and they may not
+ * be contiguous. They all correspond to the range of bytes covered by
+ * the compressed extent.
+ */
+int btrfs_decompress_biovec(int type, struct page **pages_in, u64 disk_start,
+ struct bio_vec *bvec, int vcnt, size_t srclen)
+{
+ struct list_head *workspace;
+ int ret;
+
+ workspace = find_workspace(type);
+ if (IS_ERR(workspace))
+ return -ENOMEM;
+
+ ret = btrfs_compress_op[type-1]->decompress_biovec(workspace, pages_in,
+ disk_start,
+ bvec, vcnt, srclen);
+ free_workspace(type, workspace);
+ return ret;
+}
+
+/*
+ * a less complex decompression routine. Our compressed data fits in a
+ * single page, and we want to read a single page out of it.
+ * start_byte tells us the offset into the uncompressed data we're interested in
+ */
+int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
+ unsigned long start_byte, size_t srclen, size_t destlen)
+{
+ struct list_head *workspace;
+ int ret;
+
+ workspace = find_workspace(type);
+ if (IS_ERR(workspace))
+ return -ENOMEM;
+
+ ret = btrfs_compress_op[type-1]->decompress(workspace, data_in,
+ dest_page, start_byte,
+ srclen, destlen);
+
+ free_workspace(type, workspace);
+ return ret;
+}
+
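And the single-page counterpart: a hedged sketch expanding one compressed chunk from offset zero into a destination page (again, the wrapper name is hypothetical):

static int example_inflate_to_page(int type, unsigned char *data_in,
				   size_t srclen, struct page *dest_page)
{
	return btrfs_decompress(type, data_in, dest_page, 0 /* start_byte */,
				srclen, PAGE_CACHE_SIZE);
}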
+void __exit btrfs_exit_compress(void)
+{
+ free_workspaces();
+}
+
+/*
+ * Copy uncompressed data from working buffer to pages.
+ *
+ * buf_start is the byte offset of our working buffer within the
+ * uncompressed data.
+ *
+ * total_out is the uncompressed offset just past the end of the buffer.
+ */
+int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
+ unsigned long total_out, u64 disk_start,
+ struct bio_vec *bvec, int vcnt,
+ unsigned long *page_index,
+ unsigned long *pg_offset)
+{
+ unsigned long buf_offset;
+ unsigned long current_buf_start;
+ unsigned long start_byte;
+ unsigned long working_bytes = total_out - buf_start;
+ unsigned long bytes;
+ char *kaddr;
+ struct page *page_out = bvec[*page_index].bv_page;
+
+ /*
+ * start byte is the first byte of the page we're currently
+	 * copying into, relative to the start of the uncompressed data.
+ */
+ start_byte = page_offset(page_out) - disk_start;
+
+ /* we haven't yet hit data corresponding to this page */
+ if (total_out <= start_byte)
+ return 1;
+
+ /*
+ * the start of the data we care about is offset into
+ * the middle of our working buffer
+ */
+ if (total_out > start_byte && buf_start < start_byte) {
+ buf_offset = start_byte - buf_start;
+ working_bytes -= buf_offset;
+ } else {
+ buf_offset = 0;
+ }
+ current_buf_start = buf_start;
+
+ /* copy bytes from the working buffer into the pages */
+ while (working_bytes > 0) {
+ bytes = min(PAGE_CACHE_SIZE - *pg_offset,
+ PAGE_CACHE_SIZE - buf_offset);
+ bytes = min(bytes, working_bytes);
+ kaddr = kmap_atomic(page_out, KM_USER0);
+ memcpy(kaddr + *pg_offset, buf + buf_offset, bytes);
+ kunmap_atomic(kaddr, KM_USER0);
+ flush_dcache_page(page_out);
+
+ *pg_offset += bytes;
+ buf_offset += bytes;
+ working_bytes -= bytes;
+ current_buf_start += bytes;
+
+ /* check if we need to pick another page */
+ if (*pg_offset == PAGE_CACHE_SIZE) {
+ (*page_index)++;
+ if (*page_index >= vcnt)
+ return 0;
+
+ page_out = bvec[*page_index].bv_page;
+ *pg_offset = 0;
+ start_byte = page_offset(page_out) - disk_start;
+
+ /*
+ * make sure our new page is covered by this
+ * working buffer
+ */
+ if (total_out <= start_byte)
+ return 1;
+
+ /*
+ * the next page in the biovec might not be adjacent
+ * to the last page, but it might still be found
+ * inside this working buffer. bump our offset pointer
+ */
+ if (total_out > start_byte &&
+ current_buf_start < start_byte) {
+ buf_offset = start_byte - buf_start;
+ working_bytes = total_out - start_byte;
+ current_buf_start = buf_start + buf_offset;
+ }
+ }
+ }
+
+ return 1;
+}
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 421f5b4aa715..51000174b9d7 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -19,24 +19,27 @@
#ifndef __BTRFS_COMPRESSION_
#define __BTRFS_COMPRESSION_
-int btrfs_zlib_decompress(unsigned char *data_in,
- struct page *dest_page,
- unsigned long start_byte,
- size_t srclen, size_t destlen);
-int btrfs_zlib_compress_pages(struct address_space *mapping,
- u64 start, unsigned long len,
- struct page **pages,
- unsigned long nr_dest_pages,
- unsigned long *out_pages,
- unsigned long *total_in,
- unsigned long *total_out,
- unsigned long max_out);
-int btrfs_zlib_decompress_biovec(struct page **pages_in,
- u64 disk_start,
- struct bio_vec *bvec,
- int vcnt,
- size_t srclen);
-void btrfs_zlib_exit(void);
+int btrfs_init_compress(void);
+void btrfs_exit_compress(void);
+
+int btrfs_compress_pages(int type, struct address_space *mapping,
+ u64 start, unsigned long len,
+ struct page **pages,
+ unsigned long nr_dest_pages,
+ unsigned long *out_pages,
+ unsigned long *total_in,
+ unsigned long *total_out,
+ unsigned long max_out);
+int btrfs_decompress_biovec(int type, struct page **pages_in, u64 disk_start,
+ struct bio_vec *bvec, int vcnt, size_t srclen);
+int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
+ unsigned long start_byte, size_t srclen, size_t destlen);
+int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
+ unsigned long total_out, u64 disk_start,
+ struct bio_vec *bvec, int vcnt,
+ unsigned long *page_index,
+ unsigned long *pg_offset);
+
int btrfs_submit_compressed_write(struct inode *inode, u64 start,
unsigned long len, u64 disk_start,
unsigned long compressed_len,
@@ -44,4 +47,37 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
unsigned long nr_pages);
int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
int mirror_num, unsigned long bio_flags);
+
+struct btrfs_compress_op {
+ struct list_head *(*alloc_workspace)(void);
+
+ void (*free_workspace)(struct list_head *workspace);
+
+ int (*compress_pages)(struct list_head *workspace,
+ struct address_space *mapping,
+ u64 start, unsigned long len,
+ struct page **pages,
+ unsigned long nr_dest_pages,
+ unsigned long *out_pages,
+ unsigned long *total_in,
+ unsigned long *total_out,
+ unsigned long max_out);
+
+ int (*decompress_biovec)(struct list_head *workspace,
+ struct page **pages_in,
+ u64 disk_start,
+ struct bio_vec *bvec,
+ int vcnt,
+ size_t srclen);
+
+ int (*decompress)(struct list_head *workspace,
+ unsigned char *data_in,
+ struct page *dest_page,
+ unsigned long start_byte,
+ size_t srclen, size_t destlen);
+};
+
+extern struct btrfs_compress_op btrfs_zlib_compress;
+extern struct btrfs_compress_op btrfs_lzo_compress;
+
#endif
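
The btrfs_compress_op table above is what lets zlib and lzo coexist
behind one call interface; btrfs_zlib_compress and btrfs_lzo_compress
are the two real instances, presumably kept in an array in
compression.c indexed by the type argument. A skeletal sketch of what a
backend supplies, with every example_* name hypothetical and the bodies
reduced to stubs:

	static struct list_head *example_alloc_workspace(void)
	{
		return ERR_PTR(-ENOMEM);	/* stub */
	}

	static void example_free_workspace(struct list_head *ws)
	{
	}

	static int example_compress_pages(struct list_head *ws,
					  struct address_space *mapping,
					  u64 start, unsigned long len,
					  struct page **pages,
					  unsigned long nr_dest_pages,
					  unsigned long *out_pages,
					  unsigned long *total_in,
					  unsigned long *total_out,
					  unsigned long max_out)
	{
		return -1;	/* stub: pretend the data did not shrink */
	}

	static int example_decompress_biovec(struct list_head *ws,
					     struct page **pages_in,
					     u64 disk_start,
					     struct bio_vec *bvec,
					     int vcnt, size_t srclen)
	{
		return -1;	/* stub */
	}

	static int example_decompress(struct list_head *ws,
				      unsigned char *data_in,
				      struct page *dest_page,
				      unsigned long start_byte,
				      size_t srclen, size_t destlen)
	{
		return -1;	/* stub */
	}

	struct btrfs_compress_op example_compress_op = {
		.alloc_workspace	= example_alloc_workspace,
		.free_workspace		= example_free_workspace,
		.compress_pages		= example_compress_pages,
		.decompress_biovec	= example_decompress_biovec,
		.decompress		= example_decompress,
	};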
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 9ac171599258..b5baff0dccfe 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -105,6 +105,8 @@ noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
/* this also releases the path */
void btrfs_free_path(struct btrfs_path *p)
{
+ if (!p)
+ return;
btrfs_release_path(NULL, p);
kmem_cache_free(btrfs_path_cachep, p);
}
@@ -2514,6 +2516,9 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
btrfs_assert_tree_locked(path->nodes[1]);
right = read_node_slot(root, upper, slot + 1);
+ if (right == NULL)
+ return 1;
+
btrfs_tree_lock(right);
btrfs_set_lock_blocking(right);
@@ -2764,6 +2769,9 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
btrfs_assert_tree_locked(path->nodes[1]);
left = read_node_slot(root, path->nodes[1], slot - 1);
+ if (left == NULL)
+ return 1;
+
btrfs_tree_lock(left);
btrfs_set_lock_blocking(left);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index a142d204b526..2c98b3af6052 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -27,6 +27,7 @@
#include <linux/backing-dev.h>
#include <linux/wait.h>
#include <linux/slab.h>
+#include <linux/kobject.h>
#include <asm/kmap_types.h>
#include "extent_io.h"
#include "extent_map.h"
@@ -294,6 +295,14 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
#define BTRFS_FSID_SIZE 16
#define BTRFS_HEADER_FLAG_WRITTEN (1ULL << 0)
#define BTRFS_HEADER_FLAG_RELOC (1ULL << 1)
+
+/*
+ * File system states
+ */
+
+/* Errors detected */
+#define BTRFS_SUPER_FLAG_ERROR (1ULL << 2)
+
#define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32)
#define BTRFS_SUPER_FLAG_METADUMP (1ULL << 33)
@@ -398,13 +407,15 @@ struct btrfs_super_block {
#define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0)
#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1)
#define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS (1ULL << 2)
+#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO (1ULL << 3)
#define BTRFS_FEATURE_COMPAT_SUPP 0ULL
#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL
#define BTRFS_FEATURE_INCOMPAT_SUPP \
(BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \
BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \
- BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
+ BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \
+ BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO)
/*
* A leaf is full of items. offset and size tell us where to find
@@ -551,9 +562,11 @@ struct btrfs_timespec {
} __attribute__ ((__packed__));
enum btrfs_compression_type {
- BTRFS_COMPRESS_NONE = 0,
- BTRFS_COMPRESS_ZLIB = 1,
- BTRFS_COMPRESS_LAST = 2,
+ BTRFS_COMPRESS_NONE = 0,
+ BTRFS_COMPRESS_ZLIB = 1,
+ BTRFS_COMPRESS_LZO = 2,
+ BTRFS_COMPRESS_TYPES = 2,
+ BTRFS_COMPRESS_LAST = 3,
};
struct btrfs_inode_item {
@@ -597,6 +610,8 @@ struct btrfs_dir_item {
u8 type;
} __attribute__ ((__packed__));
+#define BTRFS_ROOT_SUBVOL_RDONLY (1ULL << 0)
+
struct btrfs_root_item {
struct btrfs_inode_item inode;
__le64 generation;
@@ -895,7 +910,8 @@ struct btrfs_fs_info {
*/
u64 last_trans_log_full_commit;
u64 open_ioctl_trans;
- unsigned long mount_opt;
+ unsigned long mount_opt:20;
+ unsigned long compress_type:4;
u64 max_inline;
u64 alloc_start;
struct btrfs_transaction *running_transaction;
@@ -1050,6 +1066,9 @@ struct btrfs_fs_info {
unsigned metadata_ratio;
void *bdev_holder;
+
+ /* filesystem state */
+ u64 fs_state;
};
/*
@@ -1893,6 +1912,11 @@ BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64);
BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item,
last_snapshot, 64);
+static inline bool btrfs_root_readonly(struct btrfs_root *root)
+{
+ return root->root_item.flags & BTRFS_ROOT_SUBVOL_RDONLY;
+}
+
/* struct btrfs_super_block */
BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
@@ -2145,6 +2169,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 group_start);
u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
+u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode);
void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
@@ -2188,6 +2213,12 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
int btrfs_set_block_group_rw(struct btrfs_root *root,
struct btrfs_block_group_cache *cache);
void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
+u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
+int btrfs_error_unpin_extent_range(struct btrfs_root *root,
+ u64 start, u64 end);
+int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes);
+
/* ctree.c */
int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
int level, int *slot);
@@ -2541,6 +2572,14 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
/* super.c */
int btrfs_parse_options(struct btrfs_root *root, char *options);
int btrfs_sync_fs(struct super_block *sb, int wait);
+void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
+ unsigned int line, int errno);
+
+#define btrfs_std_error(fs_info, errno) \
+do { \
+ if ((errno)) \
+ __btrfs_std_error((fs_info), __func__, __LINE__, (errno));\
+} while (0)
/* acl.c */
#ifdef CONFIG_BTRFS_FS_POSIX_ACL
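
A brief usage note on the btrfs_std_error() macro declared in the
super.c block above: it is a no-op when errno is zero, and otherwise
hands the calling function and line to __btrfs_std_error() so the
failure site is recorded. A minimal caller sketch; the function name
and the forced -EIO are hypothetical:

	static int example_write_step(struct btrfs_fs_info *fs_info)
	{
		int err = -EIO;		/* pretend a metadata write failed */

		btrfs_std_error(fs_info, err);	/* captures __func__, __LINE__ */
		return err;
	}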
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 51d2e4de34eb..b531c36455d8 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -44,6 +44,20 @@
static struct extent_io_ops btree_extent_io_ops;
static void end_workqueue_fn(struct btrfs_work *work);
static void free_fs_root(struct btrfs_root *root);
+static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
+ int read_only);
+static int btrfs_destroy_ordered_operations(struct btrfs_root *root);
+static int btrfs_destroy_ordered_extents(struct btrfs_root *root);
+static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
+ struct btrfs_root *root);
+static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t);
+static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root);
+static int btrfs_destroy_marked_extents(struct btrfs_root *root,
+ struct extent_io_tree *dirty_pages,
+ int mark);
+static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
+ struct extent_io_tree *pinned_extents);
+static int btrfs_cleanup_transaction(struct btrfs_root *root);
/*
* end_io_wq structs are used to do processing in task context when an IO is
@@ -353,6 +367,10 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
WARN_ON(len == 0);
eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
+ if (eb == NULL) {
+ WARN_ON(1);
+ goto out;
+ }
ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE,
btrfs_header_generation(eb));
BUG_ON(ret);
@@ -427,6 +445,10 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
WARN_ON(len == 0);
eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
+ if (eb == NULL) {
+ ret = -EIO;
+ goto out;
+ }
found_start = btrfs_header_bytenr(eb);
if (found_start != start) {
@@ -1145,6 +1167,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
}
btrfs_free_path(path);
if (ret) {
+ kfree(root);
if (ret > 0)
ret = -ENOENT;
return ERR_PTR(ret);
@@ -1713,8 +1736,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
fs_info, BTRFS_ROOT_TREE_OBJECTID);
bh = btrfs_read_dev_super(fs_devices->latest_bdev);
- if (!bh)
+ if (!bh) {
+ err = -EINVAL;
goto fail_iput;
+ }
memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy));
memcpy(&fs_info->super_for_commit, &fs_info->super_copy,
@@ -1727,6 +1752,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
if (!btrfs_super_root(disk_super))
goto fail_iput;
+ /* check FS state, whether FS is broken. */
+ fs_info->fs_state |= btrfs_super_flags(disk_super);
+
+ btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
+
ret = btrfs_parse_options(tree_root, options);
if (ret) {
err = ret;
@@ -1744,10 +1774,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
}
features = btrfs_super_incompat_flags(disk_super);
- if (!(features & BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF)) {
- features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
- btrfs_set_super_incompat_flags(disk_super, features);
- }
+ features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
+ if (tree_root->fs_info->compress_type & BTRFS_COMPRESS_LZO)
+ features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
+ btrfs_set_super_incompat_flags(disk_super, features);
features = btrfs_super_compat_ro_flags(disk_super) &
~BTRFS_FEATURE_COMPAT_RO_SUPP;
@@ -1957,7 +1987,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
btrfs_set_opt(fs_info->mount_opt, SSD);
}
- if (btrfs_super_log_root(disk_super) != 0) {
+ /* do not make disk changes in broken FS */
+ if (btrfs_super_log_root(disk_super) != 0 &&
+ !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) {
u64 bytenr = btrfs_super_log_root(disk_super);
if (fs_devices->rw_devices == 0) {
@@ -2442,8 +2474,28 @@ int close_ctree(struct btrfs_root *root)
smp_mb();
btrfs_put_block_group_cache(fs_info);
+
+ /*
+ * Two situations can arise when a broken btrfs has been flipped
+ * read-only:
+ *
+ * 1. btrfs flips read-only somewhere else before btrfs_commit_super:
+ * sb->s_flags already has MS_RDONLY set, so btrfs skips writing the
+ * sb here to keep the ERROR state on disk.
+ *
+ * 2. btrfs flips read-only inside btrfs_commit_super itself: btrfs
+ * cannot write the sb via btrfs_commit_super, and since fs_state has
+ * the BTRFS_SUPER_FLAG_ERROR flag set, btrfs cleans up all FS
+ * resources first and writes the sb afterwards.
+ */
if (!(fs_info->sb->s_flags & MS_RDONLY)) {
- ret = btrfs_commit_super(root);
+ ret = btrfs_commit_super(root);
+ if (ret)
+ printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
+ }
+
+ if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+ ret = btrfs_error_commit_super(root);
if (ret)
printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
}
@@ -2619,6 +2671,352 @@ out:
return 0;
}
+static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
+ int read_only)
+{
+ if (read_only)
+ return;
+
+ if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
+ printk(KERN_WARNING "warning: mount fs with errors, "
+ "running btrfsck is recommended\n");
+}
+
+int btrfs_error_commit_super(struct btrfs_root *root)
+{
+ int ret;
+
+ mutex_lock(&root->fs_info->cleaner_mutex);
+ btrfs_run_delayed_iputs(root);
+ mutex_unlock(&root->fs_info->cleaner_mutex);
+
+ down_write(&root->fs_info->cleanup_work_sem);
+ up_write(&root->fs_info->cleanup_work_sem);
+
+ /* cleanup FS via transaction */
+ btrfs_cleanup_transaction(root);
+
+ ret = write_ctree_super(NULL, root, 0);
+
+ return ret;
+}
+
+static int btrfs_destroy_ordered_operations(struct btrfs_root *root)
+{
+ struct btrfs_inode *btrfs_inode;
+ struct list_head splice;
+
+ INIT_LIST_HEAD(&splice);
+
+ mutex_lock(&root->fs_info->ordered_operations_mutex);
+ spin_lock(&root->fs_info->ordered_extent_lock);
+
+ list_splice_init(&root->fs_info->ordered_operations, &splice);
+ while (!list_empty(&splice)) {
+ btrfs_inode = list_entry(splice.next, struct btrfs_inode,
+ ordered_operations);
+
+ list_del_init(&btrfs_inode->ordered_operations);
+
+ btrfs_invalidate_inodes(btrfs_inode->root);
+ }
+
+ spin_unlock(&root->fs_info->ordered_extent_lock);
+ mutex_unlock(&root->fs_info->ordered_operations_mutex);
+
+ return 0;
+}
+
+static int btrfs_destroy_ordered_extents(struct btrfs_root *root)
+{
+ struct list_head splice;
+ struct btrfs_ordered_extent *ordered;
+ struct inode *inode;
+
+ INIT_LIST_HEAD(&splice);
+
+ spin_lock(&root->fs_info->ordered_extent_lock);
+
+ list_splice_init(&root->fs_info->ordered_extents, &splice);
+ while (!list_empty(&splice)) {
+ ordered = list_entry(splice.next, struct btrfs_ordered_extent,
+ root_extent_list);
+
+ list_del_init(&ordered->root_extent_list);
+ atomic_inc(&ordered->refs);
+
+ /* the inode may be getting freed (in sys_unlink path). */
+ inode = igrab(ordered->inode);
+
+ spin_unlock(&root->fs_info->ordered_extent_lock);
+ if (inode)
+ iput(inode);
+
+ atomic_set(&ordered->refs, 1);
+ btrfs_put_ordered_extent(ordered);
+
+ spin_lock(&root->fs_info->ordered_extent_lock);
+ }
+
+ spin_unlock(&root->fs_info->ordered_extent_lock);
+
+ return 0;
+}
+
+static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
+ struct btrfs_root *root)
+{
+ struct rb_node *node;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_delayed_ref_node *ref;
+ int ret = 0;
+
+ delayed_refs = &trans->delayed_refs;
+
+ spin_lock(&delayed_refs->lock);
+ if (delayed_refs->num_entries == 0) {
+ spin_unlock(&delayed_refs->lock);
+ printk(KERN_INFO "delayed_refs has NO entry\n");
+ return ret;
+ }
+
+ node = rb_first(&delayed_refs->root);
+ while (node) {
+ ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+ node = rb_next(node);
+
+ ref->in_tree = 0;
+ rb_erase(&ref->rb_node, &delayed_refs->root);
+ delayed_refs->num_entries--;
+
+ atomic_set(&ref->refs, 1);
+ if (btrfs_delayed_ref_is_head(ref)) {
+ struct btrfs_delayed_ref_head *head;
+
+ head = btrfs_delayed_node_to_head(ref);
+ mutex_lock(&head->mutex);
+ kfree(head->extent_op);
+ delayed_refs->num_heads--;
+ if (list_empty(&head->cluster))
+ delayed_refs->num_heads_ready--;
+ list_del_init(&head->cluster);
+ mutex_unlock(&head->mutex);
+ }
+
+ spin_unlock(&delayed_refs->lock);
+ btrfs_put_delayed_ref(ref);
+
+ cond_resched();
+ spin_lock(&delayed_refs->lock);
+ }
+
+ spin_unlock(&delayed_refs->lock);
+
+ return ret;
+}
+
+static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t)
+{
+ struct btrfs_pending_snapshot *snapshot;
+ struct list_head splice;
+
+ INIT_LIST_HEAD(&splice);
+
+ list_splice_init(&t->pending_snapshots, &splice);
+
+ while (!list_empty(&splice)) {
+ snapshot = list_entry(splice.next,
+ struct btrfs_pending_snapshot,
+ list);
+
+ list_del_init(&snapshot->list);
+
+ kfree(snapshot);
+ }
+
+ return 0;
+}
+
+static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
+{
+ struct btrfs_inode *btrfs_inode;
+ struct list_head splice;
+
+ INIT_LIST_HEAD(&splice);
+
+ spin_lock(&root->fs_info->delalloc_lock);
+
+ list_splice_init(&root->fs_info->delalloc_inodes, &splice);
+
+ while (!list_empty(&splice)) {
+ btrfs_inode = list_entry(splice.next, struct btrfs_inode,
+ delalloc_inodes);
+
+ list_del_init(&btrfs_inode->delalloc_inodes);
+
+ btrfs_invalidate_inodes(btrfs_inode->root);
+ }
+
+ spin_unlock(&root->fs_info->delalloc_lock);
+
+ return 0;
+}
+
+static int btrfs_destroy_marked_extents(struct btrfs_root *root,
+ struct extent_io_tree *dirty_pages,
+ int mark)
+{
+ int ret;
+ struct page *page;
+ struct inode *btree_inode = root->fs_info->btree_inode;
+ struct extent_buffer *eb;
+ u64 start = 0;
+ u64 end;
+ u64 offset;
+ unsigned long index;
+
+ while (1) {
+ ret = find_first_extent_bit(dirty_pages, start, &start, &end,
+ mark);
+ if (ret)
+ break;
+
+ clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
+ while (start <= end) {
+ index = start >> PAGE_CACHE_SHIFT;
+ start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
+ page = find_get_page(btree_inode->i_mapping, index);
+ if (!page)
+ continue;
+ offset = page_offset(page);
+
+ spin_lock(&dirty_pages->buffer_lock);
+ eb = radix_tree_lookup(
+ &(&BTRFS_I(page->mapping->host)->io_tree)->buffer,
+ offset >> PAGE_CACHE_SHIFT);
+ spin_unlock(&dirty_pages->buffer_lock);
+ if (eb) {
+ ret = test_and_clear_bit(EXTENT_BUFFER_DIRTY,
+ &eb->bflags);
+ atomic_set(&eb->refs, 1);
+ }
+ if (PageWriteback(page))
+ end_page_writeback(page);
+
+ lock_page(page);
+ if (PageDirty(page)) {
+ clear_page_dirty_for_io(page);
+ spin_lock_irq(&page->mapping->tree_lock);
+ radix_tree_tag_clear(&page->mapping->page_tree,
+ page_index(page),
+ PAGECACHE_TAG_DIRTY);
+ spin_unlock_irq(&page->mapping->tree_lock);
+ }
+
+ page->mapping->a_ops->invalidatepage(page, 0);
+ unlock_page(page);
+ }
+ }
+
+ return ret;
+}
+
+static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
+ struct extent_io_tree *pinned_extents)
+{
+ struct extent_io_tree *unpin;
+ u64 start;
+ u64 end;
+ int ret;
+
+ unpin = pinned_extents;
+ while (1) {
+ ret = find_first_extent_bit(unpin, 0, &start, &end,
+ EXTENT_DIRTY);
+ if (ret)
+ break;
+
+ /* opt_discard */
+ ret = btrfs_error_discard_extent(root, start, end + 1 - start);
+
+ clear_extent_dirty(unpin, start, end, GFP_NOFS);
+ btrfs_error_unpin_extent_range(root, start, end);
+ cond_resched();
+ }
+
+ return 0;
+}
+
+static int btrfs_cleanup_transaction(struct btrfs_root *root)
+{
+ struct btrfs_transaction *t;
+ LIST_HEAD(list);
+
+ WARN_ON(1);
+
+ mutex_lock(&root->fs_info->trans_mutex);
+ mutex_lock(&root->fs_info->transaction_kthread_mutex);
+
+ list_splice_init(&root->fs_info->trans_list, &list);
+ while (!list_empty(&list)) {
+ t = list_entry(list.next, struct btrfs_transaction, list);
+ if (!t)
+ break;
+
+ btrfs_destroy_ordered_operations(root);
+
+ btrfs_destroy_ordered_extents(root);
+
+ btrfs_destroy_delayed_refs(t, root);
+
+ btrfs_block_rsv_release(root,
+ &root->fs_info->trans_block_rsv,
+ t->dirty_pages.dirty_bytes);
+
+ /* FIXME: cleanup wait for commit */
+ t->in_commit = 1;
+ t->blocked = 1;
+ if (waitqueue_active(&root->fs_info->transaction_blocked_wait))
+ wake_up(&root->fs_info->transaction_blocked_wait);
+
+ t->blocked = 0;
+ if (waitqueue_active(&root->fs_info->transaction_wait))
+ wake_up(&root->fs_info->transaction_wait);
+ mutex_unlock(&root->fs_info->trans_mutex);
+
+ mutex_lock(&root->fs_info->trans_mutex);
+ t->commit_done = 1;
+ if (waitqueue_active(&t->commit_wait))
+ wake_up(&t->commit_wait);
+ mutex_unlock(&root->fs_info->trans_mutex);
+
+ mutex_lock(&root->fs_info->trans_mutex);
+
+ btrfs_destroy_pending_snapshots(t);
+
+ btrfs_destroy_delalloc_inodes(root);
+
+ spin_lock(&root->fs_info->new_trans_lock);
+ root->fs_info->running_transaction = NULL;
+ spin_unlock(&root->fs_info->new_trans_lock);
+
+ btrfs_destroy_marked_extents(root, &t->dirty_pages,
+ EXTENT_DIRTY);
+
+ btrfs_destroy_pinned_extent(root,
+ root->fs_info->pinned_extents);
+
+ t->use_count = 0;
+ list_del_init(&t->list);
+ memset(t, 0, sizeof(*t));
+ kmem_cache_free(btrfs_transaction_cachep, t);
+ }
+
+ mutex_unlock(&root->fs_info->transaction_kthread_mutex);
+ mutex_unlock(&root->fs_info->trans_mutex);
+
+ return 0;
+}
+
static struct extent_io_ops btree_extent_io_ops = {
.write_cache_pages_lock_hook = btree_lock_page_hook,
.readpage_end_io_hook = btree_readpage_end_io_hook,
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 88e825a0bf21..07b20dc2fd95 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -52,6 +52,7 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
struct btrfs_root *root, int max_mirrors);
struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
int btrfs_commit_super(struct btrfs_root *root);
+int btrfs_error_commit_super(struct btrfs_root *root);
struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
u64 bytenr, u32 blocksize);
struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 0ccf9a8afcdf..9786963b07e5 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -65,7 +65,6 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
{
struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info;
struct btrfs_root *root;
- struct dentry *dentry;
struct inode *inode;
struct btrfs_key key;
int index;
@@ -108,10 +107,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
return ERR_PTR(-ESTALE);
}
- dentry = d_obtain_alias(inode);
- if (!IS_ERR(dentry))
- d_set_d_op(dentry, &btrfs_dentry_operations);
- return dentry;
+ return d_obtain_alias(inode);
fail:
srcu_read_unlock(&fs_info->subvol_srcu, index);
return ERR_PTR(err);
@@ -166,7 +162,6 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
static struct dentry *btrfs_get_parent(struct dentry *child)
{
struct inode *dir = child->d_inode;
- struct dentry *dentry;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_path *path;
struct extent_buffer *leaf;
@@ -223,10 +218,7 @@ static struct dentry *btrfs_get_parent(struct dentry *child)
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
- dentry = d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL));
- if (!IS_ERR(dentry))
- d_set_d_op(dentry, &btrfs_dentry_operations);
- return dentry;
+ return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL));
fail:
btrfs_free_path(path);
return ERR_PTR(ret);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 227e5815d838..b55269340cec 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3089,7 +3089,7 @@ static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
return btrfs_reduce_alloc_profile(root, flags);
}
-static u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
+u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
{
u64 flags;
@@ -3161,8 +3161,12 @@ alloc:
bytes + 2 * 1024 * 1024,
alloc_target, 0);
btrfs_end_transaction(trans, root);
- if (ret < 0)
- return ret;
+ if (ret < 0) {
+ if (ret != -ENOSPC)
+ return ret;
+ else
+ goto commit_trans;
+ }
if (!data_sinfo) {
btrfs_set_inode_space_info(root, inode);
@@ -3173,6 +3177,7 @@ alloc:
spin_unlock(&data_sinfo->lock);
/* commit the current transaction and try again */
+commit_trans:
if (!committed && !root->fs_info->open_ioctl_trans) {
committed = 1;
trans = btrfs_join_transaction(root, 1);
@@ -3721,11 +3726,6 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
return 0;
}
- WARN_ON(1);
- printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n",
- block_rsv->size, block_rsv->reserved,
- block_rsv->freed[0], block_rsv->freed[1]);
-
return -ENOSPC;
}
@@ -7970,13 +7970,14 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache)
if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
sinfo->bytes_may_use + sinfo->bytes_readonly +
- cache->reserved_pinned + num_bytes < sinfo->total_bytes) {
+ cache->reserved_pinned + num_bytes <= sinfo->total_bytes) {
sinfo->bytes_readonly += num_bytes;
sinfo->bytes_reserved += cache->reserved_pinned;
cache->reserved_pinned = 0;
cache->ro = 1;
ret = 0;
}
+
spin_unlock(&cache->lock);
spin_unlock(&sinfo->lock);
return ret;
@@ -8012,6 +8013,62 @@ out:
return ret;
}
+/*
+ * helper to account the unused space of all the readonly block groups
+ * in the list. takes mirrors into account.
+ */
+static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list)
+{
+ struct btrfs_block_group_cache *block_group;
+ u64 free_bytes = 0;
+ int factor;
+
+ list_for_each_entry(block_group, groups_list, list) {
+ spin_lock(&block_group->lock);
+
+ if (!block_group->ro) {
+ spin_unlock(&block_group->lock);
+ continue;
+ }
+
+ if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID10 |
+ BTRFS_BLOCK_GROUP_DUP))
+ factor = 2;
+ else
+ factor = 1;
+
+ free_bytes += (block_group->key.offset -
+ btrfs_block_group_used(&block_group->item)) *
+ factor;
+
+ spin_unlock(&block_group->lock);
+ }
+
+ return free_bytes;
+}
+
+/*
+ * helper to account the unused space of all the readonly block groups
+ * in the space_info. takes mirrors into account.
+ */
+u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
+{
+ int i;
+ u64 free_bytes = 0;
+
+ spin_lock(&sinfo->lock);
+
+ for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
+ if (!list_empty(&sinfo->block_groups[i]))
+ free_bytes += __btrfs_get_ro_block_group_free_space(
+ &sinfo->block_groups[i]);
+
+ spin_unlock(&sinfo->lock);
+
+ return free_bytes;
+}
+
int btrfs_set_block_group_rw(struct btrfs_root *root,
struct btrfs_block_group_cache *cache)
{
@@ -8092,7 +8149,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
mutex_lock(&root->fs_info->chunk_mutex);
list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
u64 min_free = btrfs_block_group_used(&block_group->item);
- u64 dev_offset, max_avail;
+ u64 dev_offset;
/*
* check to make sure we can actually find a chunk with enough
@@ -8100,7 +8157,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
*/
if (device->total_bytes > device->bytes_used + min_free) {
ret = find_free_dev_extent(NULL, device, min_free,
- &dev_offset, &max_avail);
+ &dev_offset, NULL);
if (!ret)
break;
ret = -1;
@@ -8584,3 +8641,14 @@ out:
btrfs_free_path(path);
return ret;
}
+
+int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
+{
+ return unpin_extent_range(root, start, end);
+}
+
+int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes)
+{
+ return btrfs_discard_extent(root, bytenr, num_bytes);
+}
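
To make the mirror factor in __btrfs_get_ro_block_group_free_space()
concrete: a 1 GiB RAID1 block group with 256 MiB used contributes
(1 GiB - 256 MiB) * 2 = 1.5 GiB of raw free space, because every byte
occupies two stripes. A self-contained illustration with made-up
numbers:

	static u64 example_ro_free_bytes(void)
	{
		u64 offset = 1024ULL * 1024 * 1024;	/* block group size */
		u64 used = 256ULL * 1024 * 1024;	/* bytes used */
		int factor = 2;				/* RAID1/RAID10/DUP */

		return (offset - used) * factor;	/* 1.5 GiB raw */
	}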
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 3e86b9f36507..2e993cf1766e 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2028,8 +2028,11 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
BUG_ON(extent_map_end(em) <= cur);
BUG_ON(end < cur);
- if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+ if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
this_bio_flag = EXTENT_BIO_COMPRESSED;
+ extent_set_compress_type(&this_bio_flag,
+ em->compress_type);
+ }
iosize = min(extent_map_end(em) - cur, end - cur + 1);
cur_end = min(extent_map_end(em) - 1, end);
@@ -3072,6 +3075,8 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
#endif
eb = kmem_cache_zalloc(extent_buffer_cache, mask);
+ if (eb == NULL)
+ return NULL;
eb->start = start;
eb->len = len;
spin_lock_init(&eb->lock);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 4183c8178f01..7083cfafd061 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -20,8 +20,12 @@
#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
-/* flags for bio submission */
+/*
+ * flags for bio submission. The high bits indicate the compression
+ * type for this bio
+ */
#define EXTENT_BIO_COMPRESSED 1
+#define EXTENT_BIO_FLAG_SHIFT 16
/* these are bit numbers for test/set bit */
#define EXTENT_BUFFER_UPTODATE 0
@@ -135,6 +139,17 @@ struct extent_buffer {
wait_queue_head_t lock_wq;
};
+static inline void extent_set_compress_type(unsigned long *bio_flags,
+ int compress_type)
+{
+ *bio_flags |= compress_type << EXTENT_BIO_FLAG_SHIFT;
+}
+
+static inline int extent_compress_type(unsigned long bio_flags)
+{
+ return bio_flags >> EXTENT_BIO_FLAG_SHIFT;
+}
+
struct extent_map_tree;
static inline struct extent_state *extent_state_next(struct extent_state *state)
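
The pair of inline helpers added to extent_io.h packs the compression
type into the bits above EXTENT_BIO_FLAG_SHIFT while the low bits keep
carrying EXTENT_BIO_COMPRESSED. A small round-trip sketch; only the
function name is hypothetical:

	static int example_bio_flag_roundtrip(void)
	{
		unsigned long bio_flags = EXTENT_BIO_COMPRESSED;

		extent_set_compress_type(&bio_flags, BTRFS_COMPRESS_LZO);

		/* the low bit still says "compressed" */
		WARN_ON(!(bio_flags & EXTENT_BIO_COMPRESSED));

		/* the high bits name the codec: 2 == BTRFS_COMPRESS_LZO */
		return extent_compress_type(bio_flags);
	}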
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 23cb8da3ff66..b0e1fce12530 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -3,6 +3,7 @@
#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/hardirq.h>
+#include "ctree.h"
#include "extent_map.h"
@@ -54,6 +55,7 @@ struct extent_map *alloc_extent_map(gfp_t mask)
return em;
em->in_tree = 0;
em->flags = 0;
+ em->compress_type = BTRFS_COMPRESS_NONE;
atomic_set(&em->refs, 1);
return em;
}
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index ab6d74b6e647..28b44dbd1e35 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -26,7 +26,8 @@ struct extent_map {
unsigned long flags;
struct block_device *bdev;
atomic_t refs;
- int in_tree;
+ unsigned int in_tree:1;
+ unsigned int compress_type:4;
};
struct extent_map_tree {
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 66836d85763b..c800d58f3013 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -24,6 +24,7 @@
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/mpage.h>
+#include <linux/falloc.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/statfs.h>
@@ -224,6 +225,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
split->bdev = em->bdev;
split->flags = flags;
+ split->compress_type = em->compress_type;
ret = add_extent_mapping(em_tree, split);
BUG_ON(ret);
free_extent_map(split);
@@ -238,6 +240,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
split->len = em->start + em->len - (start + len);
split->bdev = em->bdev;
split->flags = flags;
+ split->compress_type = em->compress_type;
if (compressed) {
split->block_len = em->block_len;
@@ -890,6 +893,17 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
if (err)
goto out;
+ /*
+ * If BTRFS flips read-only due to an unrecoverable error
+ * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR),
+ * we must stop this write operation to ensure FS
+ * consistency, even though the file was opened writable.
+ */
+ if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+ err = -EROFS;
+ goto out;
+ }
+
file_update_time(file);
BTRFS_I(inode)->sequence++;
@@ -1237,6 +1251,117 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
return 0;
}
+static long btrfs_fallocate(struct file *file, int mode,
+ loff_t offset, loff_t len)
+{
+ struct inode *inode = file->f_path.dentry->d_inode;
+ struct extent_state *cached_state = NULL;
+ u64 cur_offset;
+ u64 last_byte;
+ u64 alloc_start;
+ u64 alloc_end;
+ u64 alloc_hint = 0;
+ u64 locked_end;
+ u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
+ struct extent_map *em;
+ int ret;
+
+ alloc_start = offset & ~mask;
+ alloc_end = (offset + len + mask) & ~mask;
+
+ /* We only support the FALLOC_FL_KEEP_SIZE mode */
+ if (mode & ~FALLOC_FL_KEEP_SIZE)
+ return -EOPNOTSUPP;
+
+ /*
+ * wait for ordered IO before we have any locks. We'll loop again
+ * below with the locks held.
+ */
+ btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
+
+ mutex_lock(&inode->i_mutex);
+ ret = inode_newsize_ok(inode, alloc_end);
+ if (ret)
+ goto out;
+
+ if (alloc_start > inode->i_size) {
+ ret = btrfs_cont_expand(inode, alloc_start);
+ if (ret)
+ goto out;
+ }
+
+ ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
+ if (ret)
+ goto out;
+
+ locked_end = alloc_end - 1;
+ while (1) {
+ struct btrfs_ordered_extent *ordered;
+
+ /* the extent lock is ordered inside the running
+ * transaction
+ */
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
+ locked_end, 0, &cached_state, GFP_NOFS);
+ ordered = btrfs_lookup_first_ordered_extent(inode,
+ alloc_end - 1);
+ if (ordered &&
+ ordered->file_offset + ordered->len > alloc_start &&
+ ordered->file_offset < alloc_end) {
+ btrfs_put_ordered_extent(ordered);
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+ alloc_start, locked_end,
+ &cached_state, GFP_NOFS);
+ /*
+ * we can't wait on the range with the transaction
+ * running or with the extent lock held
+ */
+ btrfs_wait_ordered_range(inode, alloc_start,
+ alloc_end - alloc_start);
+ } else {
+ if (ordered)
+ btrfs_put_ordered_extent(ordered);
+ break;
+ }
+ }
+
+ cur_offset = alloc_start;
+ while (1) {
+ em = btrfs_get_extent(inode, NULL, 0, cur_offset,
+ alloc_end - cur_offset, 0);
+ BUG_ON(IS_ERR(em) || !em);
+ last_byte = min(extent_map_end(em), alloc_end);
+ last_byte = (last_byte + mask) & ~mask;
+ if (em->block_start == EXTENT_MAP_HOLE ||
+ (cur_offset >= inode->i_size &&
+ !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
+ ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
+ last_byte - cur_offset,
+ 1 << inode->i_blkbits,
+ offset + len,
+ &alloc_hint);
+ if (ret < 0) {
+ free_extent_map(em);
+ break;
+ }
+ }
+ free_extent_map(em);
+
+ cur_offset = last_byte;
+ if (cur_offset >= alloc_end) {
+ ret = 0;
+ break;
+ }
+ }
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
+ &cached_state, GFP_NOFS);
+
+ btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
+out:
+ mutex_unlock(&inode->i_mutex);
+ return ret;
+}
+
const struct file_operations btrfs_file_operations = {
.llseek = generic_file_llseek,
.read = do_sync_read,
@@ -1248,6 +1373,7 @@ const struct file_operations btrfs_file_operations = {
.open = generic_file_open,
.release = btrfs_release_file,
.fsync = btrfs_sync_file,
+ .fallocate = btrfs_fallocate,
.unlocked_ioctl = btrfs_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = btrfs_ioctl,
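
With .fallocate wired into btrfs_file_operations, userspace reaches the
btrfs_fallocate() implementation above through the plain fallocate(2)
system call. A self-contained sketch; the path is made up, and
FALLOC_FL_KEEP_SIZE is the only mode bit the implementation accepts:

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <linux/falloc.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/mnt/btrfs/prealloc.dat",
			      O_CREAT | O_WRONLY, 0644);

		if (fd < 0)
			return 1;

		/* reserve 16 MiB of space without growing i_size */
		if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 16 << 20) != 0)
			perror("fallocate");

		close(fd);
		return 0;
	}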
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a0ff46a47895..160b55b3e132 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -122,10 +122,10 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
size_t cur_size = size;
size_t datasize;
unsigned long offset;
- int use_compress = 0;
+ int compress_type = BTRFS_COMPRESS_NONE;
if (compressed_size && compressed_pages) {
- use_compress = 1;
+ compress_type = root->fs_info->compress_type;
cur_size = compressed_size;
}
@@ -159,7 +159,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
btrfs_set_file_extent_ram_bytes(leaf, ei, size);
ptr = btrfs_file_extent_inline_start(ei);
- if (use_compress) {
+ if (compress_type != BTRFS_COMPRESS_NONE) {
struct page *cpage;
int i = 0;
while (compressed_size > 0) {
@@ -176,7 +176,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
compressed_size -= cur_size;
}
btrfs_set_file_extent_compression(leaf, ei,
- BTRFS_COMPRESS_ZLIB);
+ compress_type);
} else {
page = find_get_page(inode->i_mapping,
start >> PAGE_CACHE_SHIFT);
@@ -263,6 +263,7 @@ struct async_extent {
u64 compressed_size;
struct page **pages;
unsigned long nr_pages;
+ int compress_type;
struct list_head list;
};
@@ -280,7 +281,8 @@ static noinline int add_async_extent(struct async_cow *cow,
u64 start, u64 ram_size,
u64 compressed_size,
struct page **pages,
- unsigned long nr_pages)
+ unsigned long nr_pages,
+ int compress_type)
{
struct async_extent *async_extent;
@@ -290,6 +292,7 @@ static noinline int add_async_extent(struct async_cow *cow,
async_extent->compressed_size = compressed_size;
async_extent->pages = pages;
async_extent->nr_pages = nr_pages;
+ async_extent->compress_type = compress_type;
list_add_tail(&async_extent->list, &cow->extents);
return 0;
}
@@ -332,6 +335,7 @@ static noinline int compress_file_range(struct inode *inode,
unsigned long max_uncompressed = 128 * 1024;
int i;
int will_compress;
+ int compress_type = root->fs_info->compress_type;
actual_end = min_t(u64, isize, end + 1);
again:
@@ -381,12 +385,16 @@ again:
WARN_ON(pages);
pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
- ret = btrfs_zlib_compress_pages(inode->i_mapping, start,
- total_compressed, pages,
- nr_pages, &nr_pages_ret,
- &total_in,
- &total_compressed,
- max_compressed);
+ if (BTRFS_I(inode)->force_compress)
+ compress_type = BTRFS_I(inode)->force_compress;
+
+ ret = btrfs_compress_pages(compress_type,
+ inode->i_mapping, start,
+ total_compressed, pages,
+ nr_pages, &nr_pages_ret,
+ &total_in,
+ &total_compressed,
+ max_compressed);
if (!ret) {
unsigned long offset = total_compressed &
@@ -493,7 +501,8 @@ again:
* and will submit them to the elevator.
*/
add_async_extent(async_cow, start, num_bytes,
- total_compressed, pages, nr_pages_ret);
+ total_compressed, pages, nr_pages_ret,
+ compress_type);
if (start + num_bytes < end) {
start += num_bytes;
@@ -515,7 +524,8 @@ cleanup_and_bail_uncompressed:
__set_page_dirty_nobuffers(locked_page);
/* unlocked later on in the async handlers */
}
- add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0);
+ add_async_extent(async_cow, start, end - start + 1,
+ 0, NULL, 0, BTRFS_COMPRESS_NONE);
*num_added += 1;
}
@@ -640,6 +650,7 @@ retry:
em->block_start = ins.objectid;
em->block_len = ins.offset;
em->bdev = root->fs_info->fs_devices->latest_bdev;
+ em->compress_type = async_extent->compress_type;
set_bit(EXTENT_FLAG_PINNED, &em->flags);
set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
@@ -656,11 +667,13 @@ retry:
async_extent->ram_size - 1, 0);
}
- ret = btrfs_add_ordered_extent(inode, async_extent->start,
- ins.objectid,
- async_extent->ram_size,
- ins.offset,
- BTRFS_ORDERED_COMPRESSED);
+ ret = btrfs_add_ordered_extent_compress(inode,
+ async_extent->start,
+ ins.objectid,
+ async_extent->ram_size,
+ ins.offset,
+ BTRFS_ORDERED_COMPRESSED,
+ async_extent->compress_type);
BUG_ON(ret);
/*
@@ -1670,7 +1683,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
struct btrfs_ordered_extent *ordered_extent = NULL;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct extent_state *cached_state = NULL;
- int compressed = 0;
+ int compress_type = 0;
int ret;
bool nolock = false;
@@ -1711,9 +1724,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
trans->block_rsv = &root->fs_info->delalloc_block_rsv;
if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
- compressed = 1;
+ compress_type = ordered_extent->compress_type;
if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
- BUG_ON(compressed);
+ BUG_ON(compress_type);
ret = btrfs_mark_extent_written(trans, inode,
ordered_extent->file_offset,
ordered_extent->file_offset +
@@ -1727,7 +1740,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
ordered_extent->disk_len,
ordered_extent->len,
ordered_extent->len,
- compressed, 0, 0,
+ compress_type, 0, 0,
BTRFS_FILE_EXTENT_REG);
unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
ordered_extent->file_offset,
@@ -1829,6 +1842,8 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
logical = em->block_start;
failrec->bio_flags = EXTENT_BIO_COMPRESSED;
+ extent_set_compress_type(&failrec->bio_flags,
+ em->compress_type);
}
failrec->logical = logical;
free_extent_map(em);
@@ -3671,8 +3686,12 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = dentry->d_inode;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
int err;
+ if (btrfs_root_readonly(root))
+ return -EROFS;
+
err = inode_change_ok(inode, attr);
if (err)
return err;
@@ -4084,8 +4103,6 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
int index;
int ret;
- d_set_d_op(dentry, &btrfs_dentry_operations);
-
if (dentry->d_name.len > BTRFS_NAME_LEN)
return ERR_PTR(-ENAMETOOLONG);
@@ -4930,8 +4947,10 @@ static noinline int uncompress_inline(struct btrfs_path *path,
size_t max_size;
unsigned long inline_size;
unsigned long ptr;
+ int compress_type;
WARN_ON(pg_offset != 0);
+ compress_type = btrfs_file_extent_compression(leaf, item);
max_size = btrfs_file_extent_ram_bytes(leaf, item);
inline_size = btrfs_file_extent_inline_item_len(leaf,
btrfs_item_nr(leaf, path->slots[0]));
@@ -4941,8 +4960,8 @@ static noinline int uncompress_inline(struct btrfs_path *path,
read_extent_buffer(leaf, tmp, ptr, inline_size);
max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size);
- ret = btrfs_zlib_decompress(tmp, page, extent_offset,
- inline_size, max_size);
+ ret = btrfs_decompress(compress_type, tmp, page,
+ extent_offset, inline_size, max_size);
if (ret) {
char *kaddr = kmap_atomic(page, KM_USER0);
unsigned long copy_size = min_t(u64,
@@ -4984,7 +5003,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct btrfs_trans_handle *trans = NULL;
- int compressed;
+ int compress_type;
again:
read_lock(&em_tree->lock);
@@ -5043,7 +5062,7 @@ again:
found_type = btrfs_file_extent_type(leaf, item);
extent_start = found_key.offset;
- compressed = btrfs_file_extent_compression(leaf, item);
+ compress_type = btrfs_file_extent_compression(leaf, item);
if (found_type == BTRFS_FILE_EXTENT_REG ||
found_type == BTRFS_FILE_EXTENT_PREALLOC) {
extent_end = extent_start +
@@ -5089,8 +5108,9 @@ again:
em->block_start = EXTENT_MAP_HOLE;
goto insert;
}
- if (compressed) {
+ if (compress_type != BTRFS_COMPRESS_NONE) {
set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+ em->compress_type = compress_type;
em->block_start = bytenr;
em->block_len = btrfs_file_extent_disk_num_bytes(leaf,
item);
@@ -5124,12 +5144,14 @@ again:
em->len = (copy_size + root->sectorsize - 1) &
~((u64)root->sectorsize - 1);
em->orig_start = EXTENT_MAP_INLINE;
- if (compressed)
+ if (compress_type) {
set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+ em->compress_type = compress_type;
+ }
ptr = btrfs_file_extent_inline_start(item) + extent_offset;
if (create == 0 && !PageUptodate(page)) {
- if (btrfs_file_extent_compression(leaf, item) ==
- BTRFS_COMPRESS_ZLIB) {
+ if (btrfs_file_extent_compression(leaf, item) !=
+ BTRFS_COMPRESS_NONE) {
ret = uncompress_inline(path, inode, page,
pg_offset,
extent_offset, item);
@@ -6479,7 +6501,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->ordered_data_close = 0;
ei->orphan_meta_reserved = 0;
ei->dummy_inode = 0;
- ei->force_compress = 0;
+ ei->force_compress = BTRFS_COMPRESS_NONE;
inode = &ei->vfs_inode;
extent_map_tree_init(&ei->extent_tree, GFP_NOFS);
@@ -7100,112 +7122,6 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
min_size, actual_len, alloc_hint, trans);
}
-static long btrfs_fallocate(struct inode *inode, int mode,
- loff_t offset, loff_t len)
-{
- struct extent_state *cached_state = NULL;
- u64 cur_offset;
- u64 last_byte;
- u64 alloc_start;
- u64 alloc_end;
- u64 alloc_hint = 0;
- u64 locked_end;
- u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
- struct extent_map *em;
- int ret;
-
- alloc_start = offset & ~mask;
- alloc_end = (offset + len + mask) & ~mask;
-
- /*
- * wait for ordered IO before we have any locks. We'll loop again
- * below with the locks held.
- */
- btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
-
- mutex_lock(&inode->i_mutex);
- ret = inode_newsize_ok(inode, alloc_end);
- if (ret)
- goto out;
-
- if (alloc_start > inode->i_size) {
- ret = btrfs_cont_expand(inode, alloc_start);
- if (ret)
- goto out;
- }
-
- ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
- if (ret)
- goto out;
-
- locked_end = alloc_end - 1;
- while (1) {
- struct btrfs_ordered_extent *ordered;
-
- /* the extent lock is ordered inside the running
- * transaction
- */
- lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
- locked_end, 0, &cached_state, GFP_NOFS);
- ordered = btrfs_lookup_first_ordered_extent(inode,
- alloc_end - 1);
- if (ordered &&
- ordered->file_offset + ordered->len > alloc_start &&
- ordered->file_offset < alloc_end) {
- btrfs_put_ordered_extent(ordered);
- unlock_extent_cached(&BTRFS_I(inode)->io_tree,
- alloc_start, locked_end,
- &cached_state, GFP_NOFS);
- /*
- * we can't wait on the range with the transaction
- * running or with the extent lock held
- */
- btrfs_wait_ordered_range(inode, alloc_start,
- alloc_end - alloc_start);
- } else {
- if (ordered)
- btrfs_put_ordered_extent(ordered);
- break;
- }
- }
-
- cur_offset = alloc_start;
- while (1) {
- em = btrfs_get_extent(inode, NULL, 0, cur_offset,
- alloc_end - cur_offset, 0);
- BUG_ON(IS_ERR(em) || !em);
- last_byte = min(extent_map_end(em), alloc_end);
- last_byte = (last_byte + mask) & ~mask;
- if (em->block_start == EXTENT_MAP_HOLE ||
- (cur_offset >= inode->i_size &&
- !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
- ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
- last_byte - cur_offset,
- 1 << inode->i_blkbits,
- offset + len,
- &alloc_hint);
- if (ret < 0) {
- free_extent_map(em);
- break;
- }
- }
- free_extent_map(em);
-
- cur_offset = last_byte;
- if (cur_offset >= alloc_end) {
- ret = 0;
- break;
- }
- }
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
- &cached_state, GFP_NOFS);
-
- btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
-out:
- mutex_unlock(&inode->i_mutex);
- return ret;
-}
-
static int btrfs_set_page_dirty(struct page *page)
{
return __set_page_dirty_nobuffers(page);
@@ -7213,6 +7129,10 @@ static int btrfs_set_page_dirty(struct page *page)
static int btrfs_permission(struct inode *inode, int mask, unsigned int flags)
{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+
+ if (btrfs_root_readonly(root) && (mask & MAY_WRITE))
+ return -EROFS;
if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE))
return -EACCES;
return generic_permission(inode, mask, flags, btrfs_check_acl);
@@ -7308,7 +7228,6 @@ static const struct inode_operations btrfs_file_inode_operations = {
.listxattr = btrfs_listxattr,
.removexattr = btrfs_removexattr,
.permission = btrfs_permission,
- .fallocate = btrfs_fallocate,
.fiemap = btrfs_fiemap,
};
static const struct inode_operations btrfs_special_inode_operations = {
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index f87552a1d7ea..a506a22b522a 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -147,6 +147,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
unsigned int flags, oldflags;
int ret;
+ if (btrfs_root_readonly(root))
+ return -EROFS;
+
if (copy_from_user(&flags, arg, sizeof(flags)))
return -EFAULT;
@@ -360,7 +363,8 @@ fail:
}
static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
- char *name, int namelen, u64 *async_transid)
+ char *name, int namelen, u64 *async_transid,
+ bool readonly)
{
struct inode *inode;
struct dentry *parent;
@@ -378,6 +382,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
btrfs_init_block_rsv(&pending_snapshot->block_rsv);
pending_snapshot->dentry = dentry;
pending_snapshot->root = root;
+ pending_snapshot->readonly = readonly;
trans = btrfs_start_transaction(root->fs_info->extent_root, 5);
if (IS_ERR(trans)) {
@@ -509,7 +514,7 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
static noinline int btrfs_mksubvol(struct path *parent,
char *name, int namelen,
struct btrfs_root *snap_src,
- u64 *async_transid)
+ u64 *async_transid, bool readonly)
{
struct inode *dir = parent->dentry->d_inode;
struct dentry *dentry;
@@ -541,7 +546,7 @@ static noinline int btrfs_mksubvol(struct path *parent,
if (snap_src) {
error = create_snapshot(snap_src, dentry,
- name, namelen, async_transid);
+ name, namelen, async_transid, readonly);
} else {
error = create_subvol(BTRFS_I(dir)->root, dentry,
name, namelen, async_transid);
@@ -638,9 +643,11 @@ static int btrfs_defrag_file(struct file *file,
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct btrfs_ordered_extent *ordered;
struct page *page;
+ struct btrfs_super_block *disk_super;
unsigned long last_index;
unsigned long ra_pages = root->fs_info->bdi.ra_pages;
unsigned long total_read = 0;
+ u64 features;
u64 page_start;
u64 page_end;
u64 last_len = 0;
@@ -648,6 +655,14 @@ static int btrfs_defrag_file(struct file *file,
u64 defrag_end = 0;
unsigned long i;
int ret;
+ int compress_type = BTRFS_COMPRESS_ZLIB;
+
+ if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) {
+ if (range->compress_type > BTRFS_COMPRESS_TYPES)
+ return -EINVAL;
+ if (range->compress_type)
+ compress_type = range->compress_type;
+ }
if (inode->i_size == 0)
return 0;
@@ -683,7 +698,7 @@ static int btrfs_defrag_file(struct file *file,
total_read++;
mutex_lock(&inode->i_mutex);
if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
- BTRFS_I(inode)->force_compress = 1;
+ BTRFS_I(inode)->force_compress = compress_type;
ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
if (ret)
@@ -781,10 +796,17 @@ loop_unlock:
atomic_dec(&root->fs_info->async_submit_draining);
mutex_lock(&inode->i_mutex);
- BTRFS_I(inode)->force_compress = 0;
+ BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE;
mutex_unlock(&inode->i_mutex);
}
+ disk_super = &root->fs_info->super_copy;
+ features = btrfs_super_incompat_flags(disk_super);
+ if (range->compress_type == BTRFS_COMPRESS_LZO) {
+ features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
+ btrfs_set_super_incompat_flags(disk_super, features);
+ }
+
return 0;
err_reservations:
@@ -901,7 +923,8 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
char *name,
unsigned long fd,
int subvol,
- u64 *transid)
+ u64 *transid,
+ bool readonly)
{
struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
struct file *src_file;
@@ -919,7 +942,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
if (subvol) {
ret = btrfs_mksubvol(&file->f_path, name, namelen,
- NULL, transid);
+ NULL, transid, readonly);
} else {
struct inode *src_inode;
src_file = fget(fd);
@@ -938,7 +961,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
}
ret = btrfs_mksubvol(&file->f_path, name, namelen,
BTRFS_I(src_inode)->root,
- transid);
+ transid, readonly);
fput(src_file);
}
out:
@@ -946,58 +969,139 @@ out:
}
static noinline int btrfs_ioctl_snap_create(struct file *file,
- void __user *arg, int subvol,
- int v2)
+ void __user *arg, int subvol)
{
- struct btrfs_ioctl_vol_args *vol_args = NULL;
- struct btrfs_ioctl_vol_args_v2 *vol_args_v2 = NULL;
- char *name;
- u64 fd;
+ struct btrfs_ioctl_vol_args *vol_args;
int ret;
- if (v2) {
- u64 transid = 0;
- u64 *ptr = NULL;
+ vol_args = memdup_user(arg, sizeof(*vol_args));
+ if (IS_ERR(vol_args))
+ return PTR_ERR(vol_args);
+ vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
- vol_args_v2 = memdup_user(arg, sizeof(*vol_args_v2));
- if (IS_ERR(vol_args_v2))
- return PTR_ERR(vol_args_v2);
+ ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
+ vol_args->fd, subvol,
+ NULL, false);
- if (vol_args_v2->flags & ~BTRFS_SUBVOL_CREATE_ASYNC) {
- ret = -EINVAL;
- goto out;
- }
-
- name = vol_args_v2->name;
- fd = vol_args_v2->fd;
- vol_args_v2->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
+ kfree(vol_args);
+ return ret;
+}
- if (vol_args_v2->flags & BTRFS_SUBVOL_CREATE_ASYNC)
- ptr = &transid;
+static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
+ void __user *arg, int subvol)
+{
+ struct btrfs_ioctl_vol_args_v2 *vol_args;
+ int ret;
+ u64 transid = 0;
+ u64 *ptr = NULL;
+ bool readonly = false;
- ret = btrfs_ioctl_snap_create_transid(file, name, fd,
- subvol, ptr);
+ vol_args = memdup_user(arg, sizeof(*vol_args));
+ if (IS_ERR(vol_args))
+ return PTR_ERR(vol_args);
+ vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
- if (ret == 0 && ptr &&
- copy_to_user(arg +
- offsetof(struct btrfs_ioctl_vol_args_v2,
- transid), ptr, sizeof(*ptr)))
- ret = -EFAULT;
- } else {
- vol_args = memdup_user(arg, sizeof(*vol_args));
- if (IS_ERR(vol_args))
- return PTR_ERR(vol_args);
- name = vol_args->name;
- fd = vol_args->fd;
- vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
-
- ret = btrfs_ioctl_snap_create_transid(file, name, fd,
- subvol, NULL);
+ if (vol_args->flags &
+ ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY)) {
+ ret = -EOPNOTSUPP;
+ goto out;
}
+
+ if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC)
+ ptr = &transid;
+ if (vol_args->flags & BTRFS_SUBVOL_RDONLY)
+ readonly = true;
+
+ ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
+ vol_args->fd, subvol,
+ ptr, readonly);
+
+ if (ret == 0 && ptr &&
+ copy_to_user(arg +
+ offsetof(struct btrfs_ioctl_vol_args_v2,
+ transid), ptr, sizeof(*ptr)))
+ ret = -EFAULT;
out:
kfree(vol_args);
- kfree(vol_args_v2);
+ return ret;
+}
+
+static noinline int btrfs_ioctl_subvol_getflags(struct file *file,
+ void __user *arg)
+{
+ struct inode *inode = fdentry(file)->d_inode;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret = 0;
+ u64 flags = 0;
+
+ if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID)
+ return -EINVAL;
+
+ down_read(&root->fs_info->subvol_sem);
+ if (btrfs_root_readonly(root))
+ flags |= BTRFS_SUBVOL_RDONLY;
+ up_read(&root->fs_info->subvol_sem);
+
+ if (copy_to_user(arg, &flags, sizeof(flags)))
+ ret = -EFAULT;
+
+ return ret;
+}
+
+static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
+ void __user *arg)
+{
+ struct inode *inode = fdentry(file)->d_inode;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_trans_handle *trans;
+ u64 root_flags;
+ u64 flags;
+ int ret = 0;
+
+ if (root->fs_info->sb->s_flags & MS_RDONLY)
+ return -EROFS;
+
+ if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID)
+ return -EINVAL;
+
+ if (copy_from_user(&flags, arg, sizeof(flags)))
+ return -EFAULT;
+
+ if (flags & BTRFS_SUBVOL_CREATE_ASYNC)
+ return -EINVAL;
+
+ if (flags & ~BTRFS_SUBVOL_RDONLY)
+ return -EOPNOTSUPP;
+
+ down_write(&root->fs_info->subvol_sem);
+
+ /* nothing to do */
+ if (!!(flags & BTRFS_SUBVOL_RDONLY) == btrfs_root_readonly(root))
+ goto out;
+
+ root_flags = btrfs_root_flags(&root->root_item);
+ if (flags & BTRFS_SUBVOL_RDONLY)
+ btrfs_set_root_flags(&root->root_item,
+ root_flags | BTRFS_ROOT_SUBVOL_RDONLY);
+ else
+ btrfs_set_root_flags(&root->root_item,
+ root_flags & ~BTRFS_ROOT_SUBVOL_RDONLY);
+
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out_reset;
+ }
+
+ ret = btrfs_update_root(trans, root,
+ &root->root_key, &root->root_item);
+
+ btrfs_commit_transaction(trans, root);
+out_reset:
+ if (ret)
+ btrfs_set_root_flags(&root->root_item, root_flags);
+out:
+ up_write(&root->fs_info->subvol_sem);
return ret;
}
@@ -1509,6 +1613,9 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
struct btrfs_ioctl_defrag_range_args *range;
int ret;
+ if (btrfs_root_readonly(root))
+ return -EROFS;
+
ret = mnt_want_write(file->f_path.mnt);
if (ret)
return ret;
@@ -1637,6 +1744,9 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND))
return -EINVAL;
+ if (btrfs_root_readonly(root))
+ return -EROFS;
+
ret = mnt_want_write(file->f_path.mnt);
if (ret)
return ret;
@@ -1958,6 +2068,10 @@ static long btrfs_ioctl_trans_start(struct file *file)
if (file->private_data)
goto out;
+ ret = -EROFS;
+ if (btrfs_root_readonly(root))
+ goto out;
+
ret = mnt_want_write(file->f_path.mnt);
if (ret)
goto out;
@@ -2257,13 +2371,17 @@ long btrfs_ioctl(struct file *file, unsigned int
case FS_IOC_GETVERSION:
return btrfs_ioctl_getversion(file, argp);
case BTRFS_IOC_SNAP_CREATE:
- return btrfs_ioctl_snap_create(file, argp, 0, 0);
+ return btrfs_ioctl_snap_create(file, argp, 0);
case BTRFS_IOC_SNAP_CREATE_V2:
- return btrfs_ioctl_snap_create(file, argp, 0, 1);
+ return btrfs_ioctl_snap_create_v2(file, argp, 0);
case BTRFS_IOC_SUBVOL_CREATE:
- return btrfs_ioctl_snap_create(file, argp, 1, 0);
+ return btrfs_ioctl_snap_create(file, argp, 1);
case BTRFS_IOC_SNAP_DESTROY:
return btrfs_ioctl_snap_destroy(file, argp);
+ case BTRFS_IOC_SUBVOL_GETFLAGS:
+ return btrfs_ioctl_subvol_getflags(file, argp);
+ case BTRFS_IOC_SUBVOL_SETFLAGS:
+ return btrfs_ioctl_subvol_setflags(file, argp);
case BTRFS_IOC_DEFAULT_SUBVOL:
return btrfs_ioctl_default_subvol(file, argp);
case BTRFS_IOC_DEFRAG:
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index c344d12c646b..8fb382167b13 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -31,6 +31,7 @@ struct btrfs_ioctl_vol_args {
};
#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0)
+#define BTRFS_SUBVOL_RDONLY (1ULL << 1)
#define BTRFS_SUBVOL_NAME_MAX 4039
struct btrfs_ioctl_vol_args_v2 {
@@ -133,8 +134,15 @@ struct btrfs_ioctl_defrag_range_args {
*/
__u32 extent_thresh;
+ /*
+ * which compression method to use if turning on compression
+ * for this defrag operation. If unspecified, zlib will
+ * be used
+ */
+ __u32 compress_type;
+
/* spare for later */
- __u32 unused[5];
+ __u32 unused[4];
};
struct btrfs_ioctl_space_info {
@@ -193,4 +201,6 @@ struct btrfs_ioctl_space_args {
#define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64)
#define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \
struct btrfs_ioctl_vol_args_v2)
+#define BTRFS_IOC_SUBVOL_GETFLAGS _IOR(BTRFS_IOCTL_MAGIC, 25, __u64)
+#define BTRFS_IOC_SUBVOL_SETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 26, __u64)
#endif
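
The two new ioctls expose the BTRFS_ROOT_SUBVOL_RDONLY root flag to
userspace, and the same BTRFS_SUBVOL_RDONLY bit can be passed in
btrfs_ioctl_vol_args_v2.flags to create a snapshot read-only from the
start. A self-contained sketch that flips an existing subvolume
read-only; the definitions are copied from the header above so the
program builds without kernel sources:

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <linux/types.h>
	#include <unistd.h>

	#define BTRFS_IOCTL_MAGIC 0x94
	#define BTRFS_IOC_SUBVOL_SETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 26, __u64)
	#define BTRFS_SUBVOL_RDONLY (1ULL << 1)

	int main(int argc, char **argv)
	{
		__u64 flags = BTRFS_SUBVOL_RDONLY;
		int fd;

		if (argc != 2)
			return 1;

		fd = open(argv[1], O_RDONLY);	/* the subvolume directory */
		if (fd < 0)
			return 1;

		if (ioctl(fd, BTRFS_IOC_SUBVOL_SETFLAGS, &flags))
			perror("BTRFS_IOC_SUBVOL_SETFLAGS");

		close(fd);
		return 0;
	}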
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
new file mode 100644
index 000000000000..cc9b450399df
--- /dev/null
+++ b/fs/btrfs/lzo.c
@@ -0,0 +1,420 @@
+/*
+ * Copyright (C) 2008 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/init.h>
+#include <linux/err.h>
+#include <linux/sched.h>
+#include <linux/pagemap.h>
+#include <linux/bio.h>
+#include <linux/lzo.h>
+#include "compression.h"
+
+#define LZO_LEN 4
+
+struct workspace {
+ void *mem;
+ void *buf; /* where decompressed data goes */
+ void *cbuf; /* where compressed data goes */
+ struct list_head list;
+};
+
+static void lzo_free_workspace(struct list_head *ws)
+{
+ struct workspace *workspace = list_entry(ws, struct workspace, list);
+
+ vfree(workspace->buf);
+ vfree(workspace->cbuf);
+ vfree(workspace->mem);
+ kfree(workspace);
+}
+
+static struct list_head *lzo_alloc_workspace(void)
+{
+ struct workspace *workspace;
+
+ workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
+ if (!workspace)
+ return ERR_PTR(-ENOMEM);
+
+ workspace->mem = vmalloc(LZO1X_MEM_COMPRESS);
+ workspace->buf = vmalloc(lzo1x_worst_compress(PAGE_CACHE_SIZE));
+ workspace->cbuf = vmalloc(lzo1x_worst_compress(PAGE_CACHE_SIZE));
+ if (!workspace->mem || !workspace->buf || !workspace->cbuf)
+ goto fail;
+
+ INIT_LIST_HEAD(&workspace->list);
+
+ return &workspace->list;
+fail:
+ lzo_free_workspace(&workspace->list);
+ return ERR_PTR(-ENOMEM);
+}
+
+static inline void write_compress_length(char *buf, size_t len)
+{
+ __le32 dlen;
+
+ dlen = cpu_to_le32(len);
+ memcpy(buf, &dlen, LZO_LEN);
+}
+
+static inline size_t read_compress_length(char *buf)
+{
+ __le32 dlen;
+
+ memcpy(&dlen, buf, LZO_LEN);
+ return le32_to_cpu(dlen);
+}
+
+static int lzo_compress_pages(struct list_head *ws,
+ struct address_space *mapping,
+ u64 start, unsigned long len,
+ struct page **pages,
+ unsigned long nr_dest_pages,
+ unsigned long *out_pages,
+ unsigned long *total_in,
+ unsigned long *total_out,
+ unsigned long max_out)
+{
+ struct workspace *workspace = list_entry(ws, struct workspace, list);
+ int ret = 0;
+ char *data_in;
+ char *cpage_out;
+ int nr_pages = 0;
+ struct page *in_page = NULL;
+ struct page *out_page = NULL;
+ unsigned long bytes_left;
+
+ size_t in_len;
+ size_t out_len;
+ char *buf;
+ unsigned long tot_in = 0;
+ unsigned long tot_out = 0;
+ unsigned long pg_bytes_left;
+ unsigned long out_offset;
+ unsigned long bytes;
+
+ *out_pages = 0;
+ *total_out = 0;
+ *total_in = 0;
+
+ in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
+ data_in = kmap(in_page);
+
+ /*
+ * reserve the first 4 bytes to hold the total size of
+ * all chunks of compressed data
+ */
+ out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ if (out_page == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ cpage_out = kmap(out_page);
+ out_offset = LZO_LEN;
+ tot_out = LZO_LEN;
+ pages[0] = out_page;
+ nr_pages = 1;
+ pg_bytes_left = PAGE_CACHE_SIZE - LZO_LEN;
+
+ /* compress at most one page of data each time */
+ in_len = min(len, PAGE_CACHE_SIZE);
+ while (tot_in < len) {
+ ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf,
+ &out_len, workspace->mem);
+ if (ret != LZO_E_OK) {
+ printk(KERN_DEBUG "btrfs lzo compress in loop returned %d\n",
+ ret);
+ ret = -1;
+ goto out;
+ }
+
+ /* store the size of this chunk of compressed data */
+ write_compress_length(cpage_out + out_offset, out_len);
+ tot_out += LZO_LEN;
+ out_offset += LZO_LEN;
+ pg_bytes_left -= LZO_LEN;
+
+ tot_in += in_len;
+ tot_out += out_len;
+
+ /* copy bytes from the working buffer into the pages */
+ buf = workspace->cbuf;
+ while (out_len) {
+ bytes = min_t(unsigned long, pg_bytes_left, out_len);
+
+ memcpy(cpage_out + out_offset, buf, bytes);
+
+ out_len -= bytes;
+ pg_bytes_left -= bytes;
+ buf += bytes;
+ out_offset += bytes;
+
+ /*
+ * we need another page for writing out.
+ *
+ * Note if there's less than 4 bytes left, we just
+ * skip to a new page.
+ */
+ if ((out_len == 0 && pg_bytes_left < LZO_LEN) ||
+ pg_bytes_left == 0) {
+ if (pg_bytes_left) {
+ memset(cpage_out + out_offset, 0,
+ pg_bytes_left);
+ tot_out += pg_bytes_left;
+ }
+
+ /* we're done, don't allocate new page */
+ if (out_len == 0 && tot_in >= len)
+ break;
+
+ kunmap(out_page);
+ if (nr_pages == nr_dest_pages) {
+ out_page = NULL;
+ ret = -1;
+ goto out;
+ }
+
+ out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ if (out_page == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ cpage_out = kmap(out_page);
+ pages[nr_pages++] = out_page;
+
+ pg_bytes_left = PAGE_CACHE_SIZE;
+ out_offset = 0;
+ }
+ }
+
+ /* we're making it bigger, give up */
+ if (tot_in > 8192 && tot_in < tot_out)
+ goto out;
+
+ /* we're all done */
+ if (tot_in >= len)
+ break;
+
+ if (tot_out > max_out)
+ break;
+
+ bytes_left = len - tot_in;
+ kunmap(in_page);
+ page_cache_release(in_page);
+
+ start += PAGE_CACHE_SIZE;
+ in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
+ data_in = kmap(in_page);
+ in_len = min(bytes_left, PAGE_CACHE_SIZE);
+ }
+
+ if (tot_out > tot_in)
+ goto out;
+
+ /* store the size of all chunks of compressed data */
+ cpage_out = kmap(pages[0]);
+ write_compress_length(cpage_out, tot_out);
+
+ kunmap(pages[0]);
+
+ ret = 0;
+ *total_out = tot_out;
+ *total_in = tot_in;
+out:
+ *out_pages = nr_pages;
+ if (out_page)
+ kunmap(out_page);
+
+ if (in_page) {
+ kunmap(in_page);
+ page_cache_release(in_page);
+ }
+
+ return ret;
+}
+
+static int lzo_decompress_biovec(struct list_head *ws,
+ struct page **pages_in,
+ u64 disk_start,
+ struct bio_vec *bvec,
+ int vcnt,
+ size_t srclen)
+{
+ struct workspace *workspace = list_entry(ws, struct workspace, list);
+ int ret = 0, ret2;
+ char *data_in;
+ unsigned long page_in_index = 0;
+ unsigned long page_out_index = 0;
+ unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) /
+ PAGE_CACHE_SIZE;
+ unsigned long buf_start;
+ unsigned long buf_offset = 0;
+ unsigned long bytes;
+ unsigned long working_bytes;
+ unsigned long pg_offset;
+
+ size_t in_len;
+ size_t out_len;
+ unsigned long in_offset;
+ unsigned long in_page_bytes_left;
+ unsigned long tot_in;
+ unsigned long tot_out;
+ unsigned long tot_len;
+ char *buf;
+
+ data_in = kmap(pages_in[0]);
+ tot_len = read_compress_length(data_in);
+
+ tot_in = LZO_LEN;
+ in_offset = LZO_LEN;
+ tot_len = min_t(size_t, srclen, tot_len);
+ in_page_bytes_left = PAGE_CACHE_SIZE - LZO_LEN;
+
+ tot_out = 0;
+ pg_offset = 0;
+
+ while (tot_in < tot_len) {
+ in_len = read_compress_length(data_in + in_offset);
+ in_page_bytes_left -= LZO_LEN;
+ in_offset += LZO_LEN;
+ tot_in += LZO_LEN;
+
+ tot_in += in_len;
+ working_bytes = in_len;
+
+ /* fast path: avoid using the working buffer */
+ if (in_page_bytes_left >= in_len) {
+ buf = data_in + in_offset;
+ bytes = in_len;
+ goto cont;
+ }
+
+ /* copy bytes from the pages into the working buffer */
+ buf = workspace->cbuf;
+ buf_offset = 0;
+ while (working_bytes) {
+ bytes = min(working_bytes, in_page_bytes_left);
+
+ memcpy(buf + buf_offset, data_in + in_offset, bytes);
+ buf_offset += bytes;
+cont:
+ working_bytes -= bytes;
+ in_page_bytes_left -= bytes;
+ in_offset += bytes;
+
+ /* check if we need to pick another page */
+ if ((working_bytes == 0 && in_page_bytes_left < LZO_LEN)
+ || in_page_bytes_left == 0) {
+ tot_in += in_page_bytes_left;
+
+ if (working_bytes == 0 && tot_in >= tot_len)
+ break;
+
+ kunmap(pages_in[page_in_index]);
+ page_in_index++;
+ if (page_in_index >= total_pages_in) {
+ ret = -1;
+ data_in = NULL;
+ goto done;
+ }
+ data_in = kmap(pages_in[page_in_index]);
+
+ in_page_bytes_left = PAGE_CACHE_SIZE;
+ in_offset = 0;
+ }
+ }
+
+ out_len = lzo1x_worst_compress(PAGE_CACHE_SIZE);
+ ret = lzo1x_decompress_safe(buf, in_len, workspace->buf,
+ &out_len);
+ if (ret != LZO_E_OK) {
+ printk(KERN_WARNING "btrfs decompress failed\n");
+ ret = -1;
+ break;
+ }
+
+ buf_start = tot_out;
+ tot_out += out_len;
+
+ ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start,
+ tot_out, disk_start,
+ bvec, vcnt,
+ &page_out_index, &pg_offset);
+ if (ret2 == 0)
+ break;
+ }
+done:
+ if (data_in)
+ kunmap(pages_in[page_in_index]);
+ return ret;
+}
+
+static int lzo_decompress(struct list_head *ws, unsigned char *data_in,
+ struct page *dest_page,
+ unsigned long start_byte,
+ size_t srclen, size_t destlen)
+{
+ struct workspace *workspace = list_entry(ws, struct workspace, list);
+ size_t in_len;
+ size_t out_len;
+ size_t tot_len;
+ int ret = 0;
+ char *kaddr;
+ unsigned long bytes;
+
+ BUG_ON(srclen < LZO_LEN);
+
+ tot_len = read_compress_length(data_in);
+ data_in += LZO_LEN;
+
+ in_len = read_compress_length(data_in);
+ data_in += LZO_LEN;
+
+ out_len = PAGE_CACHE_SIZE;
+ ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len);
+ if (ret != LZO_E_OK) {
+ printk(KERN_WARNING "btrfs decompress failed!\n");
+ ret = -1;
+ goto out;
+ }
+
+ if (out_len < start_byte) {
+ ret = -1;
+ goto out;
+ }
+
+ bytes = min_t(unsigned long, destlen, out_len - start_byte);
+
+ kaddr = kmap_atomic(dest_page, KM_USER0);
+ memcpy(kaddr, workspace->buf + start_byte, bytes);
+ kunmap_atomic(kaddr, KM_USER0);
+out:
+ return ret;
+}
+
+struct btrfs_compress_op btrfs_lzo_compress = {
+ .alloc_workspace = lzo_alloc_workspace,
+ .free_workspace = lzo_free_workspace,
+ .compress_pages = lzo_compress_pages,
+ .decompress_biovec = lzo_decompress_biovec,
+ .decompress = lzo_decompress,
+};
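The on-disk framing the new file implements: the first 4 bytes of the extent hold the little-endian total length, and each compressed segment is preceded by its own 4-byte little-endian length, with pages zero-padded whenever fewer than 4 bytes remain so a header never straddles a page boundary. A stand-alone sketch of just the length encoding, with hypothetical helpers standing in for the cpu_to_le32()/le32_to_cpu() plus memcpy() pairs above:

/*
 * Sketch only: the 4-byte little-endian length headers of the LZO format.
 */
#include <assert.h>
#include <stdint.h>

#define LZO_LEN 4

static void put_len(unsigned char *buf, uint32_t len)
{
	buf[0] = len & 0xff;
	buf[1] = (len >> 8) & 0xff;
	buf[2] = (len >> 16) & 0xff;
	buf[3] = (len >> 24) & 0xff;
}

static uint32_t get_len(const unsigned char *buf)
{
	return (uint32_t)buf[0] | ((uint32_t)buf[1] << 8) |
	       ((uint32_t)buf[2] << 16) | ((uint32_t)buf[3] << 24);
}

int main(void)
{
	unsigned char hdr[LZO_LEN];

	/* total = total header + one segment header + 4096 data bytes */
	put_len(hdr, 2 * LZO_LEN + 4096);
	assert(get_len(hdr) == 2 * LZO_LEN + 4096);
	return 0;
}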
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index ae7737e352c9..2b61e1ddcd99 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -172,7 +172,7 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
*/
static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
u64 start, u64 len, u64 disk_len,
- int type, int dio)
+ int type, int dio, int compress_type)
{
struct btrfs_ordered_inode_tree *tree;
struct rb_node *node;
@@ -189,6 +189,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
entry->disk_len = disk_len;
entry->bytes_left = len;
entry->inode = inode;
+ entry->compress_type = compress_type;
if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
set_bit(type, &entry->flags);
@@ -220,14 +221,25 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
u64 start, u64 len, u64 disk_len, int type)
{
return __btrfs_add_ordered_extent(inode, file_offset, start, len,
- disk_len, type, 0);
+ disk_len, type, 0,
+ BTRFS_COMPRESS_NONE);
}
int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
u64 start, u64 len, u64 disk_len, int type)
{
return __btrfs_add_ordered_extent(inode, file_offset, start, len,
- disk_len, type, 1);
+ disk_len, type, 1,
+ BTRFS_COMPRESS_NONE);
+}
+
+int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
+ u64 start, u64 len, u64 disk_len,
+ int type, int compress_type)
+{
+ return __btrfs_add_ordered_extent(inode, file_offset, start, len,
+ disk_len, type, 0,
+ compress_type);
}
/*
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 61dca83119dd..ff1f69aa1883 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -68,7 +68,7 @@ struct btrfs_ordered_sum {
#define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */
-#define BTRFS_ORDERED_COMPRESSED 3 /* writing a compressed extent */
+#define BTRFS_ORDERED_COMPRESSED 3 /* writing a zlib compressed extent */
#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */
@@ -93,6 +93,9 @@ struct btrfs_ordered_extent {
/* flags (described above) */
unsigned long flags;
+ /* compression algorithm */
+ int compress_type;
+
/* reference count */
atomic_t refs;
@@ -148,6 +151,9 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
u64 start, u64 len, u64 disk_len, int type);
int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
u64 start, u64 len, u64 disk_len, int type);
+int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
+ u64 start, u64 len, u64 disk_len,
+ int type, int compress_type);
int btrfs_add_ordered_sum(struct inode *inode,
struct btrfs_ordered_extent *entry,
struct btrfs_ordered_sum *sum);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 883c6fa1367e..b2130c46fdb5 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -54,6 +54,90 @@
static const struct super_operations btrfs_super_ops;
+static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno,
+ char nbuf[16])
+{
+ char *errstr = NULL;
+
+ switch (errno) {
+ case -EIO:
+ errstr = "IO failure";
+ break;
+ case -ENOMEM:
+ errstr = "Out of memory";
+ break;
+ case -EROFS:
+ errstr = "Readonly filesystem";
+ break;
+ default:
+ if (nbuf) {
+ if (snprintf(nbuf, 16, "error %d", -errno) >= 0)
+ errstr = nbuf;
+ }
+ break;
+ }
+
+ return errstr;
+}
+
+static void __save_error_info(struct btrfs_fs_info *fs_info)
+{
+ /*
+ * today we only save the error info in RAM. Long term we'll
+ * also send it down to the disk
+ */
+ fs_info->fs_state = BTRFS_SUPER_FLAG_ERROR;
+}
+
+/* NOTE:
+ * The write_super work is deferred to unmount time to avoid a
+ * deadlock, since unmount holds all the locks.
+ */
+static void save_error_info(struct btrfs_fs_info *fs_info)
+{
+ __save_error_info(fs_info);
+}
+
+/* btrfs handles errors by forcing the filesystem readonly */
+static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
+{
+ struct super_block *sb = fs_info->sb;
+
+ if (sb->s_flags & MS_RDONLY)
+ return;
+
+ if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+ sb->s_flags |= MS_RDONLY;
+ printk(KERN_INFO "btrfs is forced readonly\n");
+ }
+}
+
+/*
+ * __btrfs_std_error decodes expected errors from the caller and
+ * invokes the appropriate error response.
+ */
+void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
+ unsigned int line, int errno)
+{
+ struct super_block *sb = fs_info->sb;
+ char nbuf[16];
+ const char *errstr;
+
+ /*
+ * Special case: if the error is EROFS, and we're already
+ * under MS_RDONLY, then it is safe to ignore it here.
+ */
+ if (errno == -EROFS && (sb->s_flags & MS_RDONLY))
+ return;
+
+ errstr = btrfs_decode_error(fs_info, errno, nbuf);
+ printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s\n",
+ sb->s_id, function, line, errstr);
+ save_error_info(fs_info);
+
+ btrfs_handle_error(fs_info);
+}
+
static void btrfs_put_super(struct super_block *sb)
{
struct btrfs_root *root = btrfs_sb(sb);
@@ -69,9 +153,9 @@ enum {
Opt_degraded, Opt_subvol, Opt_subvolid, Opt_device, Opt_nodatasum,
Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd,
Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress,
- Opt_compress_force, Opt_notreelog, Opt_ratio, Opt_flushoncommit,
- Opt_discard, Opt_space_cache, Opt_clear_cache, Opt_err,
- Opt_user_subvol_rm_allowed,
+ Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
+ Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
+ Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, Opt_err,
};
static match_table_t tokens = {
@@ -86,7 +170,9 @@ static match_table_t tokens = {
{Opt_alloc_start, "alloc_start=%s"},
{Opt_thread_pool, "thread_pool=%d"},
{Opt_compress, "compress"},
+ {Opt_compress_type, "compress=%s"},
{Opt_compress_force, "compress-force"},
+ {Opt_compress_force_type, "compress-force=%s"},
{Opt_ssd, "ssd"},
{Opt_ssd_spread, "ssd_spread"},
{Opt_nossd, "nossd"},
@@ -112,6 +198,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
char *p, *num, *orig;
int intarg;
int ret = 0;
+ char *compress_type;
+ bool compress_force = false;
if (!options)
return 0;
@@ -154,14 +242,32 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
btrfs_set_opt(info->mount_opt, NODATACOW);
btrfs_set_opt(info->mount_opt, NODATASUM);
break;
- case Opt_compress:
- printk(KERN_INFO "btrfs: use compression\n");
- btrfs_set_opt(info->mount_opt, COMPRESS);
- break;
case Opt_compress_force:
- printk(KERN_INFO "btrfs: forcing compression\n");
- btrfs_set_opt(info->mount_opt, FORCE_COMPRESS);
+ case Opt_compress_force_type:
+ compress_force = true;
+ case Opt_compress:
+ case Opt_compress_type:
+ if (token == Opt_compress ||
+ token == Opt_compress_force ||
+ strcmp(args[0].from, "zlib") == 0) {
+ compress_type = "zlib";
+ info->compress_type = BTRFS_COMPRESS_ZLIB;
+ } else if (strcmp(args[0].from, "lzo") == 0) {
+ compress_type = "lzo";
+ info->compress_type = BTRFS_COMPRESS_LZO;
+ } else {
+ ret = -EINVAL;
+ goto out;
+ }
+
btrfs_set_opt(info->mount_opt, COMPRESS);
+ if (compress_force) {
+ btrfs_set_opt(info->mount_opt, FORCE_COMPRESS);
+ pr_info("btrfs: force %s compression\n",
+ compress_type);
+ } else
+ pr_info("btrfs: use %s compression\n",
+ compress_type);
break;
case Opt_ssd:
printk(KERN_INFO "btrfs: use ssd allocation scheme\n");
@@ -460,6 +566,7 @@ static int btrfs_fill_super(struct super_block *sb,
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_magic = BTRFS_SUPER_MAGIC;
sb->s_op = &btrfs_super_ops;
+ sb->s_d_op = &btrfs_dentry_operations;
sb->s_export_op = &btrfs_export_ops;
sb->s_xattr = btrfs_xattr_handlers;
sb->s_time_gran = 1;
@@ -752,6 +859,127 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
return 0;
}
+/*
+ * Helper to calculate the free space on the devices that can be used to
+ * store file data.
+ */
+static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_device_info *devices_info;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_device *device;
+ u64 skip_space;
+ u64 type;
+ u64 avail_space;
+ u64 used_space;
+ u64 min_stripe_size;
+ int min_stripes = 1;
+ int i = 0, nr_devices;
+ int ret;
+
+ nr_devices = fs_info->fs_devices->rw_devices;
+ BUG_ON(!nr_devices);
+
+ devices_info = kmalloc(sizeof(*devices_info) * nr_devices,
+ GFP_NOFS);
+ if (!devices_info)
+ return -ENOMEM;
+
+ /* calc min stripe number for data space allocation */
+ type = btrfs_get_alloc_profile(root, 1);
+ if (type & BTRFS_BLOCK_GROUP_RAID0)
+ min_stripes = 2;
+ else if (type & BTRFS_BLOCK_GROUP_RAID1)
+ min_stripes = 2;
+ else if (type & BTRFS_BLOCK_GROUP_RAID10)
+ min_stripes = 4;
+
+ if (type & BTRFS_BLOCK_GROUP_DUP)
+ min_stripe_size = 2 * BTRFS_STRIPE_LEN;
+ else
+ min_stripe_size = BTRFS_STRIPE_LEN;
+
+ list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
+ if (!device->in_fs_metadata)
+ continue;
+
+ avail_space = device->total_bytes - device->bytes_used;
+
+ /* align with stripe_len */
+ do_div(avail_space, BTRFS_STRIPE_LEN);
+ avail_space *= BTRFS_STRIPE_LEN;
+
+ /*
+ * In order to avoid overwriting the superblock on the drive,
+ * btrfs starts at an offset of at least 1MB when doing chunk
+ * allocation.
+ */
+ skip_space = 1024 * 1024;
+
+ /* user can set the offset in fs_info->alloc_start. */
+ if (fs_info->alloc_start + BTRFS_STRIPE_LEN <=
+ device->total_bytes)
+ skip_space = max(fs_info->alloc_start, skip_space);
+
+ /*
+ * btrfs cannot use the free space in [0, skip_space - 1],
+ * so we must subtract it from the total. To do that, we
+ * first account for the used space in this range.
+ */
+ ret = btrfs_account_dev_extents_size(device, 0, skip_space - 1,
+ &used_space);
+ if (ret) {
+ kfree(devices_info);
+ return ret;
+ }
+
+ /* calc the free space in [0, skip_space - 1] */
+ skip_space -= used_space;
+
+ /*
+ * we can not use the free space in [0, skip_space - 1], so
+ * subtract it from the total.
+ */
+ if (avail_space && avail_space >= skip_space)
+ avail_space -= skip_space;
+ else
+ avail_space = 0;
+
+ if (avail_space < min_stripe_size)
+ continue;
+
+ devices_info[i].dev = device;
+ devices_info[i].max_avail = avail_space;
+
+ i++;
+ }
+
+ nr_devices = i;
+
+ btrfs_descending_sort_devices(devices_info, nr_devices);
+
+ i = nr_devices - 1;
+ avail_space = 0;
+ while (nr_devices >= min_stripes) {
+ if (devices_info[i].max_avail >= min_stripe_size) {
+ int j;
+ u64 alloc_size;
+
+ avail_space += devices_info[i].max_avail * min_stripes;
+ alloc_size = devices_info[i].max_avail;
+ for (j = i + 1 - min_stripes; j <= i; j++)
+ devices_info[j].max_avail -= alloc_size;
+ }
+ i--;
+ nr_devices--;
+ }
+
+ kfree(devices_info);
+ *free_bytes = avail_space;
+ return 0;
+}
+
static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct btrfs_root *root = btrfs_sb(dentry->d_sb);
@@ -759,17 +987,21 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
struct list_head *head = &root->fs_info->space_info;
struct btrfs_space_info *found;
u64 total_used = 0;
- u64 total_used_data = 0;
+ u64 total_free_data = 0;
int bits = dentry->d_sb->s_blocksize_bits;
__be32 *fsid = (__be32 *)root->fs_info->fsid;
+ int ret;
+ /* hold chunk_mutex to avoid allocating new chunks */
+ mutex_lock(&root->fs_info->chunk_mutex);
rcu_read_lock();
list_for_each_entry_rcu(found, head, list) {
- if (found->flags & (BTRFS_BLOCK_GROUP_METADATA |
- BTRFS_BLOCK_GROUP_SYSTEM))
- total_used_data += found->disk_total;
- else
- total_used_data += found->disk_used;
+ if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
+ total_free_data += found->disk_total - found->disk_used;
+ total_free_data -=
+ btrfs_account_ro_block_groups_free_space(found);
+ }
+
total_used += found->disk_used;
}
rcu_read_unlock();
@@ -777,9 +1009,17 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_namelen = BTRFS_NAME_LEN;
buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
buf->f_bfree = buf->f_blocks - (total_used >> bits);
- buf->f_bavail = buf->f_blocks - (total_used_data >> bits);
buf->f_bsize = dentry->d_sb->s_blocksize;
buf->f_type = BTRFS_SUPER_MAGIC;
+ buf->f_bavail = total_free_data;
+ ret = btrfs_calc_avail_data_space(root, &total_free_data);
+ if (ret) {
+ mutex_unlock(&root->fs_info->chunk_mutex);
+ return ret;
+ }
+ buf->f_bavail += total_free_data;
+ buf->f_bavail = buf->f_bavail >> bits;
+ mutex_unlock(&root->fs_info->chunk_mutex);
/* We treat it as constant endianness (it doesn't matter _which_)
because we want the fsid to come out the same whether mounted
@@ -896,10 +1136,14 @@ static int __init init_btrfs_fs(void)
if (err)
return err;
- err = btrfs_init_cachep();
+ err = btrfs_init_compress();
if (err)
goto free_sysfs;
+ err = btrfs_init_cachep();
+ if (err)
+ goto free_compress;
+
err = extent_io_init();
if (err)
goto free_cachep;
@@ -927,6 +1171,8 @@ free_extent_io:
extent_io_exit();
free_cachep:
btrfs_destroy_cachep();
+free_compress:
+ btrfs_exit_compress();
free_sysfs:
btrfs_exit_sysfs();
return err;
@@ -941,7 +1187,7 @@ static void __exit exit_btrfs_fs(void)
unregister_filesystem(&btrfs_fs_type);
btrfs_exit_sysfs();
btrfs_cleanup_fs_uuids();
- btrfs_zlib_exit();
+ btrfs_exit_compress();
}
module_init(init_btrfs_fs)
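With the extended token table, compress= and compress-force= now accept an algorithm name, while plain compress keeps selecting zlib. A hedged user-space sketch of exercising the new options via mount(2); the device and mount point are assumptions:

/*
 * Sketch only: mount a btrfs filesystem with LZO compression enabled.
 * "compress-force=zlib" would additionally set FORCE_COMPRESS, per
 * btrfs_parse_options() above.
 */
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	if (mount("/dev/sdb1", "/mnt", "btrfs", 0, "compress=lzo") < 0)
		perror("mount");
	return 0;
}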
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index f50e931fc217..bae5c7b8bbe2 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -181,6 +181,9 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
struct btrfs_trans_handle *h;
struct btrfs_transaction *cur_trans;
int ret;
+
+ if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
+ return ERR_PTR(-EROFS);
again:
h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
if (!h)
@@ -910,6 +913,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
u64 to_reserve = 0;
u64 index = 0;
u64 objectid;
+ u64 root_flags;
new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
if (!new_root_item) {
@@ -967,6 +971,13 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
+ root_flags = btrfs_root_flags(new_root_item);
+ if (pending->readonly)
+ root_flags |= BTRFS_ROOT_SUBVOL_RDONLY;
+ else
+ root_flags &= ~BTRFS_ROOT_SUBVOL_RDONLY;
+ btrfs_set_root_flags(new_root_item, root_flags);
+
old = btrfs_lock_root_node(root);
btrfs_cow_block(trans, root, old, NULL, 0, &old);
btrfs_set_lock_blocking(old);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index f104b57ad4ef..229a594cacd5 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -62,6 +62,7 @@ struct btrfs_pending_snapshot {
struct btrfs_block_rsv block_rsv;
/* extra metadata reservation for relocation */
int error;
+ bool readonly;
struct list_head list;
};
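Together with the BTRFS_SUBVOL_RDONLY flag added to ioctl.h, the new readonly field lets a snapshot be born read-only. A user-space sketch, assuming the vol_args_v2 layout and ioctl number shown earlier in this patch; the paths are hypothetical:

/*
 * Hedged sketch: create a read-only snapshot of /mnt/subvol named
 * "snap-ro" in /mnt.  Definitions mirror the ioctl.h hunks above.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/types.h>

#define BTRFS_IOCTL_MAGIC 0x94
#define BTRFS_SUBVOL_RDONLY (1ULL << 1)
#define BTRFS_SUBVOL_NAME_MAX 4039

struct btrfs_ioctl_vol_args_v2 {
	__s64 fd;
	__u64 transid;
	__u64 flags;
	__u64 unused[4];
	char name[BTRFS_SUBVOL_NAME_MAX + 1];
};

#define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \
				      struct btrfs_ioctl_vol_args_v2)

int main(void)
{
	struct btrfs_ioctl_vol_args_v2 args;
	int destdir = open("/mnt", O_RDONLY);
	int src = open("/mnt/subvol", O_RDONLY);

	if (destdir < 0 || src < 0)
		return 1;

	memset(&args, 0, sizeof(args));
	args.fd = src;				/* subvolume to snapshot */
	args.flags = BTRFS_SUBVOL_RDONLY;	/* becomes pending->readonly */
	strcpy(args.name, "snap-ro");
	if (ioctl(destdir, BTRFS_IOC_SNAP_CREATE_V2, &args) < 0)
		perror("snapshot");
	return 0;
}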
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 6b9884507837..d158530233b7 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -22,6 +22,7 @@
#include <linux/blkdev.h>
#include <linux/random.h>
#include <linux/iocontext.h>
+#include <linux/capability.h>
#include <asm/div64.h>
#include "compat.h"
#include "ctree.h"
@@ -493,7 +494,7 @@ again:
continue;
if (device->bdev) {
- close_bdev_exclusive(device->bdev, device->mode);
+ blkdev_put(device->bdev, device->mode);
device->bdev = NULL;
fs_devices->open_devices--;
}
@@ -527,7 +528,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
list_for_each_entry(device, &fs_devices->devices, dev_list) {
if (device->bdev) {
- close_bdev_exclusive(device->bdev, device->mode);
+ blkdev_put(device->bdev, device->mode);
fs_devices->open_devices--;
}
if (device->writeable) {
@@ -584,13 +585,15 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
int seeding = 1;
int ret = 0;
+ flags |= FMODE_EXCL;
+
list_for_each_entry(device, head, dev_list) {
if (device->bdev)
continue;
if (!device->name)
continue;
- bdev = open_bdev_exclusive(device->name, flags, holder);
+ bdev = blkdev_get_by_path(device->name, flags, holder);
if (IS_ERR(bdev)) {
printk(KERN_INFO "open %s failed\n", device->name);
goto error;
@@ -598,8 +601,10 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
set_blocksize(bdev, 4096);
bh = btrfs_read_dev_super(bdev);
- if (!bh)
+ if (!bh) {
+ ret = -EINVAL;
goto error_close;
+ }
disk_super = (struct btrfs_super_block *)bh->b_data;
devid = btrfs_stack_device_id(&disk_super->dev_item);
@@ -642,7 +647,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
error_brelse:
brelse(bh);
error_close:
- close_bdev_exclusive(bdev, FMODE_READ);
+ blkdev_put(bdev, flags);
error:
continue;
}
@@ -688,7 +693,8 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
mutex_lock(&uuid_mutex);
- bdev = open_bdev_exclusive(path, flags, holder);
+ flags |= FMODE_EXCL;
+ bdev = blkdev_get_by_path(path, flags, holder);
if (IS_ERR(bdev)) {
ret = PTR_ERR(bdev);
@@ -700,7 +706,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
goto error_close;
bh = btrfs_read_dev_super(bdev);
if (!bh) {
- ret = -EIO;
+ ret = -EINVAL;
goto error_close;
}
disk_super = (struct btrfs_super_block *)bh->b_data;
@@ -720,65 +726,173 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
brelse(bh);
error_close:
- close_bdev_exclusive(bdev, flags);
+ blkdev_put(bdev, flags);
error:
mutex_unlock(&uuid_mutex);
return ret;
}
+/* helper to account for the device space used in the range [start, end] */
+int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
+ u64 end, u64 *length)
+{
+ struct btrfs_key key;
+ struct btrfs_root *root = device->dev_root;
+ struct btrfs_dev_extent *dev_extent;
+ struct btrfs_path *path;
+ u64 extent_end;
+ int ret;
+ int slot;
+ struct extent_buffer *l;
+
+ *length = 0;
+
+ if (start >= device->total_bytes)
+ return 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ path->reada = 2;
+
+ key.objectid = device->devid;
+ key.offset = start;
+ key.type = BTRFS_DEV_EXTENT_KEY;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ if (ret > 0) {
+ ret = btrfs_previous_item(root, path, key.objectid, key.type);
+ if (ret < 0)
+ goto out;
+ }
+
+ while (1) {
+ l = path->nodes[0];
+ slot = path->slots[0];
+ if (slot >= btrfs_header_nritems(l)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret == 0)
+ continue;
+ if (ret < 0)
+ goto out;
+
+ break;
+ }
+ btrfs_item_key_to_cpu(l, &key, slot);
+
+ if (key.objectid < device->devid)
+ goto next;
+
+ if (key.objectid > device->devid)
+ break;
+
+ if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
+ goto next;
+
+ dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
+ extent_end = key.offset + btrfs_dev_extent_length(l,
+ dev_extent);
+ if (key.offset <= start && extent_end > end) {
+ *length = end - start + 1;
+ break;
+ } else if (key.offset <= start && extent_end > start)
+ *length += extent_end - start;
+ else if (key.offset > start && extent_end <= end)
+ *length += extent_end - key.offset;
+ else if (key.offset > start && key.offset <= end) {
+ *length += end - key.offset + 1;
+ break;
+ } else if (key.offset > end)
+ break;
+
+next:
+ path->slots[0]++;
+ }
+ ret = 0;
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
/*
+ * find_free_dev_extent - find free space in the specified device
+ * @trans: transaction handle
+ * @device: the device in which we search for free space
+ * @num_bytes: the size of the free space that we need
+ * @start: store the start of the free space
+ * @len: the size of the free space that we find, or the size of the
+ * largest free extent if we don't find suitable free space
+ *
* this uses a pretty simple search, the expectation is that it is
* called very infrequently and that a given device has a small number
* of extents
+ *
+ * @start is used to store the start of the free space if we find one. But if
+ * we don't find suitable free space, it will be used to store the start
+ * position of the largest free extent.
+ *
+ * @len is used to store the size of the free space that we find. But if we
+ * don't find suitable free space, it is used to store the size of the
+ * largest free extent.
*/
int find_free_dev_extent(struct btrfs_trans_handle *trans,
struct btrfs_device *device, u64 num_bytes,
- u64 *start, u64 *max_avail)
+ u64 *start, u64 *len)
{
struct btrfs_key key;
struct btrfs_root *root = device->dev_root;
- struct btrfs_dev_extent *dev_extent = NULL;
+ struct btrfs_dev_extent *dev_extent;
struct btrfs_path *path;
- u64 hole_size = 0;
- u64 last_byte = 0;
- u64 search_start = 0;
+ u64 hole_size;
+ u64 max_hole_start;
+ u64 max_hole_size;
+ u64 extent_end;
+ u64 search_start;
u64 search_end = device->total_bytes;
int ret;
- int slot = 0;
- int start_found;
+ int slot;
struct extent_buffer *l;
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
- path->reada = 2;
- start_found = 0;
-
/* FIXME use last free of some kind */
/* we don't want to overwrite the superblock on the drive,
* so we make sure to start at an offset of at least 1MB
*/
- search_start = max((u64)1024 * 1024, search_start);
+ search_start = 1024 * 1024;
- if (root->fs_info->alloc_start + num_bytes <= device->total_bytes)
+ if (root->fs_info->alloc_start + num_bytes <= search_end)
search_start = max(root->fs_info->alloc_start, search_start);
+ max_hole_start = search_start;
+ max_hole_size = 0;
+
+ if (search_start >= search_end) {
+ ret = -ENOSPC;
+ goto error;
+ }
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto error;
+ }
+ path->reada = 2;
+
key.objectid = device->devid;
key.offset = search_start;
key.type = BTRFS_DEV_EXTENT_KEY;
+
ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
if (ret < 0)
- goto error;
+ goto out;
if (ret > 0) {
ret = btrfs_previous_item(root, path, key.objectid, key.type);
if (ret < 0)
- goto error;
- if (ret > 0)
- start_found = 1;
+ goto out;
}
- l = path->nodes[0];
- btrfs_item_key_to_cpu(l, &key, path->slots[0]);
+
while (1) {
l = path->nodes[0];
slot = path->slots[0];
@@ -787,24 +901,9 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans,
if (ret == 0)
continue;
if (ret < 0)
- goto error;
-no_more_items:
- if (!start_found) {
- if (search_start >= search_end) {
- ret = -ENOSPC;
- goto error;
- }
- *start = search_start;
- start_found = 1;
- goto check_pending;
- }
- *start = last_byte > search_start ?
- last_byte : search_start;
- if (search_end <= *start) {
- ret = -ENOSPC;
- goto error;
- }
- goto check_pending;
+ goto out;
+
+ break;
}
btrfs_item_key_to_cpu(l, &key, slot);
@@ -812,48 +911,62 @@ no_more_items:
goto next;
if (key.objectid > device->devid)
- goto no_more_items;
+ break;
- if (key.offset >= search_start && key.offset > last_byte &&
- start_found) {
- if (last_byte < search_start)
- last_byte = search_start;
- hole_size = key.offset - last_byte;
+ if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
+ goto next;
- if (hole_size > *max_avail)
- *max_avail = hole_size;
+ if (key.offset > search_start) {
+ hole_size = key.offset - search_start;
- if (key.offset > last_byte &&
- hole_size >= num_bytes) {
- *start = last_byte;
- goto check_pending;
+ if (hole_size > max_hole_size) {
+ max_hole_start = search_start;
+ max_hole_size = hole_size;
+ }
+
+ /*
+ * If this free space is greater than what we need, it
+ * must be the largest free space that we have found so
+ * far, so max_hole_start points to the start of this
+ * free space and its length is stored in max_hole_size.
+ * Thus, we return max_hole_start and max_hole_size and
+ * go back to the caller.
+ */
+ if (hole_size >= num_bytes) {
+ ret = 0;
+ goto out;
}
}
- if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
- goto next;
- start_found = 1;
dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
- last_byte = key.offset + btrfs_dev_extent_length(l, dev_extent);
+ extent_end = key.offset + btrfs_dev_extent_length(l,
+ dev_extent);
+ if (extent_end > search_start)
+ search_start = extent_end;
next:
path->slots[0]++;
cond_resched();
}
-check_pending:
- /* we have to make sure we didn't find an extent that has already
- * been allocated by the map tree or the original allocation
- */
- BUG_ON(*start < search_start);
- if (*start + num_bytes > search_end) {
- ret = -ENOSPC;
- goto error;
+ hole_size = search_end - search_start;
+ if (hole_size > max_hole_size) {
+ max_hole_start = search_start;
+ max_hole_size = hole_size;
}
- /* check for pending inserts here */
- ret = 0;
-error:
+ /* See above. */
+ if (hole_size < num_bytes)
+ ret = -ENOSPC;
+ else
+ ret = 0;
+
+out:
btrfs_free_path(path);
+error:
+ *start = max_hole_start;
+ if (len)
+ *len = max_hole_size;
return ret;
}
@@ -1183,8 +1296,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
goto out;
}
} else {
- bdev = open_bdev_exclusive(device_path, FMODE_READ,
- root->fs_info->bdev_holder);
+ bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL,
+ root->fs_info->bdev_holder);
if (IS_ERR(bdev)) {
ret = PTR_ERR(bdev);
goto out;
@@ -1193,7 +1306,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
set_blocksize(bdev, 4096);
bh = btrfs_read_dev_super(bdev);
if (!bh) {
- ret = -EIO;
+ ret = -EINVAL;
goto error_close;
}
disk_super = (struct btrfs_super_block *)bh->b_data;
@@ -1251,7 +1364,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
root->fs_info->fs_devices->latest_bdev = next_device->bdev;
if (device->bdev) {
- close_bdev_exclusive(device->bdev, device->mode);
+ blkdev_put(device->bdev, device->mode);
device->bdev = NULL;
device->fs_devices->open_devices--;
}
@@ -1294,7 +1407,7 @@ error_brelse:
brelse(bh);
error_close:
if (bdev)
- close_bdev_exclusive(bdev, FMODE_READ);
+ blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
out:
mutex_unlock(&root->fs_info->volume_mutex);
mutex_unlock(&uuid_mutex);
@@ -1446,7 +1559,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
return -EINVAL;
- bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder);
+ bdev = blkdev_get_by_path(device_path, FMODE_EXCL,
+ root->fs_info->bdev_holder);
if (IS_ERR(bdev))
return PTR_ERR(bdev);
@@ -1572,7 +1686,7 @@ out:
mutex_unlock(&root->fs_info->volume_mutex);
return ret;
error:
- close_bdev_exclusive(bdev, 0);
+ blkdev_put(bdev, FMODE_EXCL);
if (seeding_dev) {
mutex_unlock(&uuid_mutex);
up_write(&sb->s_umount);
@@ -1912,6 +2026,9 @@ int btrfs_balance(struct btrfs_root *dev_root)
if (dev_root->fs_info->sb->s_flags & MS_RDONLY)
return -EROFS;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
mutex_lock(&dev_root->fs_info->volume_mutex);
dev_root = dev_root->fs_info->dev_root;
@@ -2150,66 +2267,67 @@ static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size,
return calc_size * num_stripes;
}
-static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
- struct btrfs_root *extent_root,
- struct map_lookup **map_ret,
- u64 *num_bytes, u64 *stripe_size,
- u64 start, u64 type)
+/* Used to sort the devices by max_avail (descending sort) */
+int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2)
{
- struct btrfs_fs_info *info = extent_root->fs_info;
- struct btrfs_device *device = NULL;
- struct btrfs_fs_devices *fs_devices = info->fs_devices;
- struct list_head *cur;
- struct map_lookup *map = NULL;
- struct extent_map_tree *em_tree;
- struct extent_map *em;
- struct list_head private_devs;
- int min_stripe_size = 1 * 1024 * 1024;
- u64 calc_size = 1024 * 1024 * 1024;
- u64 max_chunk_size = calc_size;
- u64 min_free;
- u64 avail;
- u64 max_avail = 0;
- u64 dev_offset;
- int num_stripes = 1;
- int min_stripes = 1;
- int sub_stripes = 0;
- int looped = 0;
- int ret;
- int index;
- int stripe_len = 64 * 1024;
+ if (((struct btrfs_device_info *)dev_info1)->max_avail >
+ ((struct btrfs_device_info *)dev_info2)->max_avail)
+ return -1;
+ else if (((struct btrfs_device_info *)dev_info1)->max_avail <
+ ((struct btrfs_device_info *)dev_info2)->max_avail)
+ return 1;
+ else
+ return 0;
+}
- if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
- (type & BTRFS_BLOCK_GROUP_DUP)) {
- WARN_ON(1);
- type &= ~BTRFS_BLOCK_GROUP_DUP;
- }
- if (list_empty(&fs_devices->alloc_list))
- return -ENOSPC;
+static int __btrfs_calc_nstripes(struct btrfs_fs_devices *fs_devices, u64 type,
+ int *num_stripes, int *min_stripes,
+ int *sub_stripes)
+{
+ *num_stripes = 1;
+ *min_stripes = 1;
+ *sub_stripes = 0;
if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
- num_stripes = fs_devices->rw_devices;
- min_stripes = 2;
+ *num_stripes = fs_devices->rw_devices;
+ *min_stripes = 2;
}
if (type & (BTRFS_BLOCK_GROUP_DUP)) {
- num_stripes = 2;
- min_stripes = 2;
+ *num_stripes = 2;
+ *min_stripes = 2;
}
if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
if (fs_devices->rw_devices < 2)
return -ENOSPC;
- num_stripes = 2;
- min_stripes = 2;
+ *num_stripes = 2;
+ *min_stripes = 2;
}
if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
- num_stripes = fs_devices->rw_devices;
- if (num_stripes < 4)
+ *num_stripes = fs_devices->rw_devices;
+ if (*num_stripes < 4)
return -ENOSPC;
- num_stripes &= ~(u32)1;
- sub_stripes = 2;
- min_stripes = 4;
+ *num_stripes &= ~(u32)1;
+ *sub_stripes = 2;
+ *min_stripes = 4;
}
+ return 0;
+}
+
+static u64 __btrfs_calc_stripe_size(struct btrfs_fs_devices *fs_devices,
+ u64 proposed_size, u64 type,
+ int num_stripes, int small_stripe)
+{
+ int min_stripe_size = 1 * 1024 * 1024;
+ u64 calc_size = proposed_size;
+ u64 max_chunk_size = calc_size;
+ int ncopies = 1;
+
+ if (type & (BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_DUP |
+ BTRFS_BLOCK_GROUP_RAID10))
+ ncopies = 2;
+
if (type & BTRFS_BLOCK_GROUP_DATA) {
max_chunk_size = 10 * calc_size;
min_stripe_size = 64 * 1024 * 1024;
@@ -2226,51 +2344,209 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
max_chunk_size);
-again:
- max_avail = 0;
- if (!map || map->num_stripes != num_stripes) {
- kfree(map);
- map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
- if (!map)
- return -ENOMEM;
- map->num_stripes = num_stripes;
- }
-
- if (calc_size * num_stripes > max_chunk_size) {
- calc_size = max_chunk_size;
+ if (calc_size * num_stripes > max_chunk_size * ncopies) {
+ calc_size = max_chunk_size * ncopies;
do_div(calc_size, num_stripes);
- do_div(calc_size, stripe_len);
- calc_size *= stripe_len;
+ do_div(calc_size, BTRFS_STRIPE_LEN);
+ calc_size *= BTRFS_STRIPE_LEN;
}
/* we don't want tiny stripes */
- if (!looped)
+ if (!small_stripe)
calc_size = max_t(u64, min_stripe_size, calc_size);
/*
- * we're about to do_div by the stripe_len so lets make sure
+ * we're about to do_div by BTRFS_STRIPE_LEN so let's make sure
* we end up with something bigger than a stripe
*/
- calc_size = max_t(u64, calc_size, stripe_len * 4);
+ calc_size = max_t(u64, calc_size, BTRFS_STRIPE_LEN);
+
+ do_div(calc_size, BTRFS_STRIPE_LEN);
+ calc_size *= BTRFS_STRIPE_LEN;
+
+ return calc_size;
+}
+
+static struct map_lookup *__shrink_map_lookup_stripes(struct map_lookup *map,
+ int num_stripes)
+{
+ struct map_lookup *new;
+ size_t len = map_lookup_size(num_stripes);
+
+ BUG_ON(map->num_stripes < num_stripes);
+
+ if (map->num_stripes == num_stripes)
+ return map;
+
+ new = kmalloc(len, GFP_NOFS);
+ if (!new) {
+ /* just change map->num_stripes */
+ map->num_stripes = num_stripes;
+ return map;
+ }
+
+ memcpy(new, map, len);
+ new->num_stripes = num_stripes;
+ kfree(map);
+ return new;
+}
+
+/*
+ * helper to allocate device space from btrfs_device_info, in which the max
+ * free space information of every device is stored. It is used when we
+ * cannot allocate chunks at the default size.
+ *
+ * With this helper, we can allocate a new chunk that is as large as possible.
+ */
+static int __btrfs_alloc_tiny_space(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_devices *fs_devices,
+ struct btrfs_device_info *devices,
+ int nr_device, u64 type,
+ struct map_lookup **map_lookup,
+ int min_stripes, u64 *stripe_size)
+{
+ int i, index, sort_again = 0;
+ int min_devices = min_stripes;
+ u64 max_avail, min_free;
+ struct map_lookup *map = *map_lookup;
+ int ret;
+
+ if (nr_device < min_stripes)
+ return -ENOSPC;
+
+ btrfs_descending_sort_devices(devices, nr_device);
- do_div(calc_size, stripe_len);
- calc_size *= stripe_len;
+ max_avail = devices[0].max_avail;
+ if (!max_avail)
+ return -ENOSPC;
+
+ for (i = 0; i < nr_device; i++) {
+ /*
+ * if dev_offset = 0, the free space of this device is less
+ * than what we need, and we haven't yet searched for the max
+ * avail extent on this device, so do it now.
+ */
+ if (!devices[i].dev_offset) {
+ ret = find_free_dev_extent(trans, devices[i].dev,
+ max_avail,
+ &devices[i].dev_offset,
+ &devices[i].max_avail);
+ if (ret != 0 && ret != -ENOSPC)
+ return ret;
+ sort_again = 1;
+ }
+ }
+
+ /* we updated the max avail free extent of each device, so sort again */
+ if (sort_again)
+ btrfs_descending_sort_devices(devices, nr_device);
+
+ if (type & BTRFS_BLOCK_GROUP_DUP)
+ min_devices = 1;
+
+ if (!devices[min_devices - 1].max_avail)
+ return -ENOSPC;
+
+ max_avail = devices[min_devices - 1].max_avail;
+ if (type & BTRFS_BLOCK_GROUP_DUP)
+ do_div(max_avail, 2);
+
+ max_avail = __btrfs_calc_stripe_size(fs_devices, max_avail, type,
+ min_stripes, 1);
+ if (type & BTRFS_BLOCK_GROUP_DUP)
+ min_free = max_avail * 2;
+ else
+ min_free = max_avail;
+
+ if (min_free > devices[min_devices - 1].max_avail)
+ return -ENOSPC;
+
+ map = __shrink_map_lookup_stripes(map, min_stripes);
+ *stripe_size = max_avail;
+
+ index = 0;
+ for (i = 0; i < min_stripes; i++) {
+ map->stripes[i].dev = devices[index].dev;
+ map->stripes[i].physical = devices[index].dev_offset;
+ if (type & BTRFS_BLOCK_GROUP_DUP) {
+ i++;
+ map->stripes[i].dev = devices[index].dev;
+ map->stripes[i].physical = devices[index].dev_offset +
+ max_avail;
+ }
+ index++;
+ }
+ *map_lookup = map;
+
+ return 0;
+}
+
+static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
+ struct btrfs_root *extent_root,
+ struct map_lookup **map_ret,
+ u64 *num_bytes, u64 *stripe_size,
+ u64 start, u64 type)
+{
+ struct btrfs_fs_info *info = extent_root->fs_info;
+ struct btrfs_device *device = NULL;
+ struct btrfs_fs_devices *fs_devices = info->fs_devices;
+ struct list_head *cur;
+ struct map_lookup *map;
+ struct extent_map_tree *em_tree;
+ struct extent_map *em;
+ struct btrfs_device_info *devices_info;
+ struct list_head private_devs;
+ u64 calc_size = 1024 * 1024 * 1024;
+ u64 min_free;
+ u64 avail;
+ u64 dev_offset;
+ int num_stripes;
+ int min_stripes;
+ int sub_stripes;
+ int min_devices; /* the min number of devices we need */
+ int i;
+ int ret;
+ int index;
+
+ if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
+ (type & BTRFS_BLOCK_GROUP_DUP)) {
+ WARN_ON(1);
+ type &= ~BTRFS_BLOCK_GROUP_DUP;
+ }
+ if (list_empty(&fs_devices->alloc_list))
+ return -ENOSPC;
+
+ ret = __btrfs_calc_nstripes(fs_devices, type, &num_stripes,
+ &min_stripes, &sub_stripes);
+ if (ret)
+ return ret;
+
+ devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices,
+ GFP_NOFS);
+ if (!devices_info)
+ return -ENOMEM;
+
+ map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
+ if (!map) {
+ ret = -ENOMEM;
+ goto error;
+ }
+ map->num_stripes = num_stripes;
cur = fs_devices->alloc_list.next;
index = 0;
+ i = 0;
- if (type & BTRFS_BLOCK_GROUP_DUP)
+ calc_size = __btrfs_calc_stripe_size(fs_devices, calc_size, type,
+ num_stripes, 0);
+
+ if (type & BTRFS_BLOCK_GROUP_DUP) {
min_free = calc_size * 2;
- else
+ min_devices = 1;
+ } else {
min_free = calc_size;
-
- /*
- * we add 1MB because we never use the first 1MB of the device, unless
- * we've looped, then we are likely allocating the maximum amount of
- * space left already
- */
- if (!looped)
- min_free += 1024 * 1024;
+ min_devices = min_stripes;
+ }
INIT_LIST_HEAD(&private_devs);
while (index < num_stripes) {
@@ -2283,27 +2559,39 @@ again:
cur = cur->next;
if (device->in_fs_metadata && avail >= min_free) {
- ret = find_free_dev_extent(trans, device,
- min_free, &dev_offset,
- &max_avail);
+ ret = find_free_dev_extent(trans, device, min_free,
+ &devices_info[i].dev_offset,
+ &devices_info[i].max_avail);
if (ret == 0) {
list_move_tail(&device->dev_alloc_list,
&private_devs);
map->stripes[index].dev = device;
- map->stripes[index].physical = dev_offset;
+ map->stripes[index].physical =
+ devices_info[i].dev_offset;
index++;
if (type & BTRFS_BLOCK_GROUP_DUP) {
map->stripes[index].dev = device;
map->stripes[index].physical =
- dev_offset + calc_size;
+ devices_info[i].dev_offset +
+ calc_size;
index++;
}
- }
- } else if (device->in_fs_metadata && avail > max_avail)
- max_avail = avail;
+ } else if (ret != -ENOSPC)
+ goto error;
+
+ devices_info[i].dev = device;
+ i++;
+ } else if (device->in_fs_metadata &&
+ avail >= BTRFS_STRIPE_LEN) {
+ devices_info[i].dev = device;
+ devices_info[i].max_avail = avail;
+ i++;
+ }
+
if (cur == &fs_devices->alloc_list)
break;
}
+
list_splice(&private_devs, &fs_devices->alloc_list);
if (index < num_stripes) {
if (index >= min_stripes) {
@@ -2312,34 +2600,36 @@ again:
num_stripes /= sub_stripes;
num_stripes *= sub_stripes;
}
- looped = 1;
- goto again;
- }
- if (!looped && max_avail > 0) {
- looped = 1;
- calc_size = max_avail;
- goto again;
+
+ map = __shrink_map_lookup_stripes(map, num_stripes);
+ } else if (i >= min_devices) {
+ ret = __btrfs_alloc_tiny_space(trans, fs_devices,
+ devices_info, i, type,
+ &map, min_stripes,
+ &calc_size);
+ if (ret)
+ goto error;
+ } else {
+ ret = -ENOSPC;
+ goto error;
}
- kfree(map);
- return -ENOSPC;
}
map->sector_size = extent_root->sectorsize;
- map->stripe_len = stripe_len;
- map->io_align = stripe_len;
- map->io_width = stripe_len;
+ map->stripe_len = BTRFS_STRIPE_LEN;
+ map->io_align = BTRFS_STRIPE_LEN;
+ map->io_width = BTRFS_STRIPE_LEN;
map->type = type;
- map->num_stripes = num_stripes;
map->sub_stripes = sub_stripes;
*map_ret = map;
*stripe_size = calc_size;
*num_bytes = chunk_bytes_by_type(type, calc_size,
- num_stripes, sub_stripes);
+ map->num_stripes, sub_stripes);
em = alloc_extent_map(GFP_NOFS);
if (!em) {
- kfree(map);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto error;
}
em->bdev = (struct block_device *)map;
em->start = start;
@@ -2372,7 +2662,13 @@ again:
index++;
}
+ kfree(devices_info);
return 0;
+
+error:
+ kfree(map);
+ kfree(devices_info);
+ return ret;
}
static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
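The rewritten find_free_dev_extent() reduces to one ordered walk over the device's allocated extents: track the largest hole seen, return early on the first hole that is big enough, and otherwise report the largest hole through @start/@len. A stand-alone sketch of that search under hypothetical data, with -1 standing in for -ENOSPC:

/*
 * Sketch only: the max-hole search, outside the btree machinery.
 */
#include <stdint.h>
#include <stdio.h>

struct extent { uint64_t start, len; };	/* sorted by start, non-overlapping */

static int find_hole(const struct extent *e, int n, uint64_t dev_size,
		     uint64_t num_bytes, uint64_t *start, uint64_t *len)
{
	uint64_t search = 1024 * 1024;	/* never touch the first 1MB */
	uint64_t max_start = search, max_len = 0, hole;
	int i;

	for (i = 0; i < n; i++) {
		if (e[i].start > search) {
			hole = e[i].start - search;
			if (hole > max_len) {
				max_start = search;
				max_len = hole;
			}
			if (hole >= num_bytes)
				goto out;	/* first big-enough hole wins */
		}
		if (e[i].start + e[i].len > search)
			search = e[i].start + e[i].len;
	}
	/* trailing hole between the last extent and the end of the device */
	hole = dev_size > search ? dev_size - search : 0;
	if (hole > max_len) {
		max_start = search;
		max_len = hole;
	}
out:
	*start = max_start;
	*len = max_len;
	return max_len >= num_bytes ? 0 : -1;	/* -1 plays the role of -ENOSPC */
}

int main(void)
{
	struct extent used[] = { { 1 << 20, 4 << 20 }, { 8 << 20, 2 << 20 } };
	uint64_t start, len;

	if (find_hole(used, 2, 64ULL << 20, 3 << 20, &start, &len) == 0)
		printf("hole at %llu, %llu bytes\n",
		       (unsigned long long)start, (unsigned long long)len);
	return 0;
}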
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 2740db49eb04..7fb59d45fe8c 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -20,8 +20,11 @@
#define __BTRFS_VOLUMES_
#include <linux/bio.h>
+#include <linux/sort.h>
#include "async-thread.h"
+#define BTRFS_STRIPE_LEN (64 * 1024)
+
struct buffer_head;
struct btrfs_pending_bios {
struct bio *head;
@@ -50,7 +53,7 @@ struct btrfs_device {
struct block_device *bdev;
- /* the mode sent to open_bdev_exclusive */
+ /* the mode sent to blkdev_get */
fmode_t mode;
char *name;
@@ -136,6 +139,30 @@ struct btrfs_multi_bio {
struct btrfs_bio_stripe stripes[];
};
+struct btrfs_device_info {
+ struct btrfs_device *dev;
+ u64 dev_offset;
+ u64 max_avail;
+};
+
+/* Used to sort the devices by max_avail (descending sort) */
+int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2);
+
+/*
+ * sort the devices in descending order by max_avail, in which the max free
+ * extent size of each device is stored
+ */
+static inline void btrfs_descending_sort_devices(
+ struct btrfs_device_info *devices,
+ size_t nr_devices)
+{
+ sort(devices, nr_devices, sizeof(struct btrfs_device_info),
+ btrfs_cmp_device_free_bytes, NULL);
+}
+
+int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
+ u64 end, u64 *length);
+
#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \
(sizeof(struct btrfs_bio_stripe) * (n)))
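btrfs_cmp_device_free_bytes() follows the usual sort() comparator contract with the sign inverted to obtain a descending order. The same contract, demonstrated in user space with qsort(); the data is made up:

/*
 * Sketch only: a descending comparator analogous to the one above.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct info { uint64_t max_avail; };

/* negative when a sorts first, i.e. when its max_avail is larger */
static int cmp_desc(const void *a, const void *b)
{
	uint64_t x = ((const struct info *)a)->max_avail;
	uint64_t y = ((const struct info *)b)->max_avail;

	return x > y ? -1 : (x < y ? 1 : 0);
}

int main(void)
{
	struct info v[] = { { 10 }, { 30 }, { 20 } };
	int i;

	qsort(v, 3, sizeof(v[0]), cmp_desc);
	for (i = 0; i < 3; i++)	/* prints 30, 20, 10 */
		printf("%llu\n", (unsigned long long)v[i].max_avail);
	return 0;
}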
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 698fdd2c739c..a5776531dc2b 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -316,6 +316,15 @@ ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
size_t size, int flags)
{
+ struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root;
+
+ /*
+ * Permissions on security.* and system.* attributes are not
+ * checked in permission().
+ */
+ if (btrfs_root_readonly(root))
+ return -EROFS;
+
/*
* If this is a request for a synthetic attribute in the system.*
* namespace use the generic infrastructure to resolve a handler
@@ -336,6 +345,15 @@ int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
int btrfs_removexattr(struct dentry *dentry, const char *name)
{
+ struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root;
+
+ /*
+ * Permissions on security.* and system.* attributes are not
+ * checked in permission().
+ */
+ if (btrfs_root_readonly(root))
+ return -EROFS;
+
/*
* If this is a request for a synthetic attribute in the system.*
* namespace use the generic infrastructure to resolve a handler
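The observable effect of the two guards above: on a read-only subvolume, xattr updates fail with EROFS even when the filesystem itself is mounted read-write. A hedged sketch with a hypothetical path:

/*
 * Sketch only: expect EROFS from setxattr() inside a read-only subvolume.
 */
#include <errno.h>
#include <stdio.h>
#include <sys/xattr.h>

int main(void)
{
	if (setxattr("/mnt/ro-subvol/file", "user.test", "1", 1, 0) < 0 &&
	    errno == EROFS)
		fprintf(stderr, "read-only subvolume, as expected\n");
	return 0;
}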
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index b9cd5445f71c..f5ec2d44150d 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -32,15 +32,6 @@
#include <linux/bio.h>
#include "compression.h"
-/* Plan: call deflate() with avail_in == *sourcelen,
- avail_out = *dstlen - 12 and flush == Z_FINISH.
- If it doesn't manage to finish, call it again with
- avail_in == 0 and avail_out set to the remaining 12
- bytes for it to clean up.
- Q: Is 12 bytes sufficient?
-*/
-#define STREAM_END_SPACE 12
-
struct workspace {
z_stream inf_strm;
z_stream def_strm;
@@ -48,152 +39,51 @@ struct workspace {
struct list_head list;
};
-static LIST_HEAD(idle_workspace);
-static DEFINE_SPINLOCK(workspace_lock);
-static unsigned long num_workspace;
-static atomic_t alloc_workspace = ATOMIC_INIT(0);
-static DECLARE_WAIT_QUEUE_HEAD(workspace_wait);
+static void zlib_free_workspace(struct list_head *ws)
+{
+ struct workspace *workspace = list_entry(ws, struct workspace, list);
-/*
- * this finds an available zlib workspace or allocates a new one
- * NULL or an ERR_PTR is returned if things go bad.
- */
-static struct workspace *find_zlib_workspace(void)
+ vfree(workspace->def_strm.workspace);
+ vfree(workspace->inf_strm.workspace);
+ kfree(workspace->buf);
+ kfree(workspace);
+}
+
+static struct list_head *zlib_alloc_workspace(void)
{
struct workspace *workspace;
- int ret;
- int cpus = num_online_cpus();
-
-again:
- spin_lock(&workspace_lock);
- if (!list_empty(&idle_workspace)) {
- workspace = list_entry(idle_workspace.next, struct workspace,
- list);
- list_del(&workspace->list);
- num_workspace--;
- spin_unlock(&workspace_lock);
- return workspace;
- }
- spin_unlock(&workspace_lock);
- if (atomic_read(&alloc_workspace) > cpus) {
- DEFINE_WAIT(wait);
- prepare_to_wait(&workspace_wait, &wait, TASK_UNINTERRUPTIBLE);
- if (atomic_read(&alloc_workspace) > cpus)
- schedule();
- finish_wait(&workspace_wait, &wait);
- goto again;
- }
- atomic_inc(&alloc_workspace);
workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
- if (!workspace) {
- ret = -ENOMEM;
- goto fail;
- }
+ if (!workspace)
+ return ERR_PTR(-ENOMEM);
workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize());
- if (!workspace->def_strm.workspace) {
- ret = -ENOMEM;
- goto fail;
- }
workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize());
- if (!workspace->inf_strm.workspace) {
- ret = -ENOMEM;
- goto fail_inflate;
- }
workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS);
- if (!workspace->buf) {
- ret = -ENOMEM;
- goto fail_kmalloc;
- }
- return workspace;
-
-fail_kmalloc:
- vfree(workspace->inf_strm.workspace);
-fail_inflate:
- vfree(workspace->def_strm.workspace);
-fail:
- kfree(workspace);
- atomic_dec(&alloc_workspace);
- wake_up(&workspace_wait);
- return ERR_PTR(ret);
-}
-
-/*
- * put a workspace struct back on the list or free it if we have enough
- * idle ones sitting around
- */
-static int free_workspace(struct workspace *workspace)
-{
- spin_lock(&workspace_lock);
- if (num_workspace < num_online_cpus()) {
- list_add_tail(&workspace->list, &idle_workspace);
- num_workspace++;
- spin_unlock(&workspace_lock);
- if (waitqueue_active(&workspace_wait))
- wake_up(&workspace_wait);
- return 0;
- }
- spin_unlock(&workspace_lock);
- vfree(workspace->def_strm.workspace);
- vfree(workspace->inf_strm.workspace);
- kfree(workspace->buf);
- kfree(workspace);
+ if (!workspace->def_strm.workspace ||
+ !workspace->inf_strm.workspace || !workspace->buf)
+ goto fail;
- atomic_dec(&alloc_workspace);
- if (waitqueue_active(&workspace_wait))
- wake_up(&workspace_wait);
- return 0;
-}
+ INIT_LIST_HEAD(&workspace->list);
-/*
- * cleanup function for module exit
- */
-static void free_workspaces(void)
-{
- struct workspace *workspace;
- while (!list_empty(&idle_workspace)) {
- workspace = list_entry(idle_workspace.next, struct workspace,
- list);
- list_del(&workspace->list);
- vfree(workspace->def_strm.workspace);
- vfree(workspace->inf_strm.workspace);
- kfree(workspace->buf);
- kfree(workspace);
- atomic_dec(&alloc_workspace);
- }
+ return &workspace->list;
+fail:
+ zlib_free_workspace(&workspace->list);
+ return ERR_PTR(-ENOMEM);
}
-/*
- * given an address space and start/len, compress the bytes.
- *
- * pages are allocated to hold the compressed result and stored
- * in 'pages'
- *
- * out_pages is used to return the number of pages allocated. There
- * may be pages allocated even if we return an error
- *
- * total_in is used to return the number of bytes actually read. It
- * may be smaller then len if we had to exit early because we
- * ran out of room in the pages array or because we cross the
- * max_out threshold.
- *
- * total_out is used to return the total number of compressed bytes
- *
- * max_out tells us the max number of bytes that we're allowed to
- * stuff into pages
- */
-int btrfs_zlib_compress_pages(struct address_space *mapping,
- u64 start, unsigned long len,
- struct page **pages,
- unsigned long nr_dest_pages,
- unsigned long *out_pages,
- unsigned long *total_in,
- unsigned long *total_out,
- unsigned long max_out)
+static int zlib_compress_pages(struct list_head *ws,
+ struct address_space *mapping,
+ u64 start, unsigned long len,
+ struct page **pages,
+ unsigned long nr_dest_pages,
+ unsigned long *out_pages,
+ unsigned long *total_in,
+ unsigned long *total_out,
+ unsigned long max_out)
{
+ struct workspace *workspace = list_entry(ws, struct workspace, list);
int ret;
- struct workspace *workspace;
char *data_in;
char *cpage_out;
int nr_pages = 0;
@@ -205,10 +95,6 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
*total_out = 0;
*total_in = 0;
- workspace = find_zlib_workspace();
- if (IS_ERR(workspace))
- return -1;
-
if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
printk(KERN_WARNING "deflateInit failed\n");
ret = -1;
@@ -222,6 +108,10 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
data_in = kmap(in_page);
out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ if (out_page == NULL) {
+ ret = -1;
+ goto out;
+ }
cpage_out = kmap(out_page);
pages[0] = out_page;
nr_pages = 1;
@@ -260,6 +150,10 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
goto out;
}
out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ if (out_page == NULL) {
+ ret = -1;
+ goto out;
+ }
cpage_out = kmap(out_page);
pages[nr_pages] = out_page;
nr_pages++;
@@ -314,55 +208,26 @@ out:
kunmap(in_page);
page_cache_release(in_page);
}
- free_workspace(workspace);
return ret;
}
-/*
- * pages_in is an array of pages with compressed data.
- *
- * disk_start is the starting logical offset of this array in the file
- *
- * bvec is a bio_vec of pages from the file that we want to decompress into
- *
- * vcnt is the count of pages in the biovec
- *
- * srclen is the number of bytes in pages_in
- *
- * The basic idea is that we have a bio that was created by readpages.
- * The pages in the bio are for the uncompressed data, and they may not
- * be contiguous. They all correspond to the range of bytes covered by
- * the compressed extent.
- */
-int btrfs_zlib_decompress_biovec(struct page **pages_in,
- u64 disk_start,
- struct bio_vec *bvec,
- int vcnt,
- size_t srclen)
+static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
+ u64 disk_start,
+ struct bio_vec *bvec,
+ int vcnt,
+ size_t srclen)
{
- int ret = 0;
+ struct workspace *workspace = list_entry(ws, struct workspace, list);
+ int ret = 0, ret2;
int wbits = MAX_WBITS;
- struct workspace *workspace;
char *data_in;
size_t total_out = 0;
- unsigned long page_bytes_left;
unsigned long page_in_index = 0;
unsigned long page_out_index = 0;
- struct page *page_out;
unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) /
PAGE_CACHE_SIZE;
unsigned long buf_start;
- unsigned long buf_offset;
- unsigned long bytes;
- unsigned long working_bytes;
unsigned long pg_offset;
- unsigned long start_byte;
- unsigned long current_buf_start;
- char *kaddr;
-
- workspace = find_zlib_workspace();
- if (IS_ERR(workspace))
- return -ENOMEM;
data_in = kmap(pages_in[page_in_index]);
workspace->inf_strm.next_in = data_in;
@@ -372,8 +237,6 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in,
workspace->inf_strm.total_out = 0;
workspace->inf_strm.next_out = workspace->buf;
workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
- page_out = bvec[page_out_index].bv_page;
- page_bytes_left = PAGE_CACHE_SIZE;
pg_offset = 0;
/* If it's deflate, and it's got no preset dictionary, then
@@ -389,107 +252,29 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in,
if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
printk(KERN_WARNING "inflateInit failed\n");
- ret = -1;
- goto out;
+ kunmap(pages_in[page_in_index]);
+ return -1;
}
while (workspace->inf_strm.total_in < srclen) {
ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
if (ret != Z_OK && ret != Z_STREAM_END)
break;
- /*
- * buf start is the byte offset we're of the start of
- * our workspace buffer
- */
- buf_start = total_out;
- /* total_out is the last byte of the workspace buffer */
+ buf_start = total_out;
total_out = workspace->inf_strm.total_out;
- working_bytes = total_out - buf_start;
-
- /*
- * start byte is the first byte of the page we're currently
- * copying into relative to the start of the compressed data.
- */
- start_byte = page_offset(page_out) - disk_start;
-
- if (working_bytes == 0) {
- /* we didn't make progress in this inflate
- * call, we're done
- */
- if (ret != Z_STREAM_END)
- ret = -1;
+ /* we didn't make progress in this inflate call, we're done */
+ if (buf_start == total_out)
break;
- }
- /* we haven't yet hit data corresponding to this page */
- if (total_out <= start_byte)
- goto next;
-
- /*
- * the start of the data we care about is offset into
- * the middle of our working buffer
- */
- if (total_out > start_byte && buf_start < start_byte) {
- buf_offset = start_byte - buf_start;
- working_bytes -= buf_offset;
- } else {
- buf_offset = 0;
- }
- current_buf_start = buf_start;
-
- /* copy bytes from the working buffer into the pages */
- while (working_bytes > 0) {
- bytes = min(PAGE_CACHE_SIZE - pg_offset,
- PAGE_CACHE_SIZE - buf_offset);
- bytes = min(bytes, working_bytes);
- kaddr = kmap_atomic(page_out, KM_USER0);
- memcpy(kaddr + pg_offset, workspace->buf + buf_offset,
- bytes);
- kunmap_atomic(kaddr, KM_USER0);
- flush_dcache_page(page_out);
-
- pg_offset += bytes;
- page_bytes_left -= bytes;
- buf_offset += bytes;
- working_bytes -= bytes;
- current_buf_start += bytes;
-
- /* check if we need to pick another page */
- if (page_bytes_left == 0) {
- page_out_index++;
- if (page_out_index >= vcnt) {
- ret = 0;
- goto done;
- }
-
- page_out = bvec[page_out_index].bv_page;
- pg_offset = 0;
- page_bytes_left = PAGE_CACHE_SIZE;
- start_byte = page_offset(page_out) - disk_start;
-
- /*
- * make sure our new page is covered by this
- * working buffer
- */
- if (total_out <= start_byte)
- goto next;
-
- /* the next page in the biovec might not
- * be adjacent to the last page, but it
- * might still be found inside this working
- * buffer. bump our offset pointer
- */
- if (total_out > start_byte &&
- current_buf_start < start_byte) {
- buf_offset = start_byte - buf_start;
- working_bytes = total_out - start_byte;
- current_buf_start = buf_start +
- buf_offset;
- }
- }
+ ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start,
+ total_out, disk_start,
+ bvec, vcnt,
+ &page_out_index, &pg_offset);
+ if (ret2 == 0) {
+ ret = 0;
+ goto done;
}
-next:
+
workspace->inf_strm.next_out = workspace->buf;
workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
@@ -516,35 +301,21 @@ done:
zlib_inflateEnd(&workspace->inf_strm);
if (data_in)
kunmap(pages_in[page_in_index]);
-out:
- free_workspace(workspace);
return ret;
}
-/*
- * a less complex decompression routine. Our compressed data fits in a
- * single page, and we want to read a single page out of it.
- * start_byte tells us the offset into the compressed data we're interested in
- */
-int btrfs_zlib_decompress(unsigned char *data_in,
- struct page *dest_page,
- unsigned long start_byte,
- size_t srclen, size_t destlen)
+static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
+ struct page *dest_page,
+ unsigned long start_byte,
+ size_t srclen, size_t destlen)
{
+ struct workspace *workspace = list_entry(ws, struct workspace, list);
int ret = 0;
int wbits = MAX_WBITS;
- struct workspace *workspace;
unsigned long bytes_left = destlen;
unsigned long total_out = 0;
char *kaddr;
- if (destlen > PAGE_CACHE_SIZE)
- return -ENOMEM;
-
- workspace = find_zlib_workspace();
- if (IS_ERR(workspace))
- return -ENOMEM;
-
workspace->inf_strm.next_in = data_in;
workspace->inf_strm.avail_in = srclen;
workspace->inf_strm.total_in = 0;
@@ -565,8 +336,7 @@ int btrfs_zlib_decompress(unsigned char *data_in,
if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
printk(KERN_WARNING "inflateInit failed\n");
- ret = -1;
- goto out;
+ return -1;
}
while (bytes_left > 0) {
@@ -616,12 +386,13 @@ next:
ret = 0;
zlib_inflateEnd(&workspace->inf_strm);
-out:
- free_workspace(workspace);
return ret;
}
-void btrfs_zlib_exit(void)
-{
- free_workspaces();
-}
+struct btrfs_compress_op btrfs_zlib_compress = {
+ .alloc_workspace = zlib_alloc_workspace,
+ .free_workspace = zlib_free_workspace,
+ .compress_pages = zlib_compress_pages,
+ .decompress_biovec = zlib_decompress_biovec,
+ .decompress = zlib_decompress,
+};
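
With zlib.c reduced to static functions published through this ops table, the generic code in compression.c can drive any algorithm by index instead of calling btrfs_zlib_* helpers directly. A minimal sketch of such a dispatcher, assuming an ops array like the one compression.c would keep (the array and wrapper below are illustrative, not quoted from the patch; real callers cache idle workspaces rather than allocating one per call):

static struct btrfs_compress_op *btrfs_compress_ops[] = {
	&btrfs_zlib_compress,
	/* &btrfs_lzo_compress, once fs/btrfs/lzo.c registers its ops */
};

static int compress_via_ops(int type, struct address_space *mapping,
			    u64 start, unsigned long len,
			    struct page **pages, unsigned long nr_dest_pages,
			    unsigned long *out_pages, unsigned long *total_in,
			    unsigned long *total_out, unsigned long max_out)
{
	struct btrfs_compress_op *op = btrfs_compress_ops[type];
	struct list_head *ws;
	int ret;

	ws = op->alloc_workspace();
	if (IS_ERR(ws))
		return PTR_ERR(ws);
	ret = op->compress_pages(ws, mapping, start, len, pages,
				 nr_dest_pages, out_pages, total_in,
				 total_out, max_out);
	op->free_workspace(ws);
	return ret;
}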
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
index 9e6c4f2e8ff1..bd352125e829 100644
--- a/fs/ceph/Makefile
+++ b/fs/ceph/Makefile
@@ -2,31 +2,10 @@
# Makefile for CEPH filesystem.
#
-ifneq ($(KERNELRELEASE),)
-
obj-$(CONFIG_CEPH_FS) += ceph.o
-ceph-objs := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
+ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
export.o caps.o snap.o xattr.o \
mds_client.o mdsmap.o strings.o ceph_frag.o \
debugfs.o
-else
-#Otherwise we were called directly from the command
-# line; invoke the kernel build system.
-
-KERNELDIR ?= /lib/modules/$(shell uname -r)/build
-PWD := $(shell pwd)
-
-default: all
-
-all:
- $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules
-
-modules_install:
- $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules_install
-
-clean:
- $(MAKE) -C $(KERNELDIR) M=$(PWD) clean
-
-endif
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 7ae1b3d55b58..08f65faac112 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -60,10 +60,13 @@ static int mdsc_show(struct seq_file *s, void *p)
for (rp = rb_first(&mdsc->request_tree); rp; rp = rb_next(rp)) {
req = rb_entry(rp, struct ceph_mds_request, r_node);
- if (req->r_request)
- seq_printf(s, "%lld\tmds%d\t", req->r_tid, req->r_mds);
- else
+ if (req->r_request && req->r_session)
+ seq_printf(s, "%lld\tmds%d\t", req->r_tid,
+ req->r_session->s_mds);
+ else if (!req->r_request)
seq_printf(s, "%lld\t(no request)\t", req->r_tid);
+ else
+ seq_printf(s, "%lld\t(no session)\t", req->r_tid);
seq_printf(s, "%s", ceph_mds_op_name(req->r_op));
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index fa7ca04ee816..0bc68de8edd7 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1224,6 +1224,26 @@ void ceph_dentry_lru_del(struct dentry *dn)
}
}
+/*
+ * Return name hash for a given dentry. This is dependent on
+ * the parent directory's hash function.
+ */
+unsigned ceph_dentry_hash(struct dentry *dn)
+{
+ struct inode *dir = dn->d_parent->d_inode;
+ struct ceph_inode_info *dci = ceph_inode(dir);
+
+ switch (dci->i_dir_layout.dl_dir_hash) {
+ case 0: /* for backward compat */
+ case CEPH_STR_HASH_LINUX:
+ return dn->d_name.hash;
+
+ default:
+ return ceph_str_hash(dci->i_dir_layout.dl_dir_hash,
+ dn->d_name.name, dn->d_name.len);
+ }
+}
+
const struct file_operations ceph_dir_fops = {
.read = ceph_read_dir,
.readdir = ceph_readdir,
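
ceph_dentry_hash() gives every consumer one answer for "what is this name's hash", keyed off the parent's dl_dir_hash, so export handles and MDS placement stay consistent with the directory's layout. A hedged sketch of a caller, mirroring the __choose_mds() change further down (the helper name is hypothetical):

/* Sketch: hash a name for MDS placement the way __choose_mds() does
 * after this patch; ceph_dentry_hash() falls back to the plain VFS
 * hash for directories using CEPH_STR_HASH_LINUX or predating layouts.
 */
static u32 request_name_hash(struct ceph_mds_request *req)
{
	if (req->r_dentry)
		return ceph_dentry_hash(req->r_dentry);
	return 0;	/* no dentry: caller keys on the inode instead */
}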
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 2297d9426992..e41056174bf8 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -59,7 +59,7 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
dout("encode_fh %p connectable\n", dentry);
cfh->ino = ceph_ino(dentry->d_inode);
cfh->parent_ino = ceph_ino(parent->d_inode);
- cfh->parent_name_hash = parent->d_name.hash;
+ cfh->parent_name_hash = ceph_dentry_hash(parent);
*max_len = connected_handle_length;
type = 2;
} else if (*max_len >= handle_length) {
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index e61de4f7b99d..e835eff551e3 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -297,6 +297,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_release_count = 0;
ci->i_symlink = NULL;
+ memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout));
+
ci->i_fragtree = RB_ROOT;
mutex_init(&ci->i_fragtree_mutex);
@@ -689,6 +691,8 @@ static int fill_inode(struct inode *inode,
inode->i_op = &ceph_dir_iops;
inode->i_fop = &ceph_dir_fops;
+ ci->i_dir_layout = iinfo->dir_layout;
+
ci->i_files = le64_to_cpu(info->files);
ci->i_subdirs = le64_to_cpu(info->subdirs);
ci->i_rbytes = le64_to_cpu(info->rbytes);
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index a50fca1e03be..1e30d194a8e3 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -60,7 +60,8 @@ static const struct ceph_connection_operations mds_con_ops;
* parse individual inode info
*/
static int parse_reply_info_in(void **p, void *end,
- struct ceph_mds_reply_info_in *info)
+ struct ceph_mds_reply_info_in *info,
+ int features)
{
int err = -EIO;
@@ -74,6 +75,12 @@ static int parse_reply_info_in(void **p, void *end,
info->symlink = *p;
*p += info->symlink_len;
+ if (features & CEPH_FEATURE_DIRLAYOUTHASH)
+ ceph_decode_copy_safe(p, end, &info->dir_layout,
+ sizeof(info->dir_layout), bad);
+ else
+ memset(&info->dir_layout, 0, sizeof(info->dir_layout));
+
ceph_decode_32_safe(p, end, info->xattr_len, bad);
ceph_decode_need(p, end, info->xattr_len, bad);
info->xattr_data = *p;
@@ -88,12 +95,13 @@ bad:
* target inode.
*/
static int parse_reply_info_trace(void **p, void *end,
- struct ceph_mds_reply_info_parsed *info)
+ struct ceph_mds_reply_info_parsed *info,
+ int features)
{
int err;
if (info->head->is_dentry) {
- err = parse_reply_info_in(p, end, &info->diri);
+ err = parse_reply_info_in(p, end, &info->diri, features);
if (err < 0)
goto out_bad;
@@ -114,7 +122,7 @@ static int parse_reply_info_trace(void **p, void *end,
}
if (info->head->is_target) {
- err = parse_reply_info_in(p, end, &info->targeti);
+ err = parse_reply_info_in(p, end, &info->targeti, features);
if (err < 0)
goto out_bad;
}
@@ -134,7 +142,8 @@ out_bad:
* parse readdir results
*/
static int parse_reply_info_dir(void **p, void *end,
- struct ceph_mds_reply_info_parsed *info)
+ struct ceph_mds_reply_info_parsed *info,
+ int features)
{
u32 num, i = 0;
int err;
@@ -182,7 +191,7 @@ static int parse_reply_info_dir(void **p, void *end,
*p += sizeof(struct ceph_mds_reply_lease);
/* inode */
- err = parse_reply_info_in(p, end, &info->dir_in[i]);
+ err = parse_reply_info_in(p, end, &info->dir_in[i], features);
if (err < 0)
goto out_bad;
i++;
@@ -205,7 +214,8 @@ out_bad:
* parse fcntl F_GETLK results
*/
static int parse_reply_info_filelock(void **p, void *end,
- struct ceph_mds_reply_info_parsed *info)
+ struct ceph_mds_reply_info_parsed *info,
+ int features)
{
if (*p + sizeof(*info->filelock_reply) > end)
goto bad;
@@ -225,19 +235,21 @@ bad:
* parse extra results
*/
static int parse_reply_info_extra(void **p, void *end,
- struct ceph_mds_reply_info_parsed *info)
+ struct ceph_mds_reply_info_parsed *info,
+ int features)
{
if (info->head->op == CEPH_MDS_OP_GETFILELOCK)
- return parse_reply_info_filelock(p, end, info);
+ return parse_reply_info_filelock(p, end, info, features);
else
- return parse_reply_info_dir(p, end, info);
+ return parse_reply_info_dir(p, end, info, features);
}
/*
* parse entire mds reply
*/
static int parse_reply_info(struct ceph_msg *msg,
- struct ceph_mds_reply_info_parsed *info)
+ struct ceph_mds_reply_info_parsed *info,
+ int features)
{
void *p, *end;
u32 len;
@@ -250,7 +262,7 @@ static int parse_reply_info(struct ceph_msg *msg,
/* trace */
ceph_decode_32_safe(&p, end, len, bad);
if (len > 0) {
- err = parse_reply_info_trace(&p, p+len, info);
+ err = parse_reply_info_trace(&p, p+len, info, features);
if (err < 0)
goto out_bad;
}
@@ -258,7 +270,7 @@ static int parse_reply_info(struct ceph_msg *msg,
/* extra */
ceph_decode_32_safe(&p, end, len, bad);
if (len > 0) {
- err = parse_reply_info_extra(&p, p+len, info);
+ err = parse_reply_info_extra(&p, p+len, info, features);
if (err < 0)
goto out_bad;
}
@@ -654,7 +666,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
} else {
/* dir + name */
inode = dir;
- hash = req->r_dentry->d_name.hash;
+ hash = ceph_dentry_hash(req->r_dentry);
is_hash = true;
}
}
@@ -1693,7 +1705,6 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
struct ceph_msg *msg;
int flags = 0;
- req->r_mds = mds;
req->r_attempts++;
if (req->r_inode) {
struct ceph_cap *cap =
@@ -1780,6 +1791,8 @@ static int __do_request(struct ceph_mds_client *mdsc,
goto finish;
}
+ put_request_session(req);
+
mds = __choose_mds(mdsc, req);
if (mds < 0 ||
ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) {
@@ -1797,6 +1810,8 @@ static int __do_request(struct ceph_mds_client *mdsc,
goto finish;
}
}
+ req->r_session = get_session(session);
+
dout("do_request mds%d session %p state %s\n", mds, session,
session_state_name(session->s_state));
if (session->s_state != CEPH_MDS_SESSION_OPEN &&
@@ -1809,7 +1824,6 @@ static int __do_request(struct ceph_mds_client *mdsc,
}
/* send request */
- req->r_session = get_session(session);
req->r_resend_mds = -1; /* forget any previous mds hint */
if (req->r_request_started == 0) /* note request start time */
@@ -1863,7 +1877,6 @@ static void kick_requests(struct ceph_mds_client *mdsc, int mds)
if (req->r_session &&
req->r_session->s_mds == mds) {
dout(" kicking tid %llu\n", req->r_tid);
- put_request_session(req);
__do_request(mdsc, req);
}
}
@@ -2056,8 +2069,11 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
goto out;
} else {
struct ceph_inode_info *ci = ceph_inode(req->r_inode);
- struct ceph_cap *cap =
- ceph_get_cap_for_mds(ci, req->r_mds);;
+ struct ceph_cap *cap = NULL;
+
+ if (req->r_session)
+ cap = ceph_get_cap_for_mds(ci,
+ req->r_session->s_mds);
dout("already using auth");
if ((!cap || cap != ci->i_auth_cap) ||
@@ -2101,7 +2117,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
dout("handle_reply tid %lld result %d\n", tid, result);
rinfo = &req->r_reply_info;
- err = parse_reply_info(msg, rinfo);
+ err = parse_reply_info(msg, rinfo, session->s_con.peer_features);
mutex_unlock(&mdsc->mutex);
mutex_lock(&session->s_mutex);
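
Threading the session's negotiated feature bits down to parse_reply_info_in() means a field is consumed only when the peer actually sent it, which keeps older servers working. The guarded-decode pattern in isolation, a sketch using the real ceph_decode_copy_safe() bounds-checking macro (the helper name is hypothetical):

static int decode_optional_dir_layout(void **p, void *end, int features,
				      struct ceph_dir_layout *dl)
{
	/* only consume bytes the peer advertised it would send */
	if (features & CEPH_FEATURE_DIRLAYOUTHASH)
		ceph_decode_copy_safe(p, end, dl, sizeof(*dl), bad);
	else
		memset(dl, 0, sizeof(*dl));	/* sane default for old peers */
	return 0;
bad:
	return -EIO;	/* reply truncated mid-field */
}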
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index aabe563b54db..4e3a9cc0bba6 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -35,6 +35,7 @@ struct ceph_cap;
*/
struct ceph_mds_reply_info_in {
struct ceph_mds_reply_inode *in;
+ struct ceph_dir_layout dir_layout;
u32 symlink_len;
char *symlink;
u32 xattr_len;
@@ -165,7 +166,6 @@ struct ceph_mds_request {
struct ceph_mds_client *r_mdsc;
int r_op; /* mds op code */
- int r_mds;
/* operation on what? */
struct inode *r_inode; /* arg1 */
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 08b460ae0539..bf6f0f34082a 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -428,7 +428,8 @@ struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
goto fail;
}
fsc->client->extra_mon_dispatch = extra_mon_dispatch;
- fsc->client->supported_features |= CEPH_FEATURE_FLOCK;
+ fsc->client->supported_features |= CEPH_FEATURE_FLOCK |
+ CEPH_FEATURE_DIRLAYOUTHASH;
fsc->client->monc.want_mdsmap = 1;
fsc->mount_options = fsopt;
@@ -443,13 +444,17 @@ struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
goto fail_client;
err = -ENOMEM;
- fsc->wb_wq = create_workqueue("ceph-writeback");
+ /*
+ * The number of concurrent work items can be high but they don't
+ * need to be processed in parallel; limit concurrency.
+ */
+ fsc->wb_wq = alloc_workqueue("ceph-writeback", 0, 1);
if (fsc->wb_wq == NULL)
goto fail_bdi;
- fsc->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid");
+ fsc->pg_inv_wq = alloc_workqueue("ceph-pg-invalid", 0, 1);
if (fsc->pg_inv_wq == NULL)
goto fail_wb_wq;
- fsc->trunc_wq = create_singlethread_workqueue("ceph-trunc");
+ fsc->trunc_wq = alloc_workqueue("ceph-trunc", 0, 1);
if (fsc->trunc_wq == NULL)
goto fail_pg_inv_wq;
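
create_workqueue() and create_singlethread_workqueue() spawn dedicated worker threads; alloc_workqueue(name, 0, 1) instead draws on the shared worker pool with at most one work item in flight, preserving the ordered, one-at-a-time behaviour these queues relied on. The conversion pattern in isolation (a sketch; names hypothetical):

#include <linux/workqueue.h>

static struct workqueue_struct *example_wq;

static int __init example_init(void)
{
	/* was: example_wq = create_singlethread_workqueue("example-wq"); */
	example_wq = alloc_workqueue("example-wq", 0, 1);
	if (!example_wq)
		return -ENOMEM;
	return 0;
}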
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 4553d8829edb..20b907d76ae2 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -239,6 +239,7 @@ struct ceph_inode_info {
unsigned i_ceph_flags;
unsigned long i_release_count;
+ struct ceph_dir_layout i_dir_layout;
struct ceph_file_layout i_layout;
char *i_symlink;
@@ -768,6 +769,7 @@ extern void ceph_dentry_lru_add(struct dentry *dn);
extern void ceph_dentry_lru_touch(struct dentry *dn);
extern void ceph_dentry_lru_del(struct dentry *dn);
extern void ceph_invalidate_dentry_lease(struct dentry *dentry);
+extern unsigned ceph_dentry_hash(struct dentry *dn);
/*
* our d_ops vary depending on whether the inode is live,
diff --git a/fs/char_dev.c b/fs/char_dev.c
index e5b9df993b93..dca9e5e0f73b 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -59,7 +59,7 @@ static struct char_device_struct {
} *chrdevs[CHRDEV_MAJOR_HASH_SIZE];
/* index in the above */
-static inline int major_to_index(int major)
+static inline int major_to_index(unsigned major)
{
return major % CHRDEV_MAJOR_HASH_SIZE;
}
@@ -417,18 +417,6 @@ static int chrdev_open(struct inode *inode, struct file *filp)
return ret;
}
-int cdev_index(struct inode *inode)
-{
- int idx;
- struct kobject *kobj;
-
- kobj = kobj_lookup(cdev_map, inode->i_rdev, &idx);
- if (!kobj)
- return -1;
- kobject_put(kobj);
- return idx;
-}
-
void cd_forget(struct inode *inode)
{
spin_lock(&cdev_lock);
@@ -582,7 +570,6 @@ EXPORT_SYMBOL(cdev_init);
EXPORT_SYMBOL(cdev_alloc);
EXPORT_SYMBOL(cdev_del);
EXPORT_SYMBOL(cdev_add);
-EXPORT_SYMBOL(cdev_index);
EXPORT_SYMBOL(__register_chrdev);
EXPORT_SYMBOL(__unregister_chrdev);
EXPORT_SYMBOL(directly_mappable_cdev_bdi);
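
Making major_to_index() take an unsigned argument closes off negative-modulo indexing: C99 '%' truncates toward zero, so a negative int major would produce a negative index into chrdevs[]. A small userspace illustration of the difference:

#include <stdio.h>

#define CHRDEV_MAJOR_HASH_SIZE 255

int main(void)
{
	int s = -7;			/* e.g. an unvalidated major */
	unsigned u = (unsigned)s;	/* same bits, unsigned arithmetic */

	printf("%d\n", s % CHRDEV_MAJOR_HASH_SIZE);	/* -7: out of range */
	printf("%u\n", u % CHRDEV_MAJOR_HASH_SIZE);	/* 249: in range */
	return 0;
}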
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index ede98300a8cd..65829d32128c 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -79,11 +79,11 @@ void cifs_dump_mids(struct TCP_Server_Info *server)
spin_lock(&GlobalMid_Lock);
list_for_each(tmp, &server->pending_mid_q) {
mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
- cERROR(1, "State: %d Cmd: %d Pid: %d Tsk: %p Mid %d",
+ cERROR(1, "State: %d Cmd: %d Pid: %d Cbdata: %p Mid %d",
mid_entry->midState,
(int)mid_entry->command,
mid_entry->pid,
- mid_entry->tsk,
+ mid_entry->callback_data,
mid_entry->mid);
#ifdef CONFIG_CIFS_STATS2
cERROR(1, "IsLarge: %d buf: %p time rcv: %ld now: %ld",
@@ -218,11 +218,11 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
mid_entry = list_entry(tmp3, struct mid_q_entry,
qhead);
seq_printf(m, "\tState: %d com: %d pid:"
- " %d tsk: %p mid %d\n",
+ " %d cbdata: %p mid %d\n",
mid_entry->midState,
(int)mid_entry->command,
mid_entry->pid,
- mid_entry->tsk,
+ mid_entry->callback_data,
mid_entry->mid);
}
spin_unlock(&GlobalMid_Lock);
@@ -331,7 +331,7 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v)
atomic_read(&totSmBufAllocCount));
#endif /* CONFIG_CIFS_STATS2 */
- seq_printf(m, "Operations (MIDs): %d\n", midCount.counter);
+ seq_printf(m, "Operations (MIDs): %d\n", atomic_read(&midCount));
seq_printf(m,
"\n%d session %d share reconnects\n",
tcpSesReconnectCount.counter, tconInfoReconnectCount.counter);
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index c68a056f27fd..7ed36536e754 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -255,35 +255,6 @@ static struct vfsmount *cifs_dfs_do_refmount(struct cifs_sb_info *cifs_sb,
}
-static int add_mount_helper(struct vfsmount *newmnt, struct nameidata *nd,
- struct list_head *mntlist)
-{
- /* stolen from afs code */
- int err;
-
- mntget(newmnt);
- err = do_add_mount(newmnt, &nd->path, nd->path.mnt->mnt_flags | MNT_SHRINKABLE, mntlist);
- switch (err) {
- case 0:
- path_put(&nd->path);
- nd->path.mnt = newmnt;
- nd->path.dentry = dget(newmnt->mnt_root);
- schedule_delayed_work(&cifs_dfs_automount_task,
- cifs_dfs_mountpoint_expiry_timeout);
- break;
- case -EBUSY:
- /* someone else made a mount here whilst we were busy */
- while (d_mountpoint(nd->path.dentry) &&
- follow_down(&nd->path))
- ;
- err = 0;
- default:
- mntput(newmnt);
- break;
- }
- return err;
-}
-
static void dump_referral(const struct dfs_info3_param *ref)
{
cFYI(1, "DFS: ref path: %s", ref->path_name);
@@ -293,45 +264,43 @@ static void dump_referral(const struct dfs_info3_param *ref)
ref->path_consumed);
}
-
-static void*
-cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
+/*
+ * Create a vfsmount that we can automount
+ */
+static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
{
struct dfs_info3_param *referrals = NULL;
unsigned int num_referrals = 0;
struct cifs_sb_info *cifs_sb;
struct cifsSesInfo *ses;
- char *full_path = NULL;
+ char *full_path;
int xid, i;
- int rc = 0;
- struct vfsmount *mnt = ERR_PTR(-ENOENT);
+ int rc;
+ struct vfsmount *mnt;
struct tcon_link *tlink;
cFYI(1, "in %s", __func__);
- BUG_ON(IS_ROOT(dentry));
+ BUG_ON(IS_ROOT(mntpt));
xid = GetXid();
- dput(nd->path.dentry);
- nd->path.dentry = dget(dentry);
-
/*
* The MSDFS spec states that paths in DFS referral requests and
* responses must be prefixed by a single '\' character instead of
* the double backslashes usually used in the UNC. This function
* gives us the latter, so we must adjust the result.
*/
- full_path = build_path_from_dentry(dentry);
- if (full_path == NULL) {
- rc = -ENOMEM;
- goto out_err;
- }
+ mnt = ERR_PTR(-ENOMEM);
+ full_path = build_path_from_dentry(mntpt);
+ if (full_path == NULL)
+ goto free_xid;
- cifs_sb = CIFS_SB(dentry->d_inode->i_sb);
+ cifs_sb = CIFS_SB(mntpt->d_inode->i_sb);
tlink = cifs_sb_tlink(cifs_sb);
if (IS_ERR(tlink)) {
- rc = PTR_ERR(tlink);
- goto out_err;
+ mnt = ERR_CAST(tlink);
+ goto free_full_path;
}
ses = tlink_tcon(tlink)->ses;
@@ -341,46 +310,63 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
cifs_put_tlink(tlink);
+ mnt = ERR_PTR(-ENOENT);
for (i = 0; i < num_referrals; i++) {
int len;
- dump_referral(referrals+i);
+ dump_referral(referrals + i);
/* connect to a node */
len = strlen(referrals[i].node_name);
if (len < 2) {
cERROR(1, "%s: Net Address path too short: %s",
__func__, referrals[i].node_name);
- rc = -EINVAL;
- goto out_err;
+ mnt = ERR_PTR(-EINVAL);
+ break;
}
mnt = cifs_dfs_do_refmount(cifs_sb,
full_path, referrals + i);
cFYI(1, "%s: cifs_dfs_do_refmount:%s , mnt:%p", __func__,
referrals[i].node_name, mnt);
-
- /* complete mount procedure if we accured submount */
if (!IS_ERR(mnt))
- break;
+ goto success;
}
- /* we need it cause for() above could exit without valid submount */
- rc = PTR_ERR(mnt);
- if (IS_ERR(mnt))
- goto out_err;
-
- rc = add_mount_helper(mnt, nd, &cifs_dfs_automount_list);
+ /* no valid submounts were found; return error from get_dfs_path() by
+ * preference */
+ if (rc != 0)
+ mnt = ERR_PTR(rc);
-out:
- FreeXid(xid);
+success:
free_dfs_info_array(referrals, num_referrals);
+free_full_path:
kfree(full_path);
+free_xid:
+ FreeXid(xid);
cFYI(1, "leaving %s" , __func__);
- return ERR_PTR(rc);
-out_err:
- path_put(&nd->path);
- goto out;
+ return mnt;
+}
+
+/*
+ * Attempt to automount the referral
+ */
+struct vfsmount *cifs_dfs_d_automount(struct path *path)
+{
+ struct vfsmount *newmnt;
+
+ cFYI(1, "in %s", __func__);
+
+ newmnt = cifs_dfs_do_automount(path->dentry);
+ if (IS_ERR(newmnt)) {
+ cFYI(1, "leaving %s [automount failed]" , __func__);
+ return newmnt;
+ }
+
+ mntget(newmnt); /* prevent immediate expiration */
+ mnt_set_expiry(newmnt, &cifs_dfs_automount_list);
+ schedule_delayed_work(&cifs_dfs_automount_task,
+ cifs_dfs_mountpoint_expiry_timeout);
+ cFYI(1, "leaving %s [ok]" , __func__);
+ return newmnt;
}
const struct inode_operations cifs_dfs_referral_inode_operations = {
- .follow_link = cifs_dfs_follow_mountpoint,
};
-
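
This replaces the old trick of implementing follow_link() on referral inodes: path walk now notices S_AUTOMOUNT on the inode, calls the dentry's ->d_automount op, and splices the returned vfsmount in itself. The minimal shape of such an op, as a sketch (build_submount_for() is hypothetical; the mntget/mnt_set_expiry pairing is the part cifs_dfs_d_automount() above adds so the submount can expire later):

static LIST_HEAD(example_expiry_list);

static struct vfsmount *example_d_automount(struct path *path)
{
	struct vfsmount *mnt;

	mnt = build_submount_for(path->dentry);	/* fs-specific, hypothetical */
	if (IS_ERR(mnt))
		return mnt;

	mntget(mnt);			/* hold off immediate expiry */
	mnt_set_expiry(mnt, &example_expiry_list);
	return mnt;
}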
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index a437ec391a01..1e7636b145a8 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -41,9 +41,12 @@ static struct cifs_wksid wksidarr[NUM_WK_SIDS] = {
;
-/* security id for everyone */
+/* security id for everyone/world system group */
static const struct cifs_sid sid_everyone = {
1, 1, {0, 0, 0, 0, 0, 1}, {0} };
+/* security id for Authenticated Users system group */
+static const struct cifs_sid sid_authusers = {
+ 1, 1, {0, 0, 0, 0, 0, 5}, {11} };
/* group users */
static const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {} };
@@ -365,7 +368,7 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
if (num_aces > 0) {
umode_t user_mask = S_IRWXU;
umode_t group_mask = S_IRWXG;
- umode_t other_mask = S_IRWXO;
+ umode_t other_mask = S_IRWXU | S_IRWXG | S_IRWXO;
ppace = kmalloc(num_aces * sizeof(struct cifs_ace *),
GFP_KERNEL);
@@ -390,6 +393,12 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
ppace[i]->type,
&fattr->cf_mode,
&other_mask);
+ if (compare_sids(&(ppace[i]->sid), &sid_authusers))
+ access_flags_to_mode(ppace[i]->access_req,
+ ppace[i]->type,
+ &fattr->cf_mode,
+ &other_mask);
+
/* memcpy((void *)(&(cifscred->aces[i])),
(void *)ppace[i],
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 5e7075d5f139..99d777a03dd0 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -77,7 +77,11 @@ unsigned int cifs_max_pending = CIFS_MAX_REQ;
module_param(cifs_max_pending, int, 0);
MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server. "
"Default: 50 Range: 2 to 256");
-
+unsigned short echo_retries = 5;
+module_param(echo_retries, ushort, 0644);
+MODULE_PARM_DESC(echo_retries, "Number of echo attempts before giving up and "
+ "reconnecting server. Default: 5. 0 means "
+ "never reconnect.");
extern mempool_t *cifs_sm_req_poolp;
extern mempool_t *cifs_req_poolp;
extern mempool_t *cifs_mid_poolp;
@@ -174,6 +178,12 @@ cifs_read_super(struct super_block *sb, void *data,
goto out_no_root;
}
+ /* do that *after* d_alloc_root() - we want NULL ->d_op for root here */
+ if (cifs_sb_master_tcon(cifs_sb)->nocase)
+ sb->s_d_op = &cifs_ci_dentry_ops;
+ else
+ sb->s_d_op = &cifs_dentry_ops;
+
#ifdef CONFIG_CIFS_EXPERIMENTAL
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
cFYI(1, "export ops supported");
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 897b2b2b28b5..4739a531cded 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -93,6 +93,12 @@ extern int cifs_readdir(struct file *file, void *direntry, filldir_t filldir);
extern const struct dentry_operations cifs_dentry_ops;
extern const struct dentry_operations cifs_ci_dentry_ops;
+#ifdef CONFIG_CIFS_DFS_UPCALL
+extern struct vfsmount *cifs_dfs_d_automount(struct path *path);
+#else
+#define cifs_dfs_d_automount NULL
+#endif
+
/* Functions related to symlinks */
extern void *cifs_follow_link(struct dentry *direntry, struct nameidata *nd);
extern void cifs_put_link(struct dentry *direntry,
@@ -112,5 +118,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
extern const struct export_operations cifs_export_ops;
#endif /* EXPERIMENTAL */
-#define CIFS_VERSION "1.68"
+#define CIFS_VERSION "1.69"
#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 606ca8bb7102..571132c95231 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -218,6 +218,7 @@ struct TCP_Server_Info {
bool sec_kerberosu2u; /* supports U2U Kerberos */
bool sec_ntlmssp; /* supports NTLMSSP */
bool session_estab; /* mark when very first sess is established */
+ struct delayed_work echo; /* echo ping workqueue job */
#ifdef CONFIG_CIFS_FSCACHE
struct fscache_cookie *fscache; /* client index cache cookie */
#endif
@@ -508,6 +509,18 @@ static inline void cifs_stats_bytes_read(struct cifsTconInfo *tcon,
#endif
+struct mid_q_entry;
+
+/*
+ * This is the prototype for the mid callback function. When creating one,
+ * take special care to avoid deadlocks. Things to bear in mind:
+ *
+ * - it will be called by cifsd
+ * - the GlobalMid_Lock will be held
+ * - the mid will be removed from the pending_mid_q list
+ */
+typedef void (mid_callback_t)(struct mid_q_entry *mid);
+
/* one of these for every pending CIFS request to the server */
struct mid_q_entry {
struct list_head qhead; /* mids waiting on reply from this server */
@@ -519,7 +532,8 @@ struct mid_q_entry {
unsigned long when_sent; /* time when smb send finished */
unsigned long when_received; /* when demux complete (taken off wire) */
#endif
- struct task_struct *tsk; /* task waiting for response */
+ mid_callback_t *callback; /* call completion callback */
+ void *callback_data; /* general purpose pointer for callback */
struct smb_hdr *resp_buf; /* response buffer */
int midState; /* wish this were enum but can not pass to wait_event */
__u8 command; /* smb command code */
@@ -622,12 +636,9 @@ static inline void free_dfs_info_array(struct dfs_info3_param *param,
#define CIFS_IOVEC 4 /* array of response buffers */
/* Type of Request to SendReceive2 */
-#define CIFS_STD_OP 0 /* normal request timeout */
-#define CIFS_LONG_OP 1 /* long op (up to 45 sec, oplock time) */
-#define CIFS_VLONG_OP 2 /* sloow op - can take up to 180 seconds */
-#define CIFS_BLOCKING_OP 4 /* operation can block */
-#define CIFS_ASYNC_OP 8 /* do not wait for response */
-#define CIFS_TIMEOUT_MASK 0x00F /* only one of 5 above set in req */
+#define CIFS_BLOCKING_OP 1 /* operation can block */
+#define CIFS_ASYNC_OP 2 /* do not wait for response */
+#define CIFS_TIMEOUT_MASK 0x003 /* only one of above set in req */
#define CIFS_LOG_ERROR 0x010 /* log NT STATUS if non-zero */
#define CIFS_LARGE_BUF_OP 0x020 /* large request buffer */
#define CIFS_NO_RESP 0x040 /* no response buffer required */
@@ -790,6 +801,9 @@ GLOBAL_EXTERN unsigned int cifs_min_rcv; /* min size of big ntwrk buf pool */
GLOBAL_EXTERN unsigned int cifs_min_small; /* min size of small buf pool */
GLOBAL_EXTERN unsigned int cifs_max_pending; /* MAX requests at once to server*/
+/* reconnect after this many failed echo attempts */
+GLOBAL_EXTERN unsigned short echo_retries;
+
void cifs_oplock_break(struct work_struct *work);
void cifs_oplock_break_get(struct cifsFileInfo *cfile);
void cifs_oplock_break_put(struct cifsFileInfo *cfile);
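
The callback/callback_data pair generalises the old tsk pointer: the demultiplex thread no longer has to wake a sleeping task, it just invokes whatever completion the submitter registered, which is what makes fire-and-forget requests like the echo possible. A hedged sketch of the synchronous case re-expressed as a callback (the real SendReceive plumbing differs in detail):

/* Sketch: a "wake the waiter" completion, the synchronous counterpart
 * of cifs_echo_callback(). Per the constraints above, this runs in
 * cifsd with GlobalMid_Lock held and the mid already unlinked from
 * pending_mid_q, so it must not sleep or retake that lock.
 */
static void wake_waiter_callback(struct mid_q_entry *mid)
{
	struct task_struct *tsk = mid->callback_data;

	wake_up_process(tsk);	/* the waiter disposes of the mid */
}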
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index de36b09763a8..ea205b4fcad2 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -50,6 +50,7 @@
#define SMB_COM_SETATTR 0x09 /* trivial response */
#define SMB_COM_LOCKING_ANDX 0x24 /* trivial response */
#define SMB_COM_COPY 0x29 /* trivial rsp, fail filename ignrd*/
+#define SMB_COM_ECHO 0x2B /* echo request */
#define SMB_COM_OPEN_ANDX 0x2D /* Legacy open for old servers */
#define SMB_COM_READ_ANDX 0x2E
#define SMB_COM_WRITE_ANDX 0x2F
@@ -760,6 +761,20 @@ typedef struct smb_com_tconx_rsp_ext {
*
*/
+typedef struct smb_com_echo_req {
+ struct smb_hdr hdr;
+ __le16 EchoCount;
+ __le16 ByteCount;
+ char Data[1];
+} __attribute__((packed)) ECHO_REQ;
+
+typedef struct smb_com_echo_rsp {
+ struct smb_hdr hdr;
+ __le16 SequenceNumber;
+ __le16 ByteCount;
+ char Data[1];
+} __attribute__((packed)) ECHO_RSP;
+
typedef struct smb_com_logoff_andx_req {
struct smb_hdr hdr; /* wct = 2 */
__u8 AndXCommand;
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index e6d1481b16c1..982895fa7615 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -61,6 +61,12 @@ extern char *cifs_compose_mount_options(const char *sb_mountdata,
const char *fullpath, const struct dfs_info3_param *ref,
char **devname);
/* extern void renew_parental_timestamps(struct dentry *direntry);*/
+extern struct mid_q_entry *AllocMidQEntry(const struct smb_hdr *smb_buffer,
+ struct TCP_Server_Info *server);
+extern void DeleteMidQEntry(struct mid_q_entry *midEntry);
+extern int cifs_call_async(struct TCP_Server_Info *server,
+ struct smb_hdr *in_buf, mid_callback_t *callback,
+ void *cbdata);
extern int SendReceive(const unsigned int /* xid */ , struct cifsSesInfo *,
struct smb_hdr * /* input */ ,
struct smb_hdr * /* out */ ,
@@ -347,12 +353,13 @@ extern int CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
const __u16 netfid, const __u64 len,
const __u64 offset, const __u32 numUnlock,
const __u32 numLock, const __u8 lockType,
- const bool waitFlag);
+ const bool waitFlag, const __u8 oplock_level);
extern int CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
const __u16 smb_file_id, const int get_flag,
const __u64 len, struct file_lock *,
const __u16 lock_type, const bool waitFlag);
extern int CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon);
+extern int CIFSSMBEcho(struct TCP_Server_Info *server);
extern int CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses);
extern struct cifsSesInfo *sesInfoAlloc(void);
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 2f6795e524d3..37113450757b 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -706,6 +706,53 @@ CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon)
return rc;
}
+/*
+ * This is a no-op for now. We're not really interested in the reply, but
+ * rather in the fact that the server sent one and that server->lstrp
+ * gets updated.
+ *
+ * FIXME: maybe we should consider checking that the reply matches the request?
+ */
+static void
+cifs_echo_callback(struct mid_q_entry *mid)
+{
+ struct TCP_Server_Info *server = mid->callback_data;
+
+ DeleteMidQEntry(mid);
+ atomic_dec(&server->inFlight);
+ wake_up(&server->request_q);
+}
+
+int
+CIFSSMBEcho(struct TCP_Server_Info *server)
+{
+ ECHO_REQ *smb;
+ int rc = 0;
+
+ cFYI(1, "In echo request");
+
+ rc = small_smb_init(SMB_COM_ECHO, 0, NULL, (void **)&smb);
+ if (rc)
+ return rc;
+
+ /* set up echo request */
+ smb->hdr.Tid = cpu_to_le16(0xffff);
+ smb->hdr.WordCount = 1; /* WordCount is a single byte, no endian conversion */
+ smb->EchoCount = cpu_to_le16(1);
+ smb->ByteCount = cpu_to_le16(1);
+ smb->Data[0] = 'a';
+ smb->hdr.smb_buf_length += 3;
+
+ rc = cifs_call_async(server, (struct smb_hdr *)smb,
+ cifs_echo_callback, server);
+ if (rc)
+ cFYI(1, "Echo request failed: %d", rc);
+
+ cifs_small_buf_release(smb);
+
+ return rc;
+}
+
int
CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses)
{
@@ -1193,7 +1240,7 @@ OldOpenRetry:
pSMB->ByteCount = cpu_to_le16(count);
/* long_op set to 1 to allow for oplock break timeouts */
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
- (struct smb_hdr *)pSMBr, &bytes_returned, CIFS_LONG_OP);
+ (struct smb_hdr *)pSMBr, &bytes_returned, 0);
cifs_stats_inc(&tcon->num_opens);
if (rc) {
cFYI(1, "Error in Open = %d", rc);
@@ -1306,7 +1353,7 @@ openRetry:
pSMB->ByteCount = cpu_to_le16(count);
/* long_op set to 1 to allow for oplock break timeouts */
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
- (struct smb_hdr *)pSMBr, &bytes_returned, CIFS_LONG_OP);
+ (struct smb_hdr *)pSMBr, &bytes_returned, 0);
cifs_stats_inc(&tcon->num_opens);
if (rc) {
cFYI(1, "Error in Open = %d", rc);
@@ -1388,7 +1435,7 @@ CIFSSMBRead(const int xid, struct cifsTconInfo *tcon, const int netfid,
iov[0].iov_base = (char *)pSMB;
iov[0].iov_len = pSMB->hdr.smb_buf_length + 4;
rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovecs */,
- &resp_buf_type, CIFS_STD_OP | CIFS_LOG_ERROR);
+ &resp_buf_type, CIFS_LOG_ERROR);
cifs_stats_inc(&tcon->num_reads);
pSMBr = (READ_RSP *)iov[0].iov_base;
if (rc) {
@@ -1663,7 +1710,8 @@ int
CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
const __u16 smb_file_id, const __u64 len,
const __u64 offset, const __u32 numUnlock,
- const __u32 numLock, const __u8 lockType, const bool waitFlag)
+ const __u32 numLock, const __u8 lockType,
+ const bool waitFlag, const __u8 oplock_level)
{
int rc = 0;
LOCK_REQ *pSMB = NULL;
@@ -1691,6 +1739,7 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
pSMB->NumberOfLocks = cpu_to_le16(numLock);
pSMB->NumberOfUnlocks = cpu_to_le16(numUnlock);
pSMB->LockType = lockType;
+ pSMB->OplockLevel = oplock_level;
pSMB->AndXCommand = 0xFF; /* none */
pSMB->Fid = smb_file_id; /* netfid stays le */
@@ -3087,7 +3136,7 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
iov[0].iov_len = pSMB->hdr.smb_buf_length + 4;
rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovec */, &buf_type,
- CIFS_STD_OP);
+ 0);
cifs_stats_inc(&tcon->num_acl_get);
if (rc) {
cFYI(1, "Send error in QuerySecDesc = %d", rc);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index a65d311d163a..8d4657596301 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -52,6 +52,9 @@
#define CIFS_PORT 445
#define RFC1001_PORT 139
+/* SMB echo "timeout" -- FIXME: tunable? */
+#define SMB_ECHO_INTERVAL (60 * HZ)
+
extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8,
unsigned char *p24);
@@ -152,6 +155,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
/* before reconnecting the tcp session, mark the smb session (uid)
and the tid bad so they are not used until reconnected */
+ cFYI(1, "%s: marking sessions and tcons for reconnect", __func__);
spin_lock(&cifs_tcp_ses_lock);
list_for_each(tmp, &server->smb_ses_list) {
ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list);
@@ -163,7 +167,9 @@ cifs_reconnect(struct TCP_Server_Info *server)
}
}
spin_unlock(&cifs_tcp_ses_lock);
+
/* do not want to be sending data on a socket we are freeing */
+ cFYI(1, "%s: tearing down socket", __func__);
mutex_lock(&server->srv_mutex);
if (server->ssocket) {
cFYI(1, "State: 0x%x Flags: 0x%lx", server->ssocket->state,
@@ -180,22 +186,20 @@ cifs_reconnect(struct TCP_Server_Info *server)
kfree(server->session_key.response);
server->session_key.response = NULL;
server->session_key.len = 0;
+ server->lstrp = jiffies;
+ mutex_unlock(&server->srv_mutex);
+ /* mark submitted MIDs for retry and issue callback */
+ cFYI(1, "%s: issuing mid callbacks", __func__);
spin_lock(&GlobalMid_Lock);
- list_for_each(tmp, &server->pending_mid_q) {
- mid_entry = list_entry(tmp, struct
- mid_q_entry,
- qhead);
- if (mid_entry->midState == MID_REQUEST_SUBMITTED) {
- /* Mark other intransit requests as needing
- retry so we do not immediately mark the
- session bad again (ie after we reconnect
- below) as they timeout too */
+ list_for_each_safe(tmp, tmp2, &server->pending_mid_q) {
+ mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
+ if (mid_entry->midState == MID_REQUEST_SUBMITTED)
mid_entry->midState = MID_RETRY_NEEDED;
- }
+ list_del_init(&mid_entry->qhead);
+ mid_entry->callback(mid_entry);
}
spin_unlock(&GlobalMid_Lock);
- mutex_unlock(&server->srv_mutex);
while ((server->tcpStatus != CifsExiting) &&
(server->tcpStatus != CifsGood)) {
@@ -212,10 +216,9 @@ cifs_reconnect(struct TCP_Server_Info *server)
if (server->tcpStatus != CifsExiting)
server->tcpStatus = CifsGood;
spin_unlock(&GlobalMid_Lock);
- /* atomic_set(&server->inFlight,0);*/
- wake_up(&server->response_q);
}
}
+
return rc;
}
@@ -334,6 +337,26 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
}
+static void
+cifs_echo_request(struct work_struct *work)
+{
+ int rc;
+ struct TCP_Server_Info *server = container_of(work,
+ struct TCP_Server_Info, echo.work);
+
+ /* no need to ping if we got a response recently */
+ if (time_before(jiffies, server->lstrp + SMB_ECHO_INTERVAL - HZ))
+ goto requeue_echo;
+
+ rc = CIFSSMBEcho(server);
+ if (rc)
+ cFYI(1, "Unable to send echo request to server: %s",
+ server->hostname);
+
+requeue_echo:
+ queue_delayed_work(system_nrt_wq, &server->echo, SMB_ECHO_INTERVAL);
+}
+
static int
cifs_demultiplex_thread(struct TCP_Server_Info *server)
{
@@ -345,8 +368,7 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
struct msghdr smb_msg;
struct kvec iov;
struct socket *csocket = server->ssocket;
- struct list_head *tmp;
- struct cifsSesInfo *ses;
+ struct list_head *tmp, *tmp2;
struct task_struct *task_to_wake = NULL;
struct mid_q_entry *mid_entry;
char temp;
@@ -399,7 +421,20 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
smb_msg.msg_control = NULL;
smb_msg.msg_controllen = 0;
pdu_length = 4; /* enough to get RFC1001 header */
+
incomplete_rcv:
+ if (echo_retries > 0 &&
+ time_after(jiffies, server->lstrp +
+ (echo_retries * SMB_ECHO_INTERVAL))) {
+ cERROR(1, "Server %s has not responded in %d seconds. "
+ "Reconnecting...", server->hostname,
+ (echo_retries * SMB_ECHO_INTERVAL / HZ));
+ cifs_reconnect(server);
+ csocket = server->ssocket;
+ wake_up(&server->response_q);
+ continue;
+ }
+
length =
kernel_recvmsg(csocket, &smb_msg,
&iov, 1, pdu_length, 0 /* BB other flags? */);
@@ -559,10 +594,11 @@ incomplete_rcv:
continue;
}
+ mid_entry = NULL;
+ server->lstrp = jiffies;
- task_to_wake = NULL;
spin_lock(&GlobalMid_Lock);
- list_for_each(tmp, &server->pending_mid_q) {
+ list_for_each_safe(tmp, tmp2, &server->pending_mid_q) {
mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
if ((mid_entry->mid == smb_buffer->Mid) &&
@@ -603,20 +639,19 @@ incomplete_rcv:
mid_entry->resp_buf = smb_buffer;
mid_entry->largeBuf = isLargeBuf;
multi_t2_fnd:
- task_to_wake = mid_entry->tsk;
mid_entry->midState = MID_RESPONSE_RECEIVED;
+ list_del_init(&mid_entry->qhead);
+ mid_entry->callback(mid_entry);
#ifdef CONFIG_CIFS_STATS2
mid_entry->when_received = jiffies;
#endif
- /* so we do not time out requests to server
- which is still responding (since server could
- be busy but not dead) */
- server->lstrp = jiffies;
break;
}
+ mid_entry = NULL;
}
spin_unlock(&GlobalMid_Lock);
- if (task_to_wake) {
+
+ if (mid_entry != NULL) {
/* Was previous buf put in mpx struct for multi-rsp? */
if (!isMultiRsp) {
/* smb buffer will be freed by user thread */
@@ -625,11 +660,10 @@ multi_t2_fnd:
else
smallbuf = NULL;
}
- wake_up_process(task_to_wake);
} else if (!is_valid_oplock_break(smb_buffer, server) &&
!isMultiRsp) {
cERROR(1, "No task to wake, unknown frame received! "
- "NumMids %d", midCount.counter);
+ "NumMids %d", atomic_read(&midCount));
cifs_dump_mem("Received Data is: ", (char *)smb_buffer,
sizeof(struct smb_hdr));
#ifdef CONFIG_CIFS_DEBUG2
@@ -677,44 +711,16 @@ multi_t2_fnd:
if (smallbuf) /* no sense logging a debug message if NULL */
cifs_small_buf_release(smallbuf);
- /*
- * BB: we shouldn't have to do any of this. It shouldn't be
- * possible to exit from the thread with active SMB sessions
- */
- spin_lock(&cifs_tcp_ses_lock);
- if (list_empty(&server->pending_mid_q)) {
- /* loop through server session structures attached to this and
- mark them dead */
- list_for_each(tmp, &server->smb_ses_list) {
- ses = list_entry(tmp, struct cifsSesInfo,
- smb_ses_list);
- ses->status = CifsExiting;
- ses->server = NULL;
- }
- spin_unlock(&cifs_tcp_ses_lock);
- } else {
- /* although we can not zero the server struct pointer yet,
- since there are active requests which may depnd on them,
- mark the corresponding SMB sessions as exiting too */
- list_for_each(tmp, &server->smb_ses_list) {
- ses = list_entry(tmp, struct cifsSesInfo,
- smb_ses_list);
- ses->status = CifsExiting;
- }
-
+ if (!list_empty(&server->pending_mid_q)) {
spin_lock(&GlobalMid_Lock);
- list_for_each(tmp, &server->pending_mid_q) {
- mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
- if (mid_entry->midState == MID_REQUEST_SUBMITTED) {
- cFYI(1, "Clearing Mid 0x%x - waking up ",
+ list_for_each_safe(tmp, tmp2, &server->pending_mid_q) {
+ mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
+ cFYI(1, "Clearing Mid 0x%x - issuing callback",
mid_entry->mid);
- task_to_wake = mid_entry->tsk;
- if (task_to_wake)
- wake_up_process(task_to_wake);
- }
+ list_del_init(&mid_entry->qhead);
+ mid_entry->callback(mid_entry);
}
spin_unlock(&GlobalMid_Lock);
- spin_unlock(&cifs_tcp_ses_lock);
/* 1/8th of sec is more than enough time for them to exit */
msleep(125);
}
@@ -732,18 +738,6 @@ multi_t2_fnd:
coming home not much else we can do but free the memory */
}
- /* last chance to mark ses pointers invalid
- if there are any pointing to this (e.g
- if a crazy root user tried to kill cifsd
- kernel thread explicitly this might happen) */
- /* BB: This shouldn't be necessary, see above */
- spin_lock(&cifs_tcp_ses_lock);
- list_for_each(tmp, &server->smb_ses_list) {
- ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list);
- ses->server = NULL;
- }
- spin_unlock(&cifs_tcp_ses_lock);
-
kfree(server->hostname);
task_to_wake = xchg(&server->tsk, NULL);
kfree(server);
@@ -1113,6 +1107,8 @@ cifs_parse_mount_options(char *options, const char *devname,
} else if (!strnicmp(data, "uid", 3) && value && *value) {
vol->linux_uid = simple_strtoul(value, &value, 0);
uid_specified = true;
+ } else if (!strnicmp(data, "cruid", 5) && value && *value) {
+ vol->cred_uid = simple_strtoul(value, &value, 0);
} else if (!strnicmp(data, "forceuid", 8)) {
override_uid = 1;
} else if (!strnicmp(data, "noforceuid", 10)) {
@@ -1610,6 +1606,8 @@ cifs_put_tcp_session(struct TCP_Server_Info *server)
list_del_init(&server->tcp_ses_list);
spin_unlock(&cifs_tcp_ses_lock);
+ cancel_delayed_work_sync(&server->echo);
+
spin_lock(&GlobalMid_Lock);
server->tcpStatus = CifsExiting;
spin_unlock(&GlobalMid_Lock);
@@ -1699,8 +1697,10 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
volume_info->target_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL);
tcp_ses->session_estab = false;
tcp_ses->sequence_number = 0;
+ tcp_ses->lstrp = jiffies;
INIT_LIST_HEAD(&tcp_ses->tcp_ses_list);
INIT_LIST_HEAD(&tcp_ses->smb_ses_list);
+ INIT_DELAYED_WORK(&tcp_ses->echo, cifs_echo_request);
/*
* at this point we are the only ones with the pointer
@@ -1749,6 +1749,9 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
cifs_fscache_get_client_cookie(tcp_ses);
+ /* queue echo request delayed work */
+ queue_delayed_work(system_nrt_wq, &tcp_ses->echo, SMB_ECHO_INTERVAL);
+
return tcp_ses;
out_err_crypto_release:
@@ -2963,7 +2966,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
bcc_ptr++; /* skip password */
/* already aligned so no need to do it below */
} else {
- pSMB->PasswordLength = cpu_to_le16(CIFS_SESS_KEY_SIZE);
+ pSMB->PasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE);
/* BB FIXME add code to fail this if NTLMv2 or Kerberos
specified as required (when that support is added to
the vfs in the future) as only NTLM or the much
@@ -2981,7 +2984,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
#endif /* CIFS_WEAK_PW_HASH */
SMBNTencrypt(tcon->password, ses->server->cryptkey, bcc_ptr);
- bcc_ptr += CIFS_SESS_KEY_SIZE;
+ bcc_ptr += CIFS_AUTH_RESP_SIZE;
if (ses->capabilities & CAP_UNICODE) {
/* must align unicode strings */
*bcc_ptr = 0; /* null byte password */
@@ -3019,7 +3022,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
pSMB->ByteCount = cpu_to_le16(count);
rc = SendReceive(xid, ses, smb_buffer, smb_buffer_response, &length,
- CIFS_STD_OP);
+ 0);
/* above now done in SendReceive */
if ((rc == 0) && (tcon != NULL)) {
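
Taken together, the two jiffies checks in connect.c define the failure model: cifs_echo_request() pings unless something arrived within the last interval (minus a second of slack), and the receive loop declares the server dead once lstrp is more than echo_retries intervals old; with the defaults that is 5 * 60s, roughly five minutes of silence. The watchdog arithmetic in isolation (a sketch):

#include <linux/jiffies.h>

#define SMB_ECHO_INTERVAL (60 * HZ)	/* as defined in connect.c */

static bool server_needs_reconnect(unsigned long lstrp,
				   unsigned short echo_retries)
{
	/* echo_retries == 0 means "never give up" per the module param */
	return echo_retries > 0 &&
	       time_after(jiffies, lstrp + echo_retries * SMB_ECHO_INTERVAL);
}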
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 2e773825835e..dd5f22918c33 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -130,17 +130,6 @@ cifs_bp_rename_retry:
return full_path;
}
-static void setup_cifs_dentry(struct cifsTconInfo *tcon,
- struct dentry *direntry,
- struct inode *newinode)
-{
- if (tcon->nocase)
- d_set_d_op(direntry, &cifs_ci_dentry_ops);
- else
- d_set_d_op(direntry, &cifs_dentry_ops);
- d_instantiate(direntry, newinode);
-}
-
/* Inode operations in similar order to how they appear in Linux file fs.h */
int
@@ -327,7 +316,7 @@ cifs_create_get_file_info:
cifs_create_set_dentry:
if (rc == 0)
- setup_cifs_dentry(tcon, direntry, newinode);
+ d_instantiate(direntry, newinode);
else
cFYI(1, "Create worked, get_inode_info failed rc = %d", rc);
@@ -418,10 +407,6 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
rc = cifs_get_inode_info_unix(&newinode, full_path,
inode->i_sb, xid);
- if (pTcon->nocase)
- d_set_d_op(direntry, &cifs_ci_dentry_ops);
- else
- d_set_d_op(direntry, &cifs_dentry_ops);
if (rc == 0)
d_instantiate(direntry, newinode);
@@ -601,10 +586,6 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
parent_dir_inode->i_sb, xid, NULL);
if ((rc == 0) && (newInode != NULL)) {
- if (pTcon->nocase)
- d_set_d_op(direntry, &cifs_ci_dentry_ops);
- else
- d_set_d_op(direntry, &cifs_dentry_ops);
d_add(direntry, newInode);
if (posix_open) {
filp = lookup_instantiate_filp(nd, direntry,
@@ -631,10 +612,6 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
} else if (rc == -ENOENT) {
rc = 0;
direntry->d_time = jiffies;
- if (pTcon->nocase)
- d_set_d_op(direntry, &cifs_ci_dentry_ops);
- else
- d_set_d_op(direntry, &cifs_dentry_ops);
d_add(direntry, NULL);
/* if it was once a directory (but how can we tell?) we could do
shrink_dcache_parent(direntry); */
@@ -698,6 +675,7 @@ cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd)
const struct dentry_operations cifs_dentry_ops = {
.d_revalidate = cifs_d_revalidate,
+ .d_automount = cifs_dfs_d_automount,
/* d_delete: cifs_d_delete, */ /* not needed except for debugging */
};
@@ -734,4 +712,5 @@ const struct dentry_operations cifs_ci_dentry_ops = {
.d_revalidate = cifs_d_revalidate,
.d_hash = cifs_ci_hash,
.d_compare = cifs_ci_compare,
+ .d_automount = cifs_dfs_d_automount,
};
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index d843631c028d..bd2a028af833 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -726,12 +726,12 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
/* BB we could chain these into one lock request BB */
rc = CIFSSMBLock(xid, tcon, netfid, length, pfLock->fl_start,
- 0, 1, lockType, 0 /* wait flag */ );
+ 0, 1, lockType, 0 /* wait flag */, 0);
if (rc == 0) {
rc = CIFSSMBLock(xid, tcon, netfid, length,
pfLock->fl_start, 1 /* numUnlock */ ,
0 /* numLock */ , lockType,
- 0 /* wait flag */ );
+ 0 /* wait flag */, 0);
pfLock->fl_type = F_UNLCK;
if (rc != 0)
cERROR(1, "Error unlocking previously locked "
@@ -748,13 +748,13 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
rc = CIFSSMBLock(xid, tcon, netfid, length,
pfLock->fl_start, 0, 1,
lockType | LOCKING_ANDX_SHARED_LOCK,
- 0 /* wait flag */);
+ 0 /* wait flag */, 0);
if (rc == 0) {
rc = CIFSSMBLock(xid, tcon, netfid,
length, pfLock->fl_start, 1, 0,
lockType |
LOCKING_ANDX_SHARED_LOCK,
- 0 /* wait flag */);
+ 0 /* wait flag */, 0);
pfLock->fl_type = F_RDLCK;
if (rc != 0)
cERROR(1, "Error unlocking "
@@ -797,8 +797,8 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
if (numLock) {
rc = CIFSSMBLock(xid, tcon, netfid, length,
- pfLock->fl_start,
- 0, numLock, lockType, wait_flag);
+ pfLock->fl_start, 0, numLock, lockType,
+ wait_flag, 0);
if (rc == 0) {
/* For Windows locks we must store them. */
@@ -818,9 +818,9 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
(pfLock->fl_start + length) >=
(li->offset + li->length)) {
stored_rc = CIFSSMBLock(xid, tcon,
- netfid,
- li->length, li->offset,
- 1, 0, li->type, false);
+ netfid, li->length,
+ li->offset, 1, 0,
+ li->type, false, 0);
if (stored_rc)
rc = stored_rc;
else {
@@ -839,29 +839,6 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
return rc;
}
-/*
- * Set the timeout on write requests past EOF. For some servers (Windows)
- * these calls can be very long.
- *
- * If we're writing >10M past the EOF we give a 180s timeout. Anything less
- * than that gets a 45s timeout. Writes not past EOF get 15s timeouts.
- * The 10M cutoff is totally arbitrary. A better scheme for this would be
- * welcome if someone wants to suggest one.
- *
- * We may be able to do a better job with this if there were some way to
- * declare that a file should be sparse.
- */
-static int
-cifs_write_timeout(struct cifsInodeInfo *cifsi, loff_t offset)
-{
- if (offset <= cifsi->server_eof)
- return CIFS_STD_OP;
- else if (offset > (cifsi->server_eof + (10 * 1024 * 1024)))
- return CIFS_VLONG_OP;
- else
- return CIFS_LONG_OP;
-}
-
/* update the file size (if needed) after a write */
static void
cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
@@ -882,7 +859,7 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
unsigned int total_written;
struct cifs_sb_info *cifs_sb;
struct cifsTconInfo *pTcon;
- int xid, long_op;
+ int xid;
struct cifsFileInfo *open_file;
struct cifsInodeInfo *cifsi = CIFS_I(inode);
@@ -903,7 +880,6 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
xid = GetXid();
- long_op = cifs_write_timeout(cifsi, *poffset);
for (total_written = 0; write_size > total_written;
total_written += bytes_written) {
rc = -EAGAIN;
@@ -931,7 +907,7 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
min_t(const int, cifs_sb->wsize,
write_size - total_written),
*poffset, &bytes_written,
- NULL, write_data + total_written, long_op);
+ NULL, write_data + total_written, 0);
}
if (rc || (bytes_written == 0)) {
if (total_written)
@@ -944,8 +920,6 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
cifs_update_eof(cifsi, *poffset, bytes_written);
*poffset += bytes_written;
}
- long_op = CIFS_STD_OP; /* subsequent writes fast -
- 15 seconds is plenty */
}
cifs_stats_bytes_written(pTcon, total_written);
@@ -974,7 +948,7 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file,
unsigned int total_written;
struct cifs_sb_info *cifs_sb;
struct cifsTconInfo *pTcon;
- int xid, long_op;
+ int xid;
struct dentry *dentry = open_file->dentry;
struct cifsInodeInfo *cifsi = CIFS_I(dentry->d_inode);
@@ -987,7 +961,6 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file,
xid = GetXid();
- long_op = cifs_write_timeout(cifsi, *poffset);
for (total_written = 0; write_size > total_written;
total_written += bytes_written) {
rc = -EAGAIN;
@@ -1017,7 +990,7 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file,
rc = CIFSSMBWrite2(xid, pTcon,
open_file->netfid, len,
*poffset, &bytes_written,
- iov, 1, long_op);
+ iov, 1, 0);
} else
rc = CIFSSMBWrite(xid, pTcon,
open_file->netfid,
@@ -1025,7 +998,7 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file,
write_size - total_written),
*poffset, &bytes_written,
write_data + total_written,
- NULL, long_op);
+ NULL, 0);
}
if (rc || (bytes_written == 0)) {
if (total_written)
@@ -1038,8 +1011,6 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file,
cifs_update_eof(cifsi, *poffset, bytes_written);
*poffset += bytes_written;
}
- long_op = CIFS_STD_OP; /* subsequent writes fast -
- 15 seconds is plenty */
}
cifs_stats_bytes_written(pTcon, total_written);
@@ -1239,7 +1210,7 @@ static int cifs_writepages(struct address_space *mapping,
struct pagevec pvec;
int rc = 0;
int scanned = 0;
- int xid, long_op;
+ int xid;
cifs_sb = CIFS_SB(mapping->host->i_sb);
@@ -1377,43 +1348,67 @@ retry:
break;
}
if (n_iov) {
+retry_write:
open_file = find_writable_file(CIFS_I(mapping->host),
false);
if (!open_file) {
cERROR(1, "No writable handles for inode");
rc = -EBADF;
} else {
- long_op = cifs_write_timeout(cifsi, offset);
rc = CIFSSMBWrite2(xid, tcon, open_file->netfid,
bytes_to_write, offset,
&bytes_written, iov, n_iov,
- long_op);
+ 0);
cifsFileInfo_put(open_file);
- cifs_update_eof(cifsi, offset, bytes_written);
}
- if (rc || bytes_written < bytes_to_write) {
- cERROR(1, "Write2 ret %d, wrote %d",
- rc, bytes_written);
- mapping_set_error(mapping, rc);
- } else {
+ cFYI(1, "Write2 rc=%d, wrote=%u", rc, bytes_written);
+
+ /*
+ * For now, treat a short write as if nothing got
+ * written. A zero length write however indicates
+ * ENOSPC or EFBIG. We have no way to know which
+ * though, so call it ENOSPC for now. EFBIG would
+ * get translated to AS_EIO anyway.
+ *
+ * FIXME: make it take into account the data that did
+ * get written
+ */
+ if (rc == 0) {
+ if (bytes_written == 0)
+ rc = -ENOSPC;
+ else if (bytes_written < bytes_to_write)
+ rc = -EAGAIN;
+ }
+
+ /* retry on data-integrity flush */
+ if (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN)
+ goto retry_write;
+
+ /* fix the stats and EOF */
+ if (bytes_written > 0) {
cifs_stats_bytes_written(tcon, bytes_written);
+ cifs_update_eof(cifsi, offset, bytes_written);
}
for (i = 0; i < n_iov; i++) {
page = pvec.pages[first + i];
- /* Should we also set page error on
- success rc but too little data written? */
- /* BB investigate retry logic on temporary
- server crash cases and how recovery works
- when page marked as error */
- if (rc)
+ /* on retryable write error, redirty page */
+ if (rc == -EAGAIN)
+ redirty_page_for_writepage(wbc, page);
+ else if (rc != 0)
SetPageError(page);
kunmap(page);
unlock_page(page);
end_page_writeback(page);
page_cache_release(page);
}
+
+ if (rc != -EAGAIN)
+ mapping_set_error(mapping, rc);
+ else
+ rc = 0;
+
if ((wbc->nr_to_write -= n_iov) <= 0)
done = 1;
index = next;
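
[Editor's note] The error mapping in the hunk above is worth restating outside the diff: a hard failure or empty batch is reported, a short batch is retried (data-integrity flushes retry immediately via the retry_write label), and only non-retryable errors reach mapping_set_error(). A minimal sketch of that mapping, with a hypothetical helper name that is not part of the patch:

	/*
	 * Illustrative only -- mirrors the rc fixup applied after
	 * CIFSSMBWrite2() in the hunk above.
	 */
	static int map_wdata_result(int rc, unsigned int written,
				    unsigned int requested)
	{
		if (rc)
			return rc;	/* send/receive failure */
		if (written == 0)
			return -ENOSPC;	/* zero-length write: assume no space */
		if (written < requested)
			return -EAGAIN;	/* short write: treat as retryable */
		return 0;
	}
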
@@ -2192,7 +2187,8 @@ void cifs_oplock_break(struct work_struct *work)
*/
if (!cfile->oplock_break_cancelled) {
rc = CIFSSMBLock(0, tlink_tcon(cfile->tlink), cfile->netfid, 0,
- 0, 0, 0, LOCKING_ANDX_OPLOCK_RELEASE, false);
+ 0, 0, 0, LOCKING_ANDX_OPLOCK_RELEASE, false,
+ cinode->clientCanCacheRead ? 1 : 0);
cFYI(1, "Oplock release rc = %d", rc);
}
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 0c7e36910e31..6c9ee8014ff0 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -32,7 +32,7 @@
#include "fscache.h"
-static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral)
+static void cifs_set_ops(struct inode *inode)
{
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
@@ -60,7 +60,7 @@ static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral)
break;
case S_IFDIR:
#ifdef CONFIG_CIFS_DFS_UPCALL
- if (is_dfs_referral) {
+ if (IS_AUTOMOUNT(inode)) {
inode->i_op = &cifs_dfs_referral_inode_operations;
} else {
#else /* NO DFS support, treat as a directory */
@@ -167,7 +167,9 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
}
spin_unlock(&inode->i_lock);
- cifs_set_ops(inode, fattr->cf_flags & CIFS_FATTR_DFS_REFERRAL);
+ if (fattr->cf_flags & CIFS_FATTR_DFS_REFERRAL)
+ inode->i_flags |= S_AUTOMOUNT;
+ cifs_set_ops(inode);
}
void
@@ -1324,10 +1326,6 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
/*BB check (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID ) to see if need
to set uid/gid */
inc_nlink(inode);
- if (pTcon->nocase)
- d_set_d_op(direntry, &cifs_ci_dentry_ops);
- else
- d_set_d_op(direntry, &cifs_dentry_ops);
cifs_unix_basic_to_fattr(&fattr, pInfo, cifs_sb);
cifs_fill_uniqueid(inode->i_sb, &fattr);
@@ -1368,10 +1366,6 @@ mkdir_get_info:
rc = cifs_get_inode_info(&newinode, full_path, NULL,
inode->i_sb, xid, NULL);
- if (pTcon->nocase)
- d_set_d_op(direntry, &cifs_ci_dentry_ops);
- else
- d_set_d_op(direntry, &cifs_dentry_ops);
d_instantiate(direntry, newinode);
/* setting nlink not necessary except in cases where we
* failed to get it from the server or was set bogus */
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index fe2f6a93c49e..306769de2fb5 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -524,10 +524,6 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
cFYI(1, "Create symlink ok, getinodeinfo fail rc = %d",
rc);
} else {
- if (pTcon->nocase)
- d_set_d_op(direntry, &cifs_ci_dentry_ops);
- else
- d_set_d_op(direntry, &cifs_dentry_ops);
d_instantiate(direntry, newinode);
}
}
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 43f10281bc19..09bfcf08a90f 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -571,7 +571,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
pCifsInode = CIFS_I(netfile->dentry->d_inode);
cifs_set_oplock_level(pCifsInode,
- pSMB->OplockLevel);
+ pSMB->OplockLevel ? OPLOCK_READ : 0);
/*
* cifs_oplock_break_put() can't be called
* from here. Get reference after queueing
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index 9aad47a2d62f..6783ce6cdc89 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -899,8 +899,8 @@ map_smb_to_linux_error(struct smb_hdr *smb, int logErr)
}
/* else ERRHRD class errors or junk - return EIO */
- cFYI(1, "Mapping smb error code %d to POSIX err %d",
- smberrcode, rc);
+ cFYI(1, "Mapping smb error code 0x%x to POSIX err %d",
+ le32_to_cpu(smb->Status.CifsError), rc);
/* generic corrective action e.g. reconnect SMB session on
* ERRbaduid could be added */
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 76b1b37c9e6b..7f25cc3d2256 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -102,11 +102,6 @@ cifs_readdir_lookup(struct dentry *parent, struct qstr *name,
return NULL;
}
- if (cifs_sb_master_tcon(CIFS_SB(sb))->nocase)
- d_set_d_op(dentry, &cifs_ci_dentry_ops);
- else
- d_set_d_op(dentry, &cifs_dentry_ops);
-
alias = d_materialise_unique(dentry, inode);
if (alias != NULL) {
dput(dentry);
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index eb746486e49e..1cffd82c4f13 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -879,7 +879,7 @@ ssetup_ntlmssp_authenticate:
BCC_LE(smb_buf) = cpu_to_le16(count);
rc = SendReceive2(xid, ses, iov, 3 /* num_iovecs */, &resp_buf_type,
- CIFS_STD_OP /* not long */ | CIFS_LOG_ERROR);
+ CIFS_LOG_ERROR);
/* SMB request buf freed in SendReceive2 */
pSMB = (SESSION_SETUP_ANDX *)iov[0].iov_base;
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 59ca81b16919..c8e2808cd5e6 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -36,7 +36,13 @@
extern mempool_t *cifs_mid_poolp;
-static struct mid_q_entry *
+static void
+wake_up_task(struct mid_q_entry *mid)
+{
+ wake_up_process(mid->callback_data);
+}
+
+struct mid_q_entry *
AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
{
struct mid_q_entry *temp;
@@ -58,28 +64,28 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
/* do_gettimeofday(&temp->when_sent);*/ /* easier to use jiffies */
/* when mid allocated can be before when sent */
temp->when_alloc = jiffies;
- temp->tsk = current;
+
+ /*
+ * The default is for the mid to be synchronous, so the
+ * default callback just wakes up the current task.
+ */
+ temp->callback = wake_up_task;
+ temp->callback_data = current;
}
- spin_lock(&GlobalMid_Lock);
- list_add_tail(&temp->qhead, &server->pending_mid_q);
atomic_inc(&midCount);
temp->midState = MID_REQUEST_ALLOCATED;
- spin_unlock(&GlobalMid_Lock);
return temp;
}
-static void
+void
DeleteMidQEntry(struct mid_q_entry *midEntry)
{
#ifdef CONFIG_CIFS_STATS2
unsigned long now;
#endif
- spin_lock(&GlobalMid_Lock);
midEntry->midState = MID_FREE;
- list_del(&midEntry->qhead);
atomic_dec(&midCount);
- spin_unlock(&GlobalMid_Lock);
if (midEntry->largeBuf)
cifs_buf_release(midEntry->resp_buf);
else
@@ -103,6 +109,16 @@ DeleteMidQEntry(struct mid_q_entry *midEntry)
mempool_free(midEntry, cifs_mid_poolp);
}
+static void
+delete_mid(struct mid_q_entry *mid)
+{
+ spin_lock(&GlobalMid_Lock);
+ list_del(&mid->qhead);
+ spin_unlock(&GlobalMid_Lock);
+
+ DeleteMidQEntry(mid);
+}
+
static int
smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
{
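
[Editor's note] With queueing moved out of AllocMidQEntry()/DeleteMidQEntry(), every caller now follows the same alloc-then-queue pattern under GlobalMid_Lock, and delete_mid() is the matching unlink-then-free. A condensed sketch (hypothetical helper name) of what allocate_mid() and cifs_call_async() do after this change:

	/* Hypothetical helper illustrating the alloc-then-queue pattern. */
	static struct mid_q_entry *
	alloc_and_queue_mid(struct TCP_Server_Info *server, struct smb_hdr *buf)
	{
		struct mid_q_entry *mid = AllocMidQEntry(buf, server);

		if (mid == NULL)
			return NULL;

		spin_lock(&GlobalMid_Lock);
		list_add_tail(&mid->qhead, &server->pending_mid_q);
		spin_unlock(&GlobalMid_Lock);
		return mid;
	}
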
@@ -244,31 +260,31 @@ smb_send(struct TCP_Server_Info *server, struct smb_hdr *smb_buffer,
return smb_sendv(server, &iov, 1);
}
-static int wait_for_free_request(struct cifsSesInfo *ses, const int long_op)
+static int wait_for_free_request(struct TCP_Server_Info *server,
+ const int long_op)
{
if (long_op == CIFS_ASYNC_OP) {
/* oplock breaks must not be held up */
- atomic_inc(&ses->server->inFlight);
+ atomic_inc(&server->inFlight);
return 0;
}
spin_lock(&GlobalMid_Lock);
while (1) {
- if (atomic_read(&ses->server->inFlight) >=
- cifs_max_pending){
+ if (atomic_read(&server->inFlight) >= cifs_max_pending) {
spin_unlock(&GlobalMid_Lock);
#ifdef CONFIG_CIFS_STATS2
- atomic_inc(&ses->server->num_waiters);
+ atomic_inc(&server->num_waiters);
#endif
- wait_event(ses->server->request_q,
- atomic_read(&ses->server->inFlight)
+ wait_event(server->request_q,
+ atomic_read(&server->inFlight)
< cifs_max_pending);
#ifdef CONFIG_CIFS_STATS2
- atomic_dec(&ses->server->num_waiters);
+ atomic_dec(&server->num_waiters);
#endif
spin_lock(&GlobalMid_Lock);
} else {
- if (ses->server->tcpStatus == CifsExiting) {
+ if (server->tcpStatus == CifsExiting) {
spin_unlock(&GlobalMid_Lock);
return -ENOENT;
}
@@ -278,7 +294,7 @@ static int wait_for_free_request(struct cifsSesInfo *ses, const int long_op)
/* update # of requests on the wire to server */
if (long_op != CIFS_BLOCKING_OP)
- atomic_inc(&ses->server->inFlight);
+ atomic_inc(&server->inFlight);
spin_unlock(&GlobalMid_Lock);
break;
}
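
[Editor's note] wait_for_free_request() now takes the server directly, but the throttling policy is unchanged. Stripped of the GlobalMid_Lock juggling and the stats counters, it amounts to this sketch:

	/* Condensed semantics only; locking elided. */
	if (long_op == CIFS_ASYNC_OP) {
		atomic_inc(&server->inFlight);	/* oplock breaks never wait */
		return 0;
	}
	wait_event(server->request_q,
		   atomic_read(&server->inFlight) < cifs_max_pending);
	if (server->tcpStatus == CifsExiting)
		return -ENOENT;
	if (long_op != CIFS_BLOCKING_OP)
		atomic_inc(&server->inFlight);	/* blocking ops aren't counted */
	return 0;
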
@@ -308,53 +324,81 @@ static int allocate_mid(struct cifsSesInfo *ses, struct smb_hdr *in_buf,
*ppmidQ = AllocMidQEntry(in_buf, ses->server);
if (*ppmidQ == NULL)
return -ENOMEM;
+ spin_lock(&GlobalMid_Lock);
+ list_add_tail(&(*ppmidQ)->qhead, &ses->server->pending_mid_q);
+ spin_unlock(&GlobalMid_Lock);
return 0;
}
-static int wait_for_response(struct cifsSesInfo *ses,
- struct mid_q_entry *midQ,
- unsigned long timeout,
- unsigned long time_to_wait)
+static int
+wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ)
{
- unsigned long curr_timeout;
+ int error;
- for (;;) {
- curr_timeout = timeout + jiffies;
- wait_event_timeout(ses->server->response_q,
- midQ->midState != MID_REQUEST_SUBMITTED, timeout);
+ error = wait_event_killable(server->response_q,
+ midQ->midState != MID_REQUEST_SUBMITTED);
+ if (error < 0)
+ return -ERESTARTSYS;
- if (time_after(jiffies, curr_timeout) &&
- (midQ->midState == MID_REQUEST_SUBMITTED) &&
- ((ses->server->tcpStatus == CifsGood) ||
- (ses->server->tcpStatus == CifsNew))) {
+ return 0;
+}
- unsigned long lrt;
- /* We timed out. Is the server still
- sending replies ? */
- spin_lock(&GlobalMid_Lock);
- lrt = ses->server->lstrp;
- spin_unlock(&GlobalMid_Lock);
+/*
+ * Send an SMB request and set the callback function in the mid to handle
+ * the result. Caller is responsible for dealing with timeouts.
+ */
+int
+cifs_call_async(struct TCP_Server_Info *server, struct smb_hdr *in_buf,
+ mid_callback_t *callback, void *cbdata)
+{
+ int rc;
+ struct mid_q_entry *mid;
- /* Calculate time_to_wait past last receive time.
- Although we prefer not to time out if the
- server is still responding - we will time
- out if the server takes more than 15 (or 45
- or 180) seconds to respond to this request
- and has not responded to any request from
- other threads on the client within 10 seconds */
- lrt += time_to_wait;
- if (time_after(jiffies, lrt)) {
- /* No replies for time_to_wait. */
- cERROR(1, "server not responding");
- return -1;
- }
- } else {
- return 0;
- }
+ rc = wait_for_free_request(server, CIFS_ASYNC_OP);
+ if (rc)
+ return rc;
+
+ mutex_lock(&server->srv_mutex);
+ mid = AllocMidQEntry(in_buf, server);
+ if (mid == NULL) {
+ mutex_unlock(&server->srv_mutex);
+ return -ENOMEM;
}
-}
+ /* put it on the pending_mid_q */
+ spin_lock(&GlobalMid_Lock);
+ list_add_tail(&mid->qhead, &server->pending_mid_q);
+ spin_unlock(&GlobalMid_Lock);
+
+ rc = cifs_sign_smb(in_buf, server, &mid->sequence_number);
+ if (rc) {
+ mutex_unlock(&server->srv_mutex);
+ goto out_err;
+ }
+
+ mid->callback = callback;
+ mid->callback_data = cbdata;
+ mid->midState = MID_REQUEST_SUBMITTED;
+#ifdef CONFIG_CIFS_STATS2
+ atomic_inc(&server->inSend);
+#endif
+ rc = smb_send(server, in_buf, in_buf->smb_buf_length);
+#ifdef CONFIG_CIFS_STATS2
+ atomic_dec(&server->inSend);
+ mid->when_sent = jiffies;
+#endif
+ mutex_unlock(&server->srv_mutex);
+ if (rc)
+ goto out_err;
+
+ return rc;
+out_err:
+ delete_mid(mid);
+ atomic_dec(&server->inFlight);
+ wake_up(&server->request_q);
+ return rc;
+}
/*
*
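
[Editor's note] cifs_call_async() hands the result to the mid's callback from the demultiplex thread, so the callback owns cleanup. A hypothetical caller (demo names, not in this patch; assumes the callback lives in transport.c so the static delete_mid() is visible), modeled on the out_err path above:

	/* Hypothetical completion: free the mid and release the slot. */
	static void demo_callback(struct mid_q_entry *mid)
	{
		struct TCP_Server_Info *server = mid->callback_data;

		/* mid->midState / mid->resp_buf hold the outcome here */
		delete_mid(mid);
		atomic_dec(&server->inFlight);
		wake_up(&server->request_q);
	}

	/* fire and forget; callback_data here is the server itself */
	rc = cifs_call_async(server, in_buf, demo_callback, server);
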
@@ -382,6 +426,81 @@ SendReceiveNoRsp(const unsigned int xid, struct cifsSesInfo *ses,
return rc;
}
+static int
+sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server)
+{
+ int rc = 0;
+
+ cFYI(1, "%s: cmd=%d mid=%d state=%d", __func__, mid->command,
+ mid->mid, mid->midState);
+
+ spin_lock(&GlobalMid_Lock);
+ /* ensure that it's no longer on the pending_mid_q */
+ list_del_init(&mid->qhead);
+
+ switch (mid->midState) {
+ case MID_RESPONSE_RECEIVED:
+ spin_unlock(&GlobalMid_Lock);
+ return rc;
+ case MID_REQUEST_SUBMITTED:
+ /* socket is going down, reject all calls */
+ if (server->tcpStatus == CifsExiting) {
+ cERROR(1, "%s: canceling mid=%d cmd=0x%x state=%d",
+ __func__, mid->mid, mid->command, mid->midState);
+ rc = -EHOSTDOWN;
+ break;
+ }
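+		/* otherwise fall through: mark the request retryable */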
+ case MID_RETRY_NEEDED:
+ rc = -EAGAIN;
+ break;
+ default:
+ cERROR(1, "%s: invalid mid state mid=%d state=%d", __func__,
+ mid->mid, mid->midState);
+ rc = -EIO;
+ }
+ spin_unlock(&GlobalMid_Lock);
+
+ DeleteMidQEntry(mid);
+ return rc;
+}
+
+/*
+ * An NT cancel request header looks just like the original request except:
+ *
+ * The Command is SMB_COM_NT_CANCEL
+ * The WordCount is zeroed out
+ * The ByteCount is zeroed out
+ *
+ * This function mangles an existing request buffer into a
+ * SMB_COM_NT_CANCEL request and then sends it.
+ */
+static int
+send_nt_cancel(struct TCP_Server_Info *server, struct smb_hdr *in_buf,
+ struct mid_q_entry *mid)
+{
+ int rc = 0;
+
+ /* -4 for RFC1001 length and +2 for BCC field */
+ in_buf->smb_buf_length = sizeof(struct smb_hdr) - 4 + 2;
+ in_buf->Command = SMB_COM_NT_CANCEL;
+ in_buf->WordCount = 0;
+ BCC_LE(in_buf) = 0;
+
+ mutex_lock(&server->srv_mutex);
+ rc = cifs_sign_smb(in_buf, server, &mid->sequence_number);
+ if (rc) {
+ mutex_unlock(&server->srv_mutex);
+ return rc;
+ }
+ rc = smb_send(server, in_buf, in_buf->smb_buf_length);
+ mutex_unlock(&server->srv_mutex);
+
+ cFYI(1, "issued NT_CANCEL for mid %u, rc = %d",
+ in_buf->Mid, rc);
+
+ return rc;
+}
+
int
SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
struct kvec *iov, int n_vec, int *pRespBufType /* ret */,
@@ -390,7 +509,6 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
int rc = 0;
int long_op;
unsigned int receive_len;
- unsigned long timeout;
struct mid_q_entry *midQ;
struct smb_hdr *in_buf = iov[0].iov_base;
@@ -413,7 +531,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
to the same server. We may make this configurable later or
use ses->maxReq */
- rc = wait_for_free_request(ses, long_op);
+ rc = wait_for_free_request(ses->server, long_op);
if (rc) {
cifs_small_buf_release(in_buf);
return rc;
@@ -457,65 +575,20 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
if (rc < 0)
goto out;
- if (long_op == CIFS_STD_OP)
- timeout = 15 * HZ;
- else if (long_op == CIFS_VLONG_OP) /* e.g. slow writes past EOF */
- timeout = 180 * HZ;
- else if (long_op == CIFS_LONG_OP)
- timeout = 45 * HZ; /* should be greater than
- servers oplock break timeout (about 43 seconds) */
- else if (long_op == CIFS_ASYNC_OP)
+ if (long_op == CIFS_ASYNC_OP)
goto out;
- else if (long_op == CIFS_BLOCKING_OP)
- timeout = 0x7FFFFFFF; /* large, but not so large as to wrap */
- else {
- cERROR(1, "unknown timeout flag %d", long_op);
- rc = -EIO;
- goto out;
- }
-
- /* wait for 15 seconds or until woken up due to response arriving or
- due to last connection to this server being unmounted */
- if (signal_pending(current)) {
- /* if signal pending do not hold up user for full smb timeout
- but we still give response a chance to complete */
- timeout = 2 * HZ;
- }
-
- /* No user interrupts in wait - wreaks havoc with performance */
- wait_for_response(ses, midQ, timeout, 10 * HZ);
-
- spin_lock(&GlobalMid_Lock);
- if (midQ->resp_buf == NULL) {
- cERROR(1, "No response to cmd %d mid %d",
- midQ->command, midQ->mid);
- if (midQ->midState == MID_REQUEST_SUBMITTED) {
- if (ses->server->tcpStatus == CifsExiting)
- rc = -EHOSTDOWN;
- else {
- ses->server->tcpStatus = CifsNeedReconnect;
- midQ->midState = MID_RETRY_NEEDED;
- }
- }
+ rc = wait_for_response(ses->server, midQ);
+ if (rc != 0)
+ goto out;
- if (rc != -EHOSTDOWN) {
- if (midQ->midState == MID_RETRY_NEEDED) {
- rc = -EAGAIN;
- cFYI(1, "marking request for retry");
- } else {
- rc = -EIO;
- }
- }
- spin_unlock(&GlobalMid_Lock);
- DeleteMidQEntry(midQ);
- /* Update # of requests on wire to server */
+ rc = sync_mid_result(midQ, ses->server);
+ if (rc != 0) {
atomic_dec(&ses->server->inFlight);
wake_up(&ses->server->request_q);
return rc;
}
- spin_unlock(&GlobalMid_Lock);
receive_len = midQ->resp_buf->smb_buf_length;
if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
@@ -564,14 +637,14 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
if ((flags & CIFS_NO_RESP) == 0)
midQ->resp_buf = NULL; /* mark it so buf will
not be freed by
- DeleteMidQEntry */
+ delete_mid */
} else {
rc = -EIO;
cFYI(1, "Bad MID state?");
}
out:
- DeleteMidQEntry(midQ);
+ delete_mid(midQ);
atomic_dec(&ses->server->inFlight);
wake_up(&ses->server->request_q);
@@ -585,7 +658,6 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
{
int rc = 0;
unsigned int receive_len;
- unsigned long timeout;
struct mid_q_entry *midQ;
if (ses == NULL) {
@@ -610,7 +682,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
return -EIO;
}
- rc = wait_for_free_request(ses, long_op);
+ rc = wait_for_free_request(ses->server, long_op);
if (rc)
return rc;
@@ -649,64 +721,20 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
if (rc < 0)
goto out;
- if (long_op == CIFS_STD_OP)
- timeout = 15 * HZ;
- /* wait for 15 seconds or until woken up due to response arriving or
- due to last connection to this server being unmounted */
- else if (long_op == CIFS_ASYNC_OP)
+ if (long_op == CIFS_ASYNC_OP)
goto out;
- else if (long_op == CIFS_VLONG_OP) /* writes past EOF can be slow */
- timeout = 180 * HZ;
- else if (long_op == CIFS_LONG_OP)
- timeout = 45 * HZ; /* should be greater than
- servers oplock break timeout (about 43 seconds) */
- else if (long_op == CIFS_BLOCKING_OP)
- timeout = 0x7FFFFFFF; /* large but not so large as to wrap */
- else {
- cERROR(1, "unknown timeout flag %d", long_op);
- rc = -EIO;
- goto out;
- }
- if (signal_pending(current)) {
- /* if signal pending do not hold up user for full smb timeout
- but we still give response a chance to complete */
- timeout = 2 * HZ;
- }
-
- /* No user interrupts in wait - wreaks havoc with performance */
- wait_for_response(ses, midQ, timeout, 10 * HZ);
-
- spin_lock(&GlobalMid_Lock);
- if (midQ->resp_buf == NULL) {
- cERROR(1, "No response for cmd %d mid %d",
- midQ->command, midQ->mid);
- if (midQ->midState == MID_REQUEST_SUBMITTED) {
- if (ses->server->tcpStatus == CifsExiting)
- rc = -EHOSTDOWN;
- else {
- ses->server->tcpStatus = CifsNeedReconnect;
- midQ->midState = MID_RETRY_NEEDED;
- }
- }
+ rc = wait_for_response(ses->server, midQ);
+ if (rc != 0)
+ goto out;
- if (rc != -EHOSTDOWN) {
- if (midQ->midState == MID_RETRY_NEEDED) {
- rc = -EAGAIN;
- cFYI(1, "marking request for retry");
- } else {
- rc = -EIO;
- }
- }
- spin_unlock(&GlobalMid_Lock);
- DeleteMidQEntry(midQ);
- /* Update # of requests on wire to server */
+ rc = sync_mid_result(midQ, ses->server);
+ if (rc != 0) {
atomic_dec(&ses->server->inFlight);
wake_up(&ses->server->request_q);
return rc;
}
- spin_unlock(&GlobalMid_Lock);
receive_len = midQ->resp_buf->smb_buf_length;
if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
@@ -755,36 +783,13 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
}
out:
- DeleteMidQEntry(midQ);
+ delete_mid(midQ);
atomic_dec(&ses->server->inFlight);
wake_up(&ses->server->request_q);
return rc;
}
-/* Send an NT_CANCEL SMB to cause the POSIX blocking lock to return. */
-
-static int
-send_nt_cancel(struct cifsTconInfo *tcon, struct smb_hdr *in_buf,
- struct mid_q_entry *midQ)
-{
- int rc = 0;
- struct cifsSesInfo *ses = tcon->ses;
- __u16 mid = in_buf->Mid;
-
- header_assemble(in_buf, SMB_COM_NT_CANCEL, tcon, 0);
- in_buf->Mid = mid;
- mutex_lock(&ses->server->srv_mutex);
- rc = cifs_sign_smb(in_buf, ses->server, &midQ->sequence_number);
- if (rc) {
- mutex_unlock(&ses->server->srv_mutex);
- return rc;
- }
- rc = smb_send(ses->server, in_buf, in_buf->smb_buf_length);
- mutex_unlock(&ses->server->srv_mutex);
- return rc;
-}
-
/* We send a LOCKINGX_CANCEL_LOCK to cause the Windows
blocking lock to return. */
@@ -807,7 +812,7 @@ send_lock_cancel(const unsigned int xid, struct cifsTconInfo *tcon,
pSMB->hdr.Mid = GetNextMid(ses->server);
return SendReceive(xid, ses, in_buf, out_buf,
- &bytes_returned, CIFS_STD_OP);
+ &bytes_returned, 0);
}
int
@@ -845,7 +850,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
return -EIO;
}
- rc = wait_for_free_request(ses, CIFS_BLOCKING_OP);
+ rc = wait_for_free_request(ses->server, CIFS_BLOCKING_OP);
if (rc)
return rc;
@@ -863,7 +868,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
rc = cifs_sign_smb(in_buf, ses->server, &midQ->sequence_number);
if (rc) {
- DeleteMidQEntry(midQ);
+ delete_mid(midQ);
mutex_unlock(&ses->server->srv_mutex);
return rc;
}
@@ -880,7 +885,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
mutex_unlock(&ses->server->srv_mutex);
if (rc < 0) {
- DeleteMidQEntry(midQ);
+ delete_mid(midQ);
return rc;
}
@@ -899,10 +904,9 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
if (in_buf->Command == SMB_COM_TRANSACTION2) {
/* POSIX lock. We send a NT_CANCEL SMB to cause the
blocking lock to return. */
-
- rc = send_nt_cancel(tcon, in_buf, midQ);
+ rc = send_nt_cancel(ses->server, in_buf, midQ);
if (rc) {
- DeleteMidQEntry(midQ);
+ delete_mid(midQ);
return rc;
}
} else {
@@ -914,47 +918,22 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
/* If we get -ENOLCK back the lock may have
already been removed. Don't exit in this case. */
if (rc && rc != -ENOLCK) {
- DeleteMidQEntry(midQ);
+ delete_mid(midQ);
return rc;
}
}
- /* Wait 5 seconds for the response. */
- if (wait_for_response(ses, midQ, 5 * HZ, 5 * HZ) == 0) {
+ if (wait_for_response(ses->server, midQ) == 0) {
/* We got the response - restart system call. */
rstart = 1;
}
}
- spin_lock(&GlobalMid_Lock);
- if (midQ->resp_buf) {
- spin_unlock(&GlobalMid_Lock);
- receive_len = midQ->resp_buf->smb_buf_length;
- } else {
- cERROR(1, "No response for cmd %d mid %d",
- midQ->command, midQ->mid);
- if (midQ->midState == MID_REQUEST_SUBMITTED) {
- if (ses->server->tcpStatus == CifsExiting)
- rc = -EHOSTDOWN;
- else {
- ses->server->tcpStatus = CifsNeedReconnect;
- midQ->midState = MID_RETRY_NEEDED;
- }
- }
-
- if (rc != -EHOSTDOWN) {
- if (midQ->midState == MID_RETRY_NEEDED) {
- rc = -EAGAIN;
- cFYI(1, "marking request for retry");
- } else {
- rc = -EIO;
- }
- }
- spin_unlock(&GlobalMid_Lock);
- DeleteMidQEntry(midQ);
+ rc = sync_mid_result(midQ, ses->server);
+ if (rc != 0)
return rc;
- }
+ receive_len = midQ->resp_buf->smb_buf_length;
if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
cERROR(1, "Frame too large received. Length: %d Xid: %d",
receive_len, xid);
@@ -1001,7 +980,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
BCC(out_buf) = le16_to_cpu(BCC_LE(out_buf));
out:
- DeleteMidQEntry(midQ);
+ delete_mid(midQ);
if (rstart && rc == -EACCES)
return -ERESTARTSYS;
return rc;
diff --git a/fs/coda/cache.c b/fs/coda/cache.c
index 5525e1c660fd..690157876184 100644
--- a/fs/coda/cache.c
+++ b/fs/coda/cache.c
@@ -20,10 +20,9 @@
#include <linux/spinlock.h>
#include <linux/coda.h>
-#include <linux/coda_linux.h>
#include <linux/coda_psdev.h>
-#include <linux/coda_fs_i.h>
-#include <linux/coda_cache.h>
+#include "coda_linux.h"
+#include "coda_cache.h"
static atomic_t permission_epoch = ATOMIC_INIT(0);
diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c
index 602240569c89..6475877b0763 100644
--- a/fs/coda/cnode.c
+++ b/fs/coda/cnode.c
@@ -7,9 +7,8 @@
#include <linux/time.h>
#include <linux/coda.h>
-#include <linux/coda_linux.h>
-#include <linux/coda_fs_i.h>
#include <linux/coda_psdev.h>
+#include "coda_linux.h"
static inline int coda_fideq(struct CodaFid *fid1, struct CodaFid *fid2)
{
diff --git a/fs/coda/coda_cache.h b/fs/coda/coda_cache.h
new file mode 100644
index 000000000000..c910b5eb1ceb
--- /dev/null
+++ b/fs/coda/coda_cache.h
@@ -0,0 +1,22 @@
+/* Coda filesystem -- Linux Minicache
+ *
+ * Copyright (C) 1989 - 1997 Carnegie Mellon University
+ *
+ * Carnegie Mellon University encourages users of this software to
+ * contribute improvements to the Coda project. Contact Peter Braam
+ * <coda@cs.cmu.edu>
+ */
+
+#ifndef _CFSNC_HEADER_
+#define _CFSNC_HEADER_
+
+/* credential cache */
+void coda_cache_enter(struct inode *inode, int mask);
+void coda_cache_clear_inode(struct inode *);
+void coda_cache_clear_all(struct super_block *sb);
+int coda_cache_check(struct inode *inode, int mask);
+
+/* for downcalls and attributes and lookups */
+void coda_flag_inode_children(struct inode *inode, int flag);
+
+#endif /* _CFSNC_HEADER_ */
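
[Editor's note] These four entry points implement a per-inode grant cache keyed on the caller's fsuid and the global permission_epoch. A simplified sketch of how the permission path in fs/coda/dir.c consumes it (condensed from coda_permission(), not a verbatim copy):

	static int demo_permission(struct inode *inode, int mask)
	{
		int error;

		if (coda_cache_check(inode, mask))
			return 0;		/* Venus granted this recently */

		error = venus_access(inode->i_sb, coda_i2f(inode), mask);
		if (!error)
			coda_cache_enter(inode, mask);	/* remember the grant */
		return error;
	}
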
diff --git a/fs/coda/coda_fs_i.h b/fs/coda/coda_fs_i.h
new file mode 100644
index 000000000000..e35071b1de0e
--- /dev/null
+++ b/fs/coda/coda_fs_i.h
@@ -0,0 +1,58 @@
+/*
+ * coda_fs_i.h
+ *
+ * Copyright (C) 1998 Carnegie Mellon University
+ *
+ */
+
+#ifndef _LINUX_CODA_FS_I
+#define _LINUX_CODA_FS_I
+
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/coda.h>
+
+/*
+ * coda fs inode data
+ * c_lock protects accesses to c_flags, c_mapcount, c_cached_epoch, c_uid and
+ * c_cached_perm.
+ * vfs_inode is set only when the inode is created and never changes.
+ * c_fid is set when the inode is created and should be considered immutable.
+ */
+struct coda_inode_info {
+ struct CodaFid c_fid; /* Coda identifier */
+ u_short c_flags; /* flags (see below) */
+ unsigned int c_mapcount; /* nr of times this inode is mapped */
+ unsigned int c_cached_epoch; /* epoch for cached permissions */
+ vuid_t c_uid; /* fsuid for cached permissions */
+ unsigned int c_cached_perm; /* cached access permissions */
+ spinlock_t c_lock;
+ struct inode vfs_inode;
+};
+
+/*
+ * coda fs file private data
+ */
+#define CODA_MAGIC 0xC0DAC0DA
+struct coda_file_info {
+ int cfi_magic; /* magic number */
+ struct file *cfi_container; /* container file for this cnode */
+ unsigned int cfi_mapcount; /* nr of times this file is mapped */
+};
+
+#define CODA_FTOC(file) ((struct coda_file_info *)((file)->private_data))
+
+/* flags */
+#define C_VATTR 0x1 /* Validity of vattr in inode */
+#define C_FLUSH 0x2 /* used after a flush */
+#define C_DYING 0x4 /* from venus (which died) */
+#define C_PURGE 0x8
+
+int coda_cnode_make(struct inode **, struct CodaFid *, struct super_block *);
+struct inode *coda_iget(struct super_block *sb, struct CodaFid *fid, struct coda_vattr *attr);
+int coda_cnode_makectl(struct inode **inode, struct super_block *sb);
+struct inode *coda_fid_to_inode(struct CodaFid *fid, struct super_block *sb);
+void coda_replace_fid(struct inode *, struct CodaFid *, struct CodaFid *);
+
+#endif
diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c
index bf4a3fd3c8e3..2bdbcc11b373 100644
--- a/fs/coda/coda_linux.c
+++ b/fs/coda/coda_linux.c
@@ -17,9 +17,8 @@
#include <linux/string.h>
#include <linux/coda.h>
-#include <linux/coda_linux.h>
#include <linux/coda_psdev.h>
-#include <linux/coda_fs_i.h>
+#include "coda_linux.h"
/* initialize the debugging variables */
int coda_fake_statfs;
diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h
new file mode 100644
index 000000000000..9b0c5323890b
--- /dev/null
+++ b/fs/coda/coda_linux.h
@@ -0,0 +1,101 @@
+/*
+ * Coda File System, Linux Kernel module
+ *
+ * Original version, adapted from cfs_mach.c, (C) Carnegie Mellon University
+ * Linux modifications (C) 1996, Peter J. Braam
+ * Rewritten for Linux 2.1 (C) 1997 Carnegie Mellon University
+ *
+ * Carnegie Mellon University encourages users of this software to
+ * contribute improvements to the Coda project.
+ */
+
+#ifndef _LINUX_CODA_FS
+#define _LINUX_CODA_FS
+
+#include <linux/kernel.h>
+#include <linux/param.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+#include <linux/wait.h>
+#include <linux/types.h>
+#include <linux/fs.h>
+#include "coda_fs_i.h"
+
+/* operations */
+extern const struct inode_operations coda_dir_inode_operations;
+extern const struct inode_operations coda_file_inode_operations;
+extern const struct inode_operations coda_ioctl_inode_operations;
+
+extern const struct dentry_operations coda_dentry_operations;
+
+extern const struct address_space_operations coda_file_aops;
+extern const struct address_space_operations coda_symlink_aops;
+
+extern const struct file_operations coda_dir_operations;
+extern const struct file_operations coda_file_operations;
+extern const struct file_operations coda_ioctl_operations;
+
+/* operations shared over more than one file */
+int coda_open(struct inode *i, struct file *f);
+int coda_release(struct inode *i, struct file *f);
+int coda_permission(struct inode *inode, int mask, unsigned int flags);
+int coda_revalidate_inode(struct dentry *);
+int coda_getattr(struct vfsmount *, struct dentry *, struct kstat *);
+int coda_setattr(struct dentry *, struct iattr *);
+
+/* this file: helpers */
+char *coda_f2s(struct CodaFid *f);
+int coda_isroot(struct inode *i);
+int coda_iscontrol(const char *name, size_t length);
+
+void coda_vattr_to_iattr(struct inode *, struct coda_vattr *);
+void coda_iattr_to_vattr(struct iattr *, struct coda_vattr *);
+unsigned short coda_flags_to_cflags(unsigned short);
+
+/* sysctl.h */
+void coda_sysctl_init(void);
+void coda_sysctl_clean(void);
+
+#define CODA_ALLOC(ptr, cast, size) do { \
+ if (size < PAGE_SIZE) \
+ ptr = kmalloc((unsigned long) size, GFP_KERNEL); \
+ else \
+ ptr = (cast)vmalloc((unsigned long) size); \
+ if (!ptr) \
+ printk("kernel malloc returns 0 at %s:%d\n", __FILE__, __LINE__); \
+ else memset( ptr, 0, size ); \
+} while (0)
+
+
+#define CODA_FREE(ptr,size) \
+ do { if (size < PAGE_SIZE) kfree((ptr)); else vfree((ptr)); } while (0)
+
+/* inode to cnode access functions */
+
+static inline struct coda_inode_info *ITOC(struct inode *inode)
+{
+ return list_entry(inode, struct coda_inode_info, vfs_inode);
+}
+
+static __inline__ struct CodaFid *coda_i2f(struct inode *inode)
+{
+ return &(ITOC(inode)->c_fid);
+}
+
+static __inline__ char *coda_i2s(struct inode *inode)
+{
+ return coda_f2s(&(ITOC(inode)->c_fid));
+}
+
+/* this will not zap the inode away */
+static __inline__ void coda_flag_inode(struct inode *inode, int flag)
+{
+ struct coda_inode_info *cii = ITOC(inode);
+
+ spin_lock(&cii->c_lock);
+ cii->c_flags |= flag;
+ spin_unlock(&cii->c_lock);
+}
+
+#endif
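
[Editor's note] CODA_ALLOC() picks kmalloc() or vmalloc() by size and zeroes the result, so callers must pair it with CODA_FREE() using the same size. Typical use in the upcall code looks like this (illustrative; the insize variable is hypothetical):

	union inputArgs *inp;

	/* zeroed buffer; kmalloc for small sizes, vmalloc otherwise */
	CODA_ALLOC(inp, union inputArgs *, insize);
	if (!inp)
		return -ENOMEM;
	/* ... fill in the upcall arguments, talk to Venus ... */
	CODA_FREE(inp, insize);
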
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 29badd91360f..2b8dae4d121e 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -23,10 +23,9 @@
#include <asm/uaccess.h>
#include <linux/coda.h>
-#include <linux/coda_linux.h>
#include <linux/coda_psdev.h>
-#include <linux/coda_fs_i.h>
-#include <linux/coda_cache.h>
+#include "coda_linux.h"
+#include "coda_cache.h"
#include "coda_int.h"
@@ -61,7 +60,7 @@ static int coda_return_EIO(void)
}
#define CODA_EIO_ERROR ((void *) (coda_return_EIO))
-static const struct dentry_operations coda_dentry_operations =
+const struct dentry_operations coda_dentry_operations =
{
.d_revalidate = coda_dentry_revalidate,
.d_delete = coda_dentry_delete,
@@ -126,8 +125,6 @@ static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, struc
return ERR_PTR(error);
exit:
- d_set_d_op(entry, &coda_dentry_operations);
-
if (inode && (type & CODA_NOCACHE))
coda_flag_inode(inode, C_VATTR | C_PURGE);
diff --git a/fs/coda/file.c b/fs/coda/file.c
index c8b50ba4366a..0433057be330 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -21,10 +21,9 @@
#include <asm/uaccess.h>
#include <linux/coda.h>
-#include <linux/coda_linux.h>
-#include <linux/coda_fs_i.h>
#include <linux/coda_psdev.h>
+#include "coda_linux.h"
#include "coda_int.h"
static ssize_t
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 50dc7d189f56..871b27715465 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -28,10 +28,9 @@
#include <linux/vmalloc.h>
#include <linux/coda.h>
-#include <linux/coda_linux.h>
#include <linux/coda_psdev.h>
-#include <linux/coda_fs_i.h>
-#include <linux/coda_cache.h>
+#include "coda_linux.h"
+#include "coda_cache.h"
#include "coda_int.h"
@@ -45,7 +44,7 @@ static struct kmem_cache * coda_inode_cachep;
static struct inode *coda_alloc_inode(struct super_block *sb)
{
struct coda_inode_info *ei;
- ei = (struct coda_inode_info *)kmem_cache_alloc(coda_inode_cachep, GFP_KERNEL);
+ ei = kmem_cache_alloc(coda_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
memset(&ei->c_fid, 0, sizeof(struct CodaFid));
@@ -193,6 +192,7 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
sb->s_blocksize_bits = 12;
sb->s_magic = CODA_SUPER_MAGIC;
sb->s_op = &coda_super_operations;
+ sb->s_d_op = &coda_dentry_operations;
sb->s_bdi = &vc->bdi;
/* get root fid from Venus: this needs the root inode */
diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c
index 741f0bd03918..6cbb3afb36dc 100644
--- a/fs/coda/pioctl.c
+++ b/fs/coda/pioctl.c
@@ -19,10 +19,10 @@
#include <asm/uaccess.h>
#include <linux/coda.h>
-#include <linux/coda_linux.h>
-#include <linux/coda_fs_i.h>
#include <linux/coda_psdev.h>
+#include "coda_linux.h"
+
/* pioctl ops */
static int coda_ioctl_permission(struct inode *inode, int mask, unsigned int flags);
static long coda_pioctl(struct file *filp, unsigned int cmd,
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index 62647a8595e4..8f616e0e252c 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -43,10 +43,10 @@
#include <asm/uaccess.h>
#include <linux/coda.h>
-#include <linux/coda_linux.h>
-#include <linux/coda_fs_i.h>
#include <linux/coda_psdev.h>
+#include "coda_linux.h"
+
#include "coda_int.h"
/* statistics */
diff --git a/fs/coda/symlink.c b/fs/coda/symlink.c
index af78f007a2b0..ab94ef63caef 100644
--- a/fs/coda/symlink.c
+++ b/fs/coda/symlink.c
@@ -16,9 +16,9 @@
#include <linux/pagemap.h>
#include <linux/coda.h>
-#include <linux/coda_linux.h>
#include <linux/coda_psdev.h>
-#include <linux/coda_fs_i.h>
+
+#include "coda_linux.h"
static int coda_symlink_filler(struct file *file, struct page *page)
{
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index c3563cab9758..9727e0c52579 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -33,10 +33,9 @@
#include <linux/vfs.h>
#include <linux/coda.h>
-#include <linux/coda_linux.h>
#include <linux/coda_psdev.h>
-#include <linux/coda_fs_i.h>
-#include <linux/coda_cache.h>
+#include "coda_linux.h"
+#include "coda_cache.h"
#include "coda_int.h"
diff --git a/fs/compat.c b/fs/compat.c
index eb1740ac8c0a..f6fd0a00e6cc 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -257,7 +257,7 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs *
}
/*
- * The following statfs calls are copies of code from fs/open.c and
+ * The following statfs calls are copies of code from fs/statfs.c and
* should be checked against those from time to time
*/
asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_statfs __user *buf)
@@ -320,7 +320,9 @@ static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstat
__put_user(kbuf->f_namelen, &ubuf->f_namelen) ||
__put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) ||
__put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) ||
- __put_user(kbuf->f_frsize, &ubuf->f_frsize))
+ __put_user(kbuf->f_frsize, &ubuf->f_frsize) ||
+ __put_user(kbuf->f_flags, &ubuf->f_flags) ||
+ __clear_user(ubuf->f_spare, sizeof(ubuf->f_spare)))
return -EFAULT;
return 0;
}
@@ -597,10 +599,8 @@ ssize_t compat_rw_copy_check_uvector(int type,
if (nr_segs > fast_segs) {
ret = -ENOMEM;
iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
- if (iov == NULL) {
- *ret_pointer = fast_pointer;
+ if (iov == NULL)
goto out;
- }
}
*ret_pointer = iov;
diff --git a/fs/configfs/Kconfig b/fs/configfs/Kconfig
index 13587cc97a0b..9febcdefdfdc 100644
--- a/fs/configfs/Kconfig
+++ b/fs/configfs/Kconfig
@@ -1,8 +1,8 @@
config CONFIGFS_FS
tristate "Userspace-driven configuration filesystem"
- depends on SYSFS
+ select SYSFS
help
- configfs is a ram-based filesystem that provides the converse
+ configfs is a RAM-based filesystem that provides the converse
of sysfs's functionality. Where sysfs is a filesystem-based
view of kernel objects, configfs is a filesystem-based manager
of kernel objects, or config_items.
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
index 026cf68553a4..82bda8fdfc1c 100644
--- a/fs/configfs/configfs_internal.h
+++ b/fs/configfs/configfs_internal.h
@@ -90,6 +90,7 @@ extern const struct file_operations configfs_file_operations;
extern const struct file_operations bin_fops;
extern const struct inode_operations configfs_dir_inode_operations;
extern const struct inode_operations configfs_symlink_inode_operations;
+extern const struct dentry_operations configfs_dentry_ops;
extern int configfs_symlink(struct inode *dir, struct dentry *dentry,
const char *symname);
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 36637a8c1ed3..90ff3cb10de3 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -72,7 +72,7 @@ static int configfs_d_delete(const struct dentry *dentry)
return 1;
}
-static const struct dentry_operations configfs_dentry_ops = {
+const struct dentry_operations configfs_dentry_ops = {
.d_iput = configfs_d_iput,
/* simple_delete_dentry() isn't exported */
.d_delete = configfs_d_delete,
@@ -442,7 +442,6 @@ static int configfs_attach_attr(struct configfs_dirent * sd, struct dentry * den
return error;
}
- d_set_d_op(dentry, &configfs_dentry_ops);
d_rehash(dentry);
return 0;
@@ -489,7 +488,6 @@ static struct dentry * configfs_lookup(struct inode *dir,
*/
if (dentry->d_name.len > NAME_MAX)
return ERR_PTR(-ENAMETOOLONG);
- d_set_d_op(dentry, &configfs_dentry_ops);
d_add(dentry, NULL);
return NULL;
}
@@ -683,7 +681,6 @@ static int create_default_group(struct config_group *parent_group,
ret = -ENOMEM;
child = d_alloc(parent, &name);
if (child) {
- d_set_d_op(child, &configfs_dentry_ops);
d_add(child, NULL);
ret = configfs_attach_group(&parent_group->cg_item,
@@ -1681,7 +1678,6 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys)
err = -ENOMEM;
dentry = d_alloc(configfs_sb->s_root, &name);
if (dentry) {
- d_set_d_op(dentry, &configfs_dentry_ops);
d_add(dentry, NULL);
err = configfs_attach_group(sd->s_element, &group->cg_item,
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index 7d3607febe1c..ecc62178beda 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -101,6 +101,7 @@ static int configfs_fill_super(struct super_block *sb, void *data, int silent)
configfs_root_group.cg_item.ci_dentry = root;
root->d_fsdata = &configfs_root;
sb->s_root = root;
+ sb->s_d_op = &configfs_dentry_ops; /* the rest get that */
return 0;
}
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 32fd5fe9ca0e..e141939080f0 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -34,57 +34,81 @@ static const struct address_space_operations cramfs_aops;
static DEFINE_MUTEX(read_mutex);
-/* These two macros may change in future, to provide better st_ino
- semantics. */
-#define CRAMINO(x) (((x)->offset && (x)->size)?(x)->offset<<2:1)
+/* These macros may change in future, to provide better st_ino semantics. */
#define OFFSET(x) ((x)->i_ino)
-static void setup_inode(struct inode *inode, struct cramfs_inode * cramfs_inode)
+static unsigned long cramino(struct cramfs_inode *cino, unsigned int offset)
{
+ if (!cino->offset)
+ return offset + 1;
+ if (!cino->size)
+ return offset + 1;
+
+ /*
+ * The file mode test fixes buggy mkcramfs implementations where
+ * cramfs_inode->offset is set to a non-zero value for entries
+ * which do not contain data, such as device nodes and fifos.
+ */
+ switch (cino->mode & S_IFMT) {
+ case S_IFREG:
+ case S_IFDIR:
+ case S_IFLNK:
+ return cino->offset << 2;
+ default:
+ break;
+ }
+ return offset + 1;
+}
+
+static struct inode *get_cramfs_inode(struct super_block *sb,
+ struct cramfs_inode *cramfs_inode, unsigned int offset)
+{
+ struct inode *inode;
static struct timespec zerotime;
+
+ inode = iget_locked(sb, cramino(cramfs_inode, offset));
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+ if (!(inode->i_state & I_NEW))
+ return inode;
+
+ switch (cramfs_inode->mode & S_IFMT) {
+ case S_IFREG:
+ inode->i_fop = &generic_ro_fops;
+ inode->i_data.a_ops = &cramfs_aops;
+ break;
+ case S_IFDIR:
+ inode->i_op = &cramfs_dir_inode_operations;
+ inode->i_fop = &cramfs_directory_operations;
+ break;
+ case S_IFLNK:
+ inode->i_op = &page_symlink_inode_operations;
+ inode->i_data.a_ops = &cramfs_aops;
+ break;
+ default:
+ init_special_inode(inode, cramfs_inode->mode,
+ old_decode_dev(cramfs_inode->size));
+ }
+
inode->i_mode = cramfs_inode->mode;
inode->i_uid = cramfs_inode->uid;
- inode->i_size = cramfs_inode->size;
- inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1;
inode->i_gid = cramfs_inode->gid;
+
+ /* if the lower 2 bits are zero, the inode contains data */
+ if (!(inode->i_ino & 3)) {
+ inode->i_size = cramfs_inode->size;
+ inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1;
+ }
+
/* Struct copy intentional */
inode->i_mtime = inode->i_atime = inode->i_ctime = zerotime;
/* inode->i_nlink is left 1 - arguably wrong for directories,
but it's the best we can do without reading the directory
contents. 1 yields the right result in GNU find, even
without -noleaf option. */
- if (S_ISREG(inode->i_mode)) {
- inode->i_fop = &generic_ro_fops;
- inode->i_data.a_ops = &cramfs_aops;
- } else if (S_ISDIR(inode->i_mode)) {
- inode->i_op = &cramfs_dir_inode_operations;
- inode->i_fop = &cramfs_directory_operations;
- } else if (S_ISLNK(inode->i_mode)) {
- inode->i_op = &page_symlink_inode_operations;
- inode->i_data.a_ops = &cramfs_aops;
- } else {
- init_special_inode(inode, inode->i_mode,
- old_decode_dev(cramfs_inode->size));
- }
-}
-static struct inode *get_cramfs_inode(struct super_block *sb,
- struct cramfs_inode * cramfs_inode)
-{
- struct inode *inode;
- if (CRAMINO(cramfs_inode) == 1) {
- inode = new_inode(sb);
- if (inode) {
- inode->i_ino = 1;
- setup_inode(inode, cramfs_inode);
- }
- } else {
- inode = iget_locked(sb, CRAMINO(cramfs_inode));
- if (inode && (inode->i_state & I_NEW)) {
- setup_inode(inode, cramfs_inode);
- unlock_new_inode(inode);
- }
- }
+ unlock_new_inode(inode);
+
return inode;
}
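
[Editor's note] A worked example of the new st_ino scheme: shifting the stored offset left by two restores a byte offset whose low two bits are clear, while the offset + 1 fallback never has both low bits clear (inode entries are 4-byte aligned), which is exactly what the (inode->i_ino & 3) test keys on. Hypothetical image layout:

	/*
	 *   regular file, cino->offset = 0x68
	 *     -> st_ino = 0x68 << 2 = 0x1a0
	 *        (low 2 bits clear: i_size/i_blocks get filled in)
	 *   fifo at entry offset 0x30, bogus non-zero cino->offset
	 *     -> st_ino = 0x30 + 1 = 0x31
	 *        (low bits 01: treated as data-less)
	 */
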
@@ -265,6 +289,9 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent)
printk(KERN_ERR "cramfs: root is not a directory\n");
goto out;
}
+ /* correct strange, hard-coded permissions of mkcramfs */
+ super.root.mode |= (S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
+
root_offset = super.root.offset << 2;
if (super.flags & CRAMFS_FLAG_FSID_VERSION_2) {
sbi->size=super.size;
@@ -289,7 +316,7 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent)
/* Set it all up.. */
sb->s_op = &cramfs_ops;
- root = get_cramfs_inode(sb, &super.root);
+ root = get_cramfs_inode(sb, &super.root, 0);
if (!root)
goto out;
sb->s_root = d_alloc_root(root);
@@ -365,7 +392,7 @@ static int cramfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
*/
namelen = de->namelen << 2;
memcpy(buf, name, namelen);
- ino = CRAMINO(de);
+ ino = cramino(de, OFFSET(inode) + offset);
mode = de->mode;
mutex_unlock(&read_mutex);
nextoffset = offset + sizeof(*de) + namelen;
@@ -404,8 +431,9 @@ static struct dentry * cramfs_lookup(struct inode *dir, struct dentry *dentry, s
struct cramfs_inode *de;
char *name;
int namelen, retval;
+ int dir_off = OFFSET(dir) + offset;
- de = cramfs_read(dir->i_sb, OFFSET(dir) + offset, sizeof(*de)+CRAMFS_MAXPATHLEN);
+ de = cramfs_read(dir->i_sb, dir_off, sizeof(*de)+CRAMFS_MAXPATHLEN);
name = (char *)(de+1);
/* Try to take advantage of sorted directories */
@@ -436,7 +464,7 @@ static struct dentry * cramfs_lookup(struct inode *dir, struct dentry *dentry, s
if (!retval) {
struct cramfs_inode entry = *de;
mutex_unlock(&read_mutex);
- d_add(dentry, get_cramfs_inode(dir->i_sb, &entry));
+ d_add(dentry, get_cramfs_inode(dir->i_sb, &entry, dir_off));
return NULL;
}
/* else (retval < 0) */
diff --git a/fs/dcache.c b/fs/dcache.c
index 5699d4c027cb..9f493ee4dcba 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1320,6 +1320,7 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
__dget_dlock(parent);
dentry->d_parent = parent;
dentry->d_sb = parent->d_sb;
+ d_set_d_op(dentry, dentry->d_sb->s_d_op);
list_add(&dentry->d_u.d_child, &parent->d_subdirs);
spin_unlock(&parent->d_lock);
}
@@ -1335,6 +1336,7 @@ struct dentry *d_alloc_pseudo(struct super_block *sb, const struct qstr *name)
struct dentry *dentry = d_alloc(NULL, name);
if (dentry) {
dentry->d_sb = sb;
+ d_set_d_op(dentry, dentry->d_sb->s_d_op);
dentry->d_parent = dentry;
dentry->d_flags |= DCACHE_DISCONNECTED;
}
@@ -1355,8 +1357,8 @@ EXPORT_SYMBOL(d_alloc_name);
void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
{
- BUG_ON(dentry->d_op);
- BUG_ON(dentry->d_flags & (DCACHE_OP_HASH |
+ WARN_ON_ONCE(dentry->d_op);
+ WARN_ON_ONCE(dentry->d_flags & (DCACHE_OP_HASH |
DCACHE_OP_COMPARE |
DCACHE_OP_REVALIDATE |
DCACHE_OP_DELETE ));
@@ -1378,8 +1380,11 @@ EXPORT_SYMBOL(d_set_d_op);
static void __d_instantiate(struct dentry *dentry, struct inode *inode)
{
spin_lock(&dentry->d_lock);
- if (inode)
+ if (inode) {
+ if (unlikely(IS_AUTOMOUNT(inode)))
+ dentry->d_flags |= DCACHE_NEED_AUTOMOUNT;
list_add(&dentry->d_alias, &inode->i_dentry);
+ }
dentry->d_inode = inode;
dentry_rcuwalk_barrier(dentry);
spin_unlock(&dentry->d_lock);
@@ -1507,6 +1512,7 @@ struct dentry * d_alloc_root(struct inode * root_inode)
res = d_alloc(NULL, &name);
if (res) {
res->d_sb = root_inode->i_sb;
+ d_set_d_op(res, res->d_sb->s_d_op);
res->d_parent = res;
d_instantiate(res, root_inode);
}
@@ -1567,6 +1573,7 @@ struct dentry *d_obtain_alias(struct inode *inode)
/* attach a disconnected dentry */
spin_lock(&tmp->d_lock);
tmp->d_sb = inode->i_sb;
+ d_set_d_op(tmp, tmp->d_sb->s_d_op);
tmp->d_inode = inode;
tmp->d_flags |= DCACHE_DISCONNECTED;
list_add(&tmp->d_alias, &inode->i_dentry);
@@ -1966,7 +1973,7 @@ out:
/**
* d_validate - verify dentry provided from insecure source (deprecated)
* @dentry: The dentry alleged to be valid child of @dparent
- * @dparent: The parent dentry (known to be valid)
+ * @parent: The parent dentry (known to be valid)
*
* An insecure source has sent us a dentry, here we verify it and dget() it.
* This is used by ncpfs in its readdir implementation.
@@ -2449,8 +2456,7 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name)
}
/**
- * Prepend path string to a buffer
- *
+ * prepend_path - Prepend path string to a buffer
* @path: the dentry/vfsmount to report
* @root: root vfsmnt/dentry (may be modified by this function)
* @buffer: pointer to the end of the buffer
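
[Editor's note] The sb->s_d_op propagation added to d_alloc() and friends is what lets the cifs, coda, configfs and ecryptfs hunks above drop their per-dentry d_set_d_op() calls: set the operations once at mount time and every dentry allocated under that superblock inherits them. A minimal sketch for a hypothetical filesystem:

	#include <linux/fs.h>
	#include <linux/dcache.h>

	static const struct dentry_operations examplefs_dentry_ops;

	static int examplefs_fill_super(struct super_block *sb, void *data,
					int silent)
	{
		/* inherited by every dentry d_alloc()'d on this sb */
		sb->s_d_op = &examplefs_dentry_ops;
		return 0;
	}
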
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 85882f6ba5f7..b044705eedd4 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -325,12 +325,16 @@ void dio_end_io(struct bio *bio, int error)
}
EXPORT_SYMBOL_GPL(dio_end_io);
-static int
+static void
dio_bio_alloc(struct dio *dio, struct block_device *bdev,
sector_t first_sector, int nr_vecs)
{
struct bio *bio;
+ /*
+ * bio_alloc() is guaranteed to return a bio when called with
+ * __GFP_WAIT and we request a valid number of vectors.
+ */
bio = bio_alloc(GFP_KERNEL, nr_vecs);
bio->bi_bdev = bdev;
@@ -342,7 +346,6 @@ dio_bio_alloc(struct dio *dio, struct block_device *bdev,
dio->bio = bio;
dio->logical_offset_in_bio = dio->cur_page_fs_offset;
- return 0;
}
/*
@@ -583,8 +586,9 @@ static int dio_new_bio(struct dio *dio, sector_t start_sector)
goto out;
sector = start_sector << (dio->blkbits - 9);
nr_pages = min(dio->pages_in_io, bio_get_nr_vecs(dio->map_bh.b_bdev));
+ nr_pages = min(nr_pages, BIO_MAX_PAGES);
BUG_ON(nr_pages <= 0);
- ret = dio_bio_alloc(dio, dio->map_bh.b_bdev, sector, nr_pages);
+ dio_bio_alloc(dio, dio->map_bh.b_bdev, sector, nr_pages);
dio->boundary = 0;
out:
return ret;
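
[Editor's note] dio_bio_alloc() can drop its return value because bio_alloc(GFP_KERNEL, nr) only fails for an invalid vector count, and the caller now guarantees a valid one. The sizing rule, restated as a sketch with a hypothetical helper name:

	/* Hypothetical helper restating the clamp in dio_new_bio() above. */
	static inline int dio_nr_vecs(struct dio *dio)
	{
		int nr = min(dio->pages_in_io,
			     bio_get_nr_vecs(dio->map_bh.b_bdev));

		/* bio_alloc() cannot honor more than BIO_MAX_PAGES vectors */
		return min(nr, BIO_MAX_PAGES);
	}
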
diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig
index 2dbb422e8116..1897eb1b4b6a 100644
--- a/fs/dlm/Kconfig
+++ b/fs/dlm/Kconfig
@@ -1,8 +1,7 @@
menuconfig DLM
tristate "Distributed Lock Manager (DLM)"
depends on EXPERIMENTAL && INET
- depends on SYSFS && (IPV6 || IPV6=n)
- select CONFIGFS_FS
+ depends on SYSFS && CONFIGFS_FS && (IPV6 || IPV6=n)
select IP_SCTP
help
A general purpose distributed lock manager for kernel or userspace
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index cbadc1bee6e7..bfd8b680e648 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -348,7 +348,7 @@ static int encrypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
BUG_ON(!crypt_stat || !crypt_stat->tfm
|| !(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED));
if (unlikely(ecryptfs_verbosity > 0)) {
- ecryptfs_printk(KERN_DEBUG, "Key size [%d]; key:\n",
+ ecryptfs_printk(KERN_DEBUG, "Key size [%zd]; key:\n",
crypt_stat->key_size);
ecryptfs_dump_hex(crypt_stat->key,
crypt_stat->key_size);
@@ -413,10 +413,9 @@ static int ecryptfs_encrypt_extent(struct page *enc_extent_page,
rc = ecryptfs_derive_iv(extent_iv, crypt_stat,
(extent_base + extent_offset));
if (rc) {
- ecryptfs_printk(KERN_ERR, "Error attempting to "
- "derive IV for extent [0x%.16x]; "
- "rc = [%d]\n", (extent_base + extent_offset),
- rc);
+ ecryptfs_printk(KERN_ERR, "Error attempting to derive IV for "
+ "extent [0x%.16llx]; rc = [%d]\n",
+ (unsigned long long)(extent_base + extent_offset), rc);
goto out;
}
if (unlikely(ecryptfs_verbosity > 0)) {
@@ -443,9 +442,9 @@ static int ecryptfs_encrypt_extent(struct page *enc_extent_page,
}
rc = 0;
if (unlikely(ecryptfs_verbosity > 0)) {
- ecryptfs_printk(KERN_DEBUG, "Encrypt extent [0x%.16x]; "
- "rc = [%d]\n", (extent_base + extent_offset),
- rc);
+ ecryptfs_printk(KERN_DEBUG, "Encrypt extent [0x%.16llx]; "
+ "rc = [%d]\n",
+ (unsigned long long)(extent_base + extent_offset), rc);
ecryptfs_printk(KERN_DEBUG, "First 8 bytes after "
"encryption:\n");
ecryptfs_dump_hex((char *)(page_address(enc_extent_page)), 8);
@@ -540,10 +539,9 @@ static int ecryptfs_decrypt_extent(struct page *page,
rc = ecryptfs_derive_iv(extent_iv, crypt_stat,
(extent_base + extent_offset));
if (rc) {
- ecryptfs_printk(KERN_ERR, "Error attempting to "
- "derive IV for extent [0x%.16x]; "
- "rc = [%d]\n", (extent_base + extent_offset),
- rc);
+ ecryptfs_printk(KERN_ERR, "Error attempting to derive IV for "
+ "extent [0x%.16llx]; rc = [%d]\n",
+ (unsigned long long)(extent_base + extent_offset), rc);
goto out;
}
if (unlikely(ecryptfs_verbosity > 0)) {
@@ -571,9 +569,9 @@ static int ecryptfs_decrypt_extent(struct page *page,
}
rc = 0;
if (unlikely(ecryptfs_verbosity > 0)) {
- ecryptfs_printk(KERN_DEBUG, "Decrypt extent [0x%.16x]; "
- "rc = [%d]\n", (extent_base + extent_offset),
- rc);
+ ecryptfs_printk(KERN_DEBUG, "Decrypt extent [0x%.16llx]; "
+ "rc = [%d]\n",
+ (unsigned long long)(extent_base + extent_offset), rc);
ecryptfs_printk(KERN_DEBUG, "First 8 bytes after "
"decryption:\n");
ecryptfs_dump_hex((char *)(page_address(page)
@@ -780,7 +778,7 @@ int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat)
}
ecryptfs_printk(KERN_DEBUG,
"Initializing cipher [%s]; strlen = [%d]; "
- "key_size_bits = [%d]\n",
+ "key_size_bits = [%zd]\n",
crypt_stat->cipher, (int)strlen(crypt_stat->cipher),
crypt_stat->key_size << 3);
if (crypt_stat->tfm) {
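
[Editor's note] All of these format-string fixes follow the same rule: size_t/ssize_t values take the %zu/%zd length modifier, and 64-bit values get an explicit cast plus %llx, since a bare %d or %x mis-reads the varargs on 64-bit builds. For example:

	size_t key_size = 32;
	loff_t extent = 0x1000;

	printk(KERN_DEBUG "Key size [%zd]\n", key_size);
	printk(KERN_DEBUG "extent [0x%.16llx]\n", (unsigned long long)extent);
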
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 413a3c48f0bb..dbc84ed96336 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -192,7 +192,6 @@ ecryptfs_get_key_payload_data(struct key *key)
(((struct user_key_payload*)key->payload.data)->data);
}
-#define ECRYPTFS_SUPER_MAGIC 0xf15f
#define ECRYPTFS_MAX_KEYSET_SIZE 1024
#define ECRYPTFS_MAX_CIPHER_NAME_SIZE 32
#define ECRYPTFS_MAX_NUM_ENC_KEYS 64
@@ -584,6 +583,7 @@ ecryptfs_set_dentry_lower_mnt(struct dentry *dentry, struct vfsmount *lower_mnt)
#define ecryptfs_printk(type, fmt, arg...) \
__ecryptfs_printk(type "%s: " fmt, __func__, ## arg);
+__attribute__ ((format(printf, 1, 2)))
void __ecryptfs_printk(const char *fmt, ...);
extern const struct file_operations ecryptfs_main_fops;
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 91da02987bff..81e10e6a9443 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -47,7 +47,7 @@ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb,
const struct iovec *iov,
unsigned long nr_segs, loff_t pos)
{
- int rc;
+ ssize_t rc;
struct dentry *lower_dentry;
struct vfsmount *lower_vfsmount;
struct file *file = iocb->ki_filp;
@@ -191,18 +191,16 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
| ECRYPTFS_ENCRYPTED);
}
mutex_unlock(&crypt_stat->cs_mutex);
- if (!ecryptfs_inode_to_private(inode)->lower_file) {
- rc = ecryptfs_init_persistent_file(ecryptfs_dentry);
- if (rc) {
- printk(KERN_ERR "%s: Error attempting to initialize "
- "the persistent file for the dentry with name "
- "[%s]; rc = [%d]\n", __func__,
- ecryptfs_dentry->d_name.name, rc);
- goto out_free;
- }
+ rc = ecryptfs_init_persistent_file(ecryptfs_dentry);
+ if (rc) {
+ printk(KERN_ERR "%s: Error attempting to initialize "
+ "the persistent file for the dentry with name "
+ "[%s]; rc = [%d]\n", __func__,
+ ecryptfs_dentry->d_name.name, rc);
+ goto out_free;
}
- if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_RDONLY)
- && !(file->f_flags & O_RDONLY)) {
+ if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_ACCMODE)
+ == O_RDONLY && (file->f_flags & O_ACCMODE) != O_RDONLY) {
rc = -EPERM;
printk(KERN_WARNING "%s: Lower persistent file is RO; eCryptfs "
"file must hence be opened RO\n", __func__);
@@ -243,9 +241,9 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
}
}
mutex_unlock(&crypt_stat->cs_mutex);
- ecryptfs_printk(KERN_DEBUG, "inode w/ addr = [0x%p], i_ino = [0x%.16x] "
- "size: [0x%.16x]\n", inode, inode->i_ino,
- i_size_read(inode));
+ ecryptfs_printk(KERN_DEBUG, "inode w/ addr = [0x%p], i_ino = "
+ "[0x%.16lx] size: [0x%.16llx]\n", inode, inode->i_ino,
+ (unsigned long long)i_size_read(inode));
goto out;
out_free:
kmem_cache_free(ecryptfs_file_info_cache,
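
[Editor's note] The O_ACCMODE hunk above fixes a classic test that can never fire: O_RDONLY is defined as 0, so (f_flags & O_RDONLY) is always 0. The access mode has to be masked out first (sketch; lower_flags and file_flags are illustrative names):

	/* O_RDONLY == 0, O_WRONLY == 1, O_RDWR == 2, O_ACCMODE == 3 */
	bool lower_is_ro = (lower_flags & O_ACCMODE) == O_RDONLY;
	bool wants_write = (file_flags & O_ACCMODE) != O_RDONLY;

	if (lower_is_ro && wants_write)
		return -EPERM;	/* RO lower file can't back a writable file */
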
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 337352a94751..bd33f87a1907 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -185,15 +185,13 @@ static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry)
"context; rc = [%d]\n", rc);
goto out;
}
- if (!ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->lower_file) {
- rc = ecryptfs_init_persistent_file(ecryptfs_dentry);
- if (rc) {
- printk(KERN_ERR "%s: Error attempting to initialize "
- "the persistent file for the dentry with name "
- "[%s]; rc = [%d]\n", __func__,
- ecryptfs_dentry->d_name.name, rc);
- goto out;
- }
+ rc = ecryptfs_init_persistent_file(ecryptfs_dentry);
+ if (rc) {
+ printk(KERN_ERR "%s: Error attempting to initialize "
+ "the persistent file for the dentry with name "
+ "[%s]; rc = [%d]\n", __func__,
+ ecryptfs_dentry->d_name.name, rc);
+ goto out;
}
rc = ecryptfs_write_metadata(ecryptfs_dentry);
if (rc) {
@@ -302,15 +300,13 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
rc = -ENOMEM;
goto out;
}
- if (!ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->lower_file) {
- rc = ecryptfs_init_persistent_file(ecryptfs_dentry);
- if (rc) {
- printk(KERN_ERR "%s: Error attempting to initialize "
- "the persistent file for the dentry with name "
- "[%s]; rc = [%d]\n", __func__,
- ecryptfs_dentry->d_name.name, rc);
- goto out_free_kmem;
- }
+ rc = ecryptfs_init_persistent_file(ecryptfs_dentry);
+ if (rc) {
+ printk(KERN_ERR "%s: Error attempting to initialize "
+ "the persistent file for the dentry with name "
+ "[%s]; rc = [%d]\n", __func__,
+ ecryptfs_dentry->d_name.name, rc);
+ goto out_free_kmem;
}
crypt_stat = &ecryptfs_inode_to_private(
ecryptfs_dentry->d_inode)->crypt_stat;
@@ -441,7 +437,6 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
struct qstr lower_name;
int rc = 0;
- d_set_d_op(ecryptfs_dentry, &ecryptfs_dops);
if ((ecryptfs_dentry->d_name.len == 1
&& !strcmp(ecryptfs_dentry->d_name.name, "."))
|| (ecryptfs_dentry->d_name.len == 2
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index b1f6858a5223..c1436cff6f2d 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -59,7 +59,7 @@ static int process_request_key_err(long err_code)
break;
default:
ecryptfs_printk(KERN_WARNING, "Unknown error code: "
- "[0x%.16x]\n", err_code);
+ "[0x%.16lx]\n", err_code);
rc = -EINVAL;
}
return rc;
@@ -130,7 +130,7 @@ int ecryptfs_write_packet_length(char *dest, size_t size,
} else {
rc = -EINVAL;
ecryptfs_printk(KERN_WARNING,
- "Unsupported packet size: [%d]\n", size);
+ "Unsupported packet size: [%zd]\n", size);
}
return rc;
}
@@ -1672,7 +1672,7 @@ decrypt_passphrase_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok,
auth_tok->session_key.decrypted_key_size);
crypt_stat->flags |= ECRYPTFS_KEY_VALID;
if (unlikely(ecryptfs_verbosity > 0)) {
- ecryptfs_printk(KERN_DEBUG, "FEK of size [%d]:\n",
+ ecryptfs_printk(KERN_DEBUG, "FEK of size [%zd]:\n",
crypt_stat->key_size);
ecryptfs_dump_hex(crypt_stat->key,
crypt_stat->key_size);
@@ -1754,7 +1754,7 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat,
if (ECRYPTFS_SIG_SIZE != tag_11_contents_size) {
ecryptfs_printk(KERN_ERR, "Expected "
"signature of size [%d]; "
- "read size [%d]\n",
+ "read size [%zd]\n",
ECRYPTFS_SIG_SIZE,
tag_11_contents_size);
rc = -EIO;
@@ -1787,8 +1787,8 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat,
goto out_wipe_list;
break;
default:
- ecryptfs_printk(KERN_DEBUG, "No packet at offset "
- "[%d] of the file header; hex value of "
+ ecryptfs_printk(KERN_DEBUG, "No packet at offset [%zd] "
+ "of the file header; hex value of "
"character is [0x%.2x]\n", i, src[i]);
next_packet_is_auth_tok_packet = 0;
}
@@ -1864,8 +1864,8 @@ found_matching_auth_tok:
"session key for authentication token with sig "
"[%.*s]; rc = [%d]. Removing auth tok "
"candidate from the list and searching for "
- "the next match.\n", candidate_auth_tok_sig,
- ECRYPTFS_SIG_SIZE_HEX, rc);
+ "the next match.\n", ECRYPTFS_SIG_SIZE_HEX,
+ candidate_auth_tok_sig, rc);
list_for_each_entry_safe(auth_tok_list_item,
auth_tok_list_item_tmp,
&auth_tok_list, list) {
@@ -2168,7 +2168,7 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
if (encrypted_session_key_valid) {
ecryptfs_printk(KERN_DEBUG, "encrypted_session_key_valid != 0; "
"using auth_tok->session_key.encrypted_key, "
- "where key_rec->enc_key_size = [%d]\n",
+ "where key_rec->enc_key_size = [%zd]\n",
key_rec->enc_key_size);
memcpy(key_rec->enc_key,
auth_tok->session_key.encrypted_key,
@@ -2198,7 +2198,7 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
if (rc < 1 || rc > 2) {
ecryptfs_printk(KERN_ERR, "Error generating scatterlist "
"for crypt_stat session key; expected rc = 1; "
- "got rc = [%d]. key_rec->enc_key_size = [%d]\n",
+ "got rc = [%d]. key_rec->enc_key_size = [%zd]\n",
rc, key_rec->enc_key_size);
rc = -ENOMEM;
goto out;
@@ -2209,7 +2209,7 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
ecryptfs_printk(KERN_ERR, "Error generating scatterlist "
"for crypt_stat encrypted session key; "
"expected rc = 1; got rc = [%d]. "
- "key_rec->enc_key_size = [%d]\n", rc,
+ "key_rec->enc_key_size = [%zd]\n", rc,
key_rec->enc_key_size);
rc = -ENOMEM;
goto out;
@@ -2224,7 +2224,7 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
goto out;
}
rc = 0;
- ecryptfs_printk(KERN_DEBUG, "Encrypting [%d] bytes of the key\n",
+ ecryptfs_printk(KERN_DEBUG, "Encrypting [%zd] bytes of the key\n",
crypt_stat->key_size);
rc = crypto_blkcipher_encrypt(&desc, dst_sg, src_sg,
(*key_rec).enc_key_size);
@@ -2235,7 +2235,7 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
}
ecryptfs_printk(KERN_DEBUG, "This should be the encrypted key:\n");
if (ecryptfs_verbosity > 0) {
- ecryptfs_printk(KERN_DEBUG, "EFEK of size [%d]:\n",
+ ecryptfs_printk(KERN_DEBUG, "EFEK of size [%zd]:\n",
key_rec->enc_key_size);
ecryptfs_dump_hex(key_rec->enc_key,
key_rec->enc_key_size);
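These keystore.c hunks are all printk format fixes: size_t values move from %d to %zd and long values to %lx, because a mismatched specifier misreads the va_list on 64-bit targets. A short userspace sketch of the same rule, using plain printf:

    #include <stdio.h>
    #include <stddef.h>

    int main(void)
    {
            size_t key_size = 16;             /* stands in for crypt_stat->key_size */
            unsigned long err_code = 0xdeadbeefUL;

            printf("FEK of size [%zd]:\n", key_size);  /* %zu is the strictly unsigned form */
            printf("Unknown error code: [0x%.16lx]\n", err_code);
            return 0;
    }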
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 351038675376..758323a0f09a 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -36,6 +36,7 @@
#include <linux/parser.h>
#include <linux/fs_stack.h>
#include <linux/slab.h>
+#include <linux/magic.h>
#include "ecryptfs_kernel.h"
/**
@@ -141,25 +142,12 @@ int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry)
return rc;
}
-/**
- * ecryptfs_interpose
- * @lower_dentry: Existing dentry in the lower filesystem
- * @dentry: ecryptfs' dentry
- * @sb: ecryptfs's super_block
- * @flags: flags to govern behavior of interpose procedure
- *
- * Interposes upper and lower dentries.
- *
- * Returns zero on success; non-zero otherwise
- */
-int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry,
- struct super_block *sb, u32 flags)
+static struct inode *ecryptfs_get_inode(struct inode *lower_inode,
+ struct super_block *sb)
{
- struct inode *lower_inode;
struct inode *inode;
int rc = 0;
- lower_inode = lower_dentry->d_inode;
if (lower_inode->i_sb != ecryptfs_superblock_to_lower(sb)) {
rc = -EXDEV;
goto out;
@@ -189,17 +177,38 @@ int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry,
if (special_file(lower_inode->i_mode))
init_special_inode(inode, lower_inode->i_mode,
lower_inode->i_rdev);
- d_set_d_op(dentry, &ecryptfs_dops);
fsstack_copy_attr_all(inode, lower_inode);
/* This size will be overwritten for real files w/ headers and
* other metadata */
fsstack_copy_inode_size(inode, lower_inode);
+ return inode;
+out:
+ return ERR_PTR(rc);
+}
+
+/**
+ * ecryptfs_interpose
+ * @lower_dentry: Existing dentry in the lower filesystem
+ * @dentry: ecryptfs' dentry
+ * @sb: ecryptfs's super_block
+ * @flags: flags to govern behavior of interpose procedure
+ *
+ * Interposes upper and lower dentries.
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry,
+ struct super_block *sb, u32 flags)
+{
+ struct inode *lower_inode = lower_dentry->d_inode;
+ struct inode *inode = ecryptfs_get_inode(lower_inode, sb);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
if (flags & ECRYPTFS_INTERPOSE_FLAG_D_ADD)
d_add(dentry, inode);
else
d_instantiate(dentry, inode);
-out:
- return rc;
+ return 0;
}
enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig,
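Factoring ecryptfs_get_inode() out of ecryptfs_interpose() lets the rewritten mount path further down build the root inode without a dentry in hand; errors travel back in the pointer itself via ERR_PTR()/PTR_ERR(). A minimal userspace re-implementation of that convention, for illustration only:

    #include <errno.h>
    #include <stdio.h>

    #define MAX_ERRNO 4095

    static inline void *ERR_PTR(long error) { return (void *)error; }
    static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
    static inline int IS_ERR(const void *ptr)
    {
            return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
    }

    static void *get_object(int fail)
    {
            static int object = 42;

            if (fail)
                    return ERR_PTR(-EXDEV);   /* the errno rides in the pointer */
            return &object;
    }

    int main(void)
    {
            void *obj = get_object(1);

            if (IS_ERR(obj))
                    printf("error path, rc = %ld\n", PTR_ERR(obj));
            return 0;
    }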
@@ -492,59 +501,11 @@ struct kmem_cache *ecryptfs_sb_info_cache;
static struct file_system_type ecryptfs_fs_type;
/**
- * ecryptfs_read_super
- * @sb: The ecryptfs super block
- * @dev_name: The path to mount over
- *
- * Read the super block of the lower filesystem, and use
- * ecryptfs_interpose to create our initial inode and super block
- * struct.
- */
-static int ecryptfs_read_super(struct super_block *sb, const char *dev_name)
-{
- struct path path;
- int rc;
-
- rc = kern_path(dev_name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &path);
- if (rc) {
- ecryptfs_printk(KERN_WARNING, "path_lookup() failed\n");
- goto out;
- }
- if (path.dentry->d_sb->s_type == &ecryptfs_fs_type) {
- rc = -EINVAL;
- printk(KERN_ERR "Mount on filesystem of type "
- "eCryptfs explicitly disallowed due to "
- "known incompatibilities\n");
- goto out_free;
- }
- ecryptfs_set_superblock_lower(sb, path.dentry->d_sb);
- sb->s_maxbytes = path.dentry->d_sb->s_maxbytes;
- sb->s_blocksize = path.dentry->d_sb->s_blocksize;
- ecryptfs_set_dentry_lower(sb->s_root, path.dentry);
- ecryptfs_set_dentry_lower_mnt(sb->s_root, path.mnt);
- rc = ecryptfs_interpose(path.dentry, sb->s_root, sb, 0);
- if (rc)
- goto out_free;
- rc = 0;
- goto out;
-out_free:
- path_put(&path);
-out:
- return rc;
-}
-
-/**
* ecryptfs_get_sb
* @fs_type
* @flags
* @dev_name: The path to mount over
* @raw_data: The options passed into the kernel
- *
- * The whole ecryptfs_get_sb process is broken into 3 functions:
- * ecryptfs_parse_options(): handle options passed to ecryptfs, if any
- * ecryptfs_read_super(): this accesses the lower filesystem and uses
- * ecryptfs_interpose to perform most of the linking
- * ecryptfs_interpose(): links the lower filesystem into ecryptfs (inode.c)
*/
static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags,
const char *dev_name, void *raw_data)
@@ -553,6 +514,8 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
struct ecryptfs_sb_info *sbi;
struct ecryptfs_dentry_info *root_info;
const char *err = "Getting sb failed";
+ struct inode *inode;
+ struct path path;
int rc;
sbi = kmem_cache_zalloc(ecryptfs_sb_info_cache, GFP_KERNEL);
@@ -575,10 +538,8 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
s->s_flags = flags;
rc = bdi_setup_and_register(&sbi->bdi, "ecryptfs", BDI_CAP_MAP_COPY);
- if (rc) {
- deactivate_locked_super(s);
- goto out;
- }
+ if (rc)
+ goto out1;
ecryptfs_set_superblock_private(s, sbi);
s->s_bdi = &sbi->bdi;
@@ -586,34 +547,55 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
/* ->kill_sb() will take care of sbi after that point */
sbi = NULL;
s->s_op = &ecryptfs_sops;
+ s->s_d_op = &ecryptfs_dops;
- rc = -ENOMEM;
- s->s_root = d_alloc(NULL, &(const struct qstr) {
- .hash = 0,.name = "/",.len = 1});
+ err = "Reading sb failed";
+ rc = kern_path(dev_name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &path);
+ if (rc) {
+ ecryptfs_printk(KERN_WARNING, "kern_path() failed\n");
+ goto out1;
+ }
+ if (path.dentry->d_sb->s_type == &ecryptfs_fs_type) {
+ rc = -EINVAL;
+ printk(KERN_ERR "Mount on filesystem of type "
+ "eCryptfs explicitly disallowed due to "
+ "known incompatibilities\n");
+ goto out_free;
+ }
+ ecryptfs_set_superblock_lower(s, path.dentry->d_sb);
+ s->s_maxbytes = path.dentry->d_sb->s_maxbytes;
+ s->s_blocksize = path.dentry->d_sb->s_blocksize;
+ s->s_magic = ECRYPTFS_SUPER_MAGIC;
+
+ inode = ecryptfs_get_inode(path.dentry->d_inode, s);
+ rc = PTR_ERR(inode);
+ if (IS_ERR(inode))
+ goto out_free;
+
+ s->s_root = d_alloc_root(inode);
if (!s->s_root) {
- deactivate_locked_super(s);
- goto out;
+ iput(inode);
+ rc = -ENOMEM;
+ goto out_free;
}
- d_set_d_op(s->s_root, &ecryptfs_dops);
- s->s_root->d_sb = s;
- s->s_root->d_parent = s->s_root;
+ rc = -ENOMEM;
root_info = kmem_cache_zalloc(ecryptfs_dentry_info_cache, GFP_KERNEL);
- if (!root_info) {
- deactivate_locked_super(s);
- goto out;
- }
+ if (!root_info)
+ goto out_free;
+
/* ->kill_sb() will take care of root_info */
ecryptfs_set_dentry_private(s->s_root, root_info);
+ ecryptfs_set_dentry_lower(s->s_root, path.dentry);
+ ecryptfs_set_dentry_lower_mnt(s->s_root, path.mnt);
+
s->s_flags |= MS_ACTIVE;
- rc = ecryptfs_read_super(s, dev_name);
- if (rc) {
- deactivate_locked_super(s);
- err = "Reading sb failed";
- goto out;
- }
return dget(s->s_root);
+out_free:
+ path_put(&path);
+out1:
+ deactivate_locked_super(s);
out:
if (sbi) {
ecryptfs_destroy_mount_crypt_stat(&sbi->mount_crypt_stat);
@@ -828,9 +810,10 @@ static int __init ecryptfs_init(void)
ecryptfs_printk(KERN_ERR, "The eCryptfs extent size is "
"larger than the host's page size, and so "
"eCryptfs cannot run on this system. The "
- "default eCryptfs extent size is [%d] bytes; "
- "the page size is [%d] bytes.\n",
- ECRYPTFS_DEFAULT_EXTENT_SIZE, PAGE_CACHE_SIZE);
+ "default eCryptfs extent size is [%u] bytes; "
+ "the page size is [%lu] bytes.\n",
+ ECRYPTFS_DEFAULT_EXTENT_SIZE,
+ (unsigned long)PAGE_CACHE_SIZE);
goto out;
}
rc = ecryptfs_init_kmem_caches();
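The rewritten ecryptfs_mount() above folds ecryptfs_read_super() into the mount path and unwinds failures through ordered labels: out_free puts the looked-up path, out1 deactivates the half-built superblock, and out frees the sbi that ->kill_sb() has not yet adopted. A tiny sketch of that goto-unwind idiom under invented names:

    #include <stdlib.h>

    static int mount_like(void)
    {
            char *super, *path;
            int rc = -1;

            super = malloc(32);
            if (!super)
                    return -1;

            path = malloc(32);
            if (!path)
                    goto out_super;   /* only 'super' acquired so far */

            if (rc)                   /* pretend a later step failed */
                    goto out_path;

            return 0;

    out_path:
            free(path);               /* labels undo in reverse order */
    out_super:
            free(super);
            return rc;
    }

    int main(void) { return mount_like() ? 1 : 0; }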
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index b1d82756544b..cc64fca89f8d 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -65,7 +65,7 @@ static int ecryptfs_writepage(struct page *page, struct writeback_control *wbc)
rc = ecryptfs_encrypt_page(page);
if (rc) {
ecryptfs_printk(KERN_WARNING, "Error encrypting "
- "page (upper index [0x%.16x])\n", page->index);
+ "page (upper index [0x%.16lx])\n", page->index);
ClearPageUptodate(page);
goto out;
}
@@ -237,7 +237,7 @@ out:
ClearPageUptodate(page);
else
SetPageUptodate(page);
- ecryptfs_printk(KERN_DEBUG, "Unlocking page with index = [0x%.16x]\n",
+ ecryptfs_printk(KERN_DEBUG, "Unlocking page with index = [0x%.16lx]\n",
page->index);
unlock_page(page);
return rc;
@@ -290,6 +290,7 @@ static int ecryptfs_write_begin(struct file *file,
return -ENOMEM;
*pagep = page;
+ prev_page_end_size = ((loff_t)index << PAGE_CACHE_SHIFT);
if (!PageUptodate(page)) {
struct ecryptfs_crypt_stat *crypt_stat =
&ecryptfs_inode_to_private(mapping->host)->crypt_stat;
@@ -335,18 +336,23 @@ static int ecryptfs_write_begin(struct file *file,
SetPageUptodate(page);
}
} else {
- rc = ecryptfs_decrypt_page(page);
- if (rc) {
- printk(KERN_ERR "%s: Error decrypting page "
- "at index [%ld]; rc = [%d]\n",
- __func__, page->index, rc);
- ClearPageUptodate(page);
- goto out;
+ if (prev_page_end_size
+ >= i_size_read(page->mapping->host)) {
+ zero_user(page, 0, PAGE_CACHE_SIZE);
+ } else {
+ rc = ecryptfs_decrypt_page(page);
+ if (rc) {
+ printk(KERN_ERR "%s: Error decrypting "
+ "page at index [%ld]; "
+ "rc = [%d]\n",
+ __func__, page->index, rc);
+ ClearPageUptodate(page);
+ goto out;
+ }
}
SetPageUptodate(page);
}
}
- prev_page_end_size = ((loff_t)index << PAGE_CACHE_SHIFT);
/* If creating a page or more of holes, zero them out via truncate.
* Note, this will increase i_size. */
if (index != 0) {
@@ -488,7 +494,7 @@ static int ecryptfs_write_end(struct file *file,
} else
ecryptfs_printk(KERN_DEBUG, "Not a new file\n");
ecryptfs_printk(KERN_DEBUG, "Calling fill_zeros_to_end_of_page"
- "(page w/ index = [0x%.16x], to = [%d])\n", index, to);
+ "(page w/ index = [0x%.16lx], to = [%d])\n", index, to);
if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
rc = ecryptfs_write_lower_page_segment(ecryptfs_inode, page, 0,
to);
@@ -503,19 +509,20 @@ static int ecryptfs_write_end(struct file *file,
rc = fill_zeros_to_end_of_page(page, to);
if (rc) {
ecryptfs_printk(KERN_WARNING, "Error attempting to fill "
- "zeros in page with index = [0x%.16x]\n", index);
+ "zeros in page with index = [0x%.16lx]\n", index);
goto out;
}
rc = ecryptfs_encrypt_page(page);
if (rc) {
ecryptfs_printk(KERN_WARNING, "Error encrypting page (upper "
- "index [0x%.16x])\n", index);
+ "index [0x%.16lx])\n", index);
goto out;
}
if (pos + copied > i_size_read(ecryptfs_inode)) {
i_size_write(ecryptfs_inode, pos + copied);
ecryptfs_printk(KERN_DEBUG, "Expanded file size to "
- "[0x%.16x]\n", i_size_read(ecryptfs_inode));
+ "[0x%.16llx]\n",
+ (unsigned long long)i_size_read(ecryptfs_inode));
}
rc = ecryptfs_write_inode_size_to_metadata(ecryptfs_inode);
if (rc)
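Moving the prev_page_end_size computation ahead of the PageUptodate checks lets the new write_begin branch skip decryption entirely when the page lies wholly beyond i_size and simply zero-fill it. Note the (loff_t) cast before the shift: widening after shifting would truncate offsets past 4 GiB wherever page indexes are 32-bit. A userspace sketch of that overflow, with an illustrative PAGE_SHIFT:

    #include <stdio.h>
    #include <stdint.h>

    #define PAGE_SHIFT 12   /* illustrative; matches common 4K pages */

    int main(void)
    {
            uint32_t index = 0x00100001;                    /* page just past 4 GiB */
            uint32_t truncated = index << PAGE_SHIFT;       /* wraps in 32 bits */
            int64_t widened = (int64_t)index << PAGE_SHIFT; /* cast first: correct */

            printf("truncated: 0x%08x\n", truncated);
            printf("widened:   0x%016llx\n", (unsigned long long)widened);
            return 0;
    }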
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 8cf07242067d..cc8a9b7d6064 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -217,7 +217,7 @@ struct ep_send_events_data {
* Configuration options available inside /proc/sys/fs/epoll/
*/
/* Maximum number of epoll watched descriptors, per user */
-static int max_user_watches __read_mostly;
+static long max_user_watches __read_mostly;
/*
* This mutex is used to serialize ep_free() and eventpoll_release_file().
@@ -240,16 +240,18 @@ static struct kmem_cache *pwq_cache __read_mostly;
#include <linux/sysctl.h>
-static int zero;
+static long zero;
+static long long_max = LONG_MAX;
ctl_table epoll_table[] = {
{
.procname = "max_user_watches",
.data = &max_user_watches,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(max_user_watches),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
+ .proc_handler = proc_doulongvec_minmax,
.extra1 = &zero,
+ .extra2 = &long_max,
},
{ }
};
@@ -561,7 +563,7 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
/* At this point it is safe to free the eventpoll item */
kmem_cache_free(epi_cache, epi);
- atomic_dec(&ep->user->epoll_watches);
+ atomic_long_dec(&ep->user->epoll_watches);
return 0;
}
@@ -898,11 +900,12 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
{
int error, revents, pwake = 0;
unsigned long flags;
+ long user_watches;
struct epitem *epi;
struct ep_pqueue epq;
- if (unlikely(atomic_read(&ep->user->epoll_watches) >=
- max_user_watches))
+ user_watches = atomic_long_read(&ep->user->epoll_watches);
+ if (unlikely(user_watches >= max_user_watches))
return -ENOSPC;
if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
return -ENOMEM;
@@ -966,7 +969,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
spin_unlock_irqrestore(&ep->lock, flags);
- atomic_inc(&ep->user->epoll_watches);
+ atomic_long_inc(&ep->user->epoll_watches);
/* We have to call this outside the lock */
if (pwake)
@@ -1426,6 +1429,7 @@ static int __init eventpoll_init(void)
*/
max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) /
EP_ITEM_COST;
+ BUG_ON(max_user_watches < 0);
/* Initialize the structure used to perform safe poll wait head wake ups */
ep_nested_calls_init(&poll_safewake_ncalls);
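The eventpoll hunks widen max_user_watches from int to long because the boot-time computation scales with total RAM and overflows a 32-bit int on very large machines; the sysctl entry then has to switch to proc_doulongvec_minmax with matching long-typed bounds, hence the long zero and long_max. An illustrative userspace computation of the overflow, with an assumed per-watch cost (not the kernel's exact figure):

    #include <stdio.h>
    #include <limits.h>

    #define EP_ITEM_COST 128   /* assumed per-watch cost, illustration only */

    int main(void)
    {
            unsigned long long pages = 1ULL << 31;          /* 4K pages: 8 TiB of RAM */
            unsigned long long bytes = (pages / 25) << 12;  /* the /25, <<PAGE_SHIFT step */
            long long watches = (long long)(bytes / EP_ITEM_COST);

            printf("watches: %lld (INT_MAX: %d)\n", watches, INT_MAX);
            printf("as int:  %d\n", (int)watches);  /* typically wraps negative */
            return 0;
    }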
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 0e0d391626be..85c8cc8f2473 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -364,7 +364,7 @@ static struct block_device *ext3_blkdev_get(dev_t dev, struct super_block *sb)
struct block_device *bdev;
char b[BDEVNAME_SIZE];
- bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
+ bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb);
if (IS_ERR(bdev))
goto fail;
return bdev;
@@ -381,8 +381,7 @@ fail:
*/
static int ext3_blkdev_put(struct block_device *bdev)
{
- bd_release(bdev);
- return blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
+ return blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
}
static int ext3_blkdev_remove(struct ext3_sb_info *sbi)
@@ -2162,13 +2161,6 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb,
if (bdev == NULL)
return NULL;
- if (bd_claim(bdev, sb)) {
- ext3_msg(sb, KERN_ERR,
- "error: failed to claim external journal device");
- blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
- return NULL;
- }
-
blocksize = sb->s_blocksize;
hblock = bdev_logical_block_size(bdev);
if (blocksize < hblock) {
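Both this ext3 hunk and the matching ext4 hunk below replace the open_by_devnum() + bd_claim() pair with a single blkdev_get_by_dev(dev, mode | FMODE_EXCL, holder) call: the holder argument (the superblock here) claims the device exclusively at open time, and blkdev_put() with FMODE_EXCL releases the claim. Userspace sees the same exclusion through O_EXCL on a block device node; a small illustrative sketch with a hypothetical device path:

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            /* hypothetical device node, for illustration */
            int fd = open("/dev/sdz1", O_RDWR | O_EXCL);

            if (fd < 0) {
                    perror("exclusive open");   /* EBUSY if someone holds a claim */
                    return 1;
            }
            /* ... the claim lasts until close(fd) ... */
            close(fd);
            return 0;
    }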
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index bab2387fba43..0c8d97b56f34 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -561,7 +561,7 @@ struct ext4_new_group_data {
#define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION
#endif
-/* Max physical block we can addres w/o extents */
+/* Max physical block we can address w/o extents */
#define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF
/*
@@ -2065,7 +2065,7 @@ extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
extern void ext4_ext_truncate(struct inode *);
extern void ext4_ext_init(struct super_block *);
extern void ext4_ext_release(struct super_block *);
-extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
+extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
loff_t len);
extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
ssize_t len);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index e910720e8bb8..63a75810b7c3 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2845,14 +2845,14 @@ fix_extent_len:
* to an uninitialized extent.
*
 * Writing to an uninitialized extent may result in splitting the uninitialized
- * extent into multiple /intialized unintialized extents (up to three)
+ * extent into multiple /initialized uninitialized extents (up to three)
* There are three possibilities:
* a> There is no split required: Entire extent should be uninitialized
* b> Splits in two extents: Write is happening at either end of the extent
 * c> Splits in three extents: Someone is writing in the middle of the extent
*
 * One or more index blocks may be needed if the extent tree grows after
- * the unintialized extent split. To prevent ENOSPC occur at the IO
+ * the uninitialized extent split. To prevent ENOSPC occur at the IO
 * complete, we need to split the uninitialized extent before DIO submits
 * the IO. The uninitialized extent called at this time will be split
 * into three uninitialized extents (at most). After IO completes, the part
@@ -3627,14 +3627,15 @@ static void ext4_falloc_update_inode(struct inode *inode,
}
/*
- * preallocate space for a file. This implements ext4's fallocate inode
+ * preallocate space for a file. This implements ext4's fallocate file
* operation, which gets called from sys_fallocate system call.
* For block-mapped files, posix_fallocate should fall back to the method
* of writing zeroes to the required new blocks (the same behavior which is
* expected for file systems which do not support fallocate() system call).
*/
-long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
+long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
{
+ struct inode *inode = file->f_path.dentry->d_inode;
handle_t *handle;
loff_t new_size;
unsigned int max_blocks;
@@ -3644,6 +3645,10 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
struct ext4_map_blocks map;
unsigned int credits, blkbits = inode->i_blkbits;
+ /* We only support the FALLOC_FL_KEEP_SIZE mode */
+ if (mode & ~FALLOC_FL_KEEP_SIZE)
+ return -EOPNOTSUPP;
+
/*
* currently supporting (pre)allocate mode for extent-based
* files _only_
@@ -3651,10 +3656,6 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
return -EOPNOTSUPP;
- /* preallocation to directories is currently not supported */
- if (S_ISDIR(inode->i_mode))
- return -ENODEV;
-
map.m_lblk = offset >> blkbits;
/*
* We can't just convert len to max_blocks because
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index bb003dc9ffff..2e8322c8aa88 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -210,6 +210,7 @@ const struct file_operations ext4_file_operations = {
.fsync = ext4_sync_file,
.splice_read = generic_file_splice_read,
.splice_write = generic_file_splice_write,
+ .fallocate = ext4_fallocate,
};
const struct inode_operations ext4_file_inode_operations = {
@@ -223,7 +224,6 @@ const struct inode_operations ext4_file_inode_operations = {
.removexattr = generic_removexattr,
#endif
.check_acl = ext4_check_acl,
- .fallocate = ext4_fallocate,
.fiemap = ext4_fiemap,
};
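Moving ->fallocate from inode_operations to file_operations gives handlers the struct file, and the new up-front `mode & ~FALLOC_FL_KEEP_SIZE` test means any mode bit a filesystem does not understand fails cleanly with EOPNOTSUPP instead of being silently ignored. From userspace the contract looks like this (illustrative sketch; scratch filename invented):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("scratch.bin", O_CREAT | O_RDWR, 0600);

            if (fd < 0)
                    return 1;
            /* preallocate 1 MiB without changing the visible file size */
            if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1 << 20) != 0)
                    perror("fallocate");   /* EOPNOTSUPP where unsupported */
            close(fd);
            return 0;
    }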
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index e80fc513eacc..9f7f9e49914f 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -39,6 +39,7 @@
#include <linux/bio.h>
#include <linux/workqueue.h>
#include <linux/kernel.h>
+#include <linux/printk.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
@@ -3379,7 +3380,7 @@ int ext4_alloc_da_blocks(struct inode *inode)
* doing I/O at all.
*
* We could call write_cache_pages(), and then redirty all of
- * the pages by calling redirty_page_for_writeback() but that
+ * the pages by calling redirty_page_for_writepage() but that
* would be ugly in the extreme. So instead we would need to
* replicate parts of the code in the above functions,
 * simplifying them because we wouldn't actually intend to
@@ -3737,7 +3738,7 @@ static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode)
retry:
io_end = ext4_init_io_end(inode, GFP_ATOMIC);
if (!io_end) {
- pr_warning_ratelimited("%s: allocation fail\n", __func__);
+ pr_warn_ratelimited("%s: allocation fail\n", __func__);
schedule();
goto retry;
}
@@ -3761,9 +3762,9 @@ retry:
* preallocated extents, and those write extend the file, no need to
* fall back to buffered IO.
*
- * For holes, we fallocate those blocks, mark them as unintialized
+ * For holes, we fallocate those blocks, mark them as uninitialized
 * If those blocks were preallocated, we make sure they are split, but
- * still keep the range to write as unintialized.
+ * still keep the range to write as uninitialized.
*
 * The unwritten extents will be converted to written when DIO is completed.
 * For async direct IO, since the IO may still be pending when we return, we
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 0f10ccd6bfc0..48ce561fafac 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -657,7 +657,7 @@ static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb)
struct block_device *bdev;
char b[BDEVNAME_SIZE];
- bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
+ bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb);
if (IS_ERR(bdev))
goto fail;
return bdev;
@@ -673,8 +673,7 @@ fail:
*/
static int ext4_blkdev_put(struct block_device *bdev)
{
- bd_release(bdev);
- return blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
+ return blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
}
static int ext4_blkdev_remove(struct ext4_sb_info *sbi)
@@ -3778,13 +3777,6 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
if (bdev == NULL)
return NULL;
- if (bd_claim(bdev, sb)) {
- ext4_msg(sb, KERN_ERR,
- "failed to claim external journal device");
- blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
- return NULL;
- }
-
blocksize = sb->s_blocksize;
hblock = bdev_logical_block_size(bdev);
if (blocksize < hblock) {
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index d75a77f85c28..f50408901f7e 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -319,7 +319,8 @@ extern struct inode *fat_build_inode(struct super_block *sb,
struct msdos_dir_entry *de, loff_t i_pos);
extern int fat_sync_inode(struct inode *inode);
extern int fat_fill_super(struct super_block *sb, void *data, int silent,
- const struct inode_operations *fs_dir_inode_ops, int isvfat);
+ const struct inode_operations *fs_dir_inode_ops,
+ int isvfat, void (*setup)(struct super_block *));
extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
struct inode *i2);
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 206351af7c58..86753fe10bd1 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -703,7 +703,6 @@ static struct dentry *fat_fh_to_dentry(struct super_block *sb,
struct fid *fid, int fh_len, int fh_type)
{
struct inode *inode = NULL;
- struct dentry *result;
u32 *fh = fid->raw;
if (fh_len < 5 || fh_type != 3)
@@ -748,10 +747,7 @@ static struct dentry *fat_fh_to_dentry(struct super_block *sb,
* the fat_iget lookup again. If that fails, then we are totally out
* of luck. But all that is for another day
*/
- result = d_obtain_alias(inode);
- if (!IS_ERR(result))
- d_set_d_op(result, sb->s_root->d_op);
- return result;
+ return d_obtain_alias(inode);
}
static int
@@ -799,8 +795,6 @@ static struct dentry *fat_get_parent(struct dentry *child)
brelse(bh);
parent = d_obtain_alias(inode);
- if (!IS_ERR(parent))
- d_set_d_op(parent, sb->s_root->d_op);
out:
unlock_super(sb);
@@ -1244,7 +1238,8 @@ static int fat_read_root(struct inode *inode)
* Read the super block of an MS-DOS FS.
*/
int fat_fill_super(struct super_block *sb, void *data, int silent,
- const struct inode_operations *fs_dir_inode_ops, int isvfat)
+ const struct inode_operations *fs_dir_inode_ops, int isvfat,
+ void (*setup)(struct super_block *))
{
struct inode *root_inode = NULL, *fat_inode = NULL;
struct buffer_head *bh;
@@ -1280,6 +1275,8 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
if (error)
goto out_fail;
+ setup(sb); /* flavour-specific stuff that needs options */
+
error = -EIO;
sb_min_blocksize(sb, 512);
bh = sb_bread(sb, 0);
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index 35ffe43afa4b..711499040eb6 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -227,11 +227,7 @@ static struct dentry *msdos_lookup(struct inode *dir, struct dentry *dentry,
}
out:
unlock_super(sb);
- d_set_d_op(dentry, &msdos_dentry_operations);
- dentry = d_splice_alias(inode, dentry);
- if (dentry)
- d_set_d_op(dentry, &msdos_dentry_operations);
- return dentry;
+ return d_splice_alias(inode, dentry);
error:
unlock_super(sb);
@@ -661,21 +657,16 @@ static const struct inode_operations msdos_dir_inode_operations = {
.getattr = fat_getattr,
};
-static int msdos_fill_super(struct super_block *sb, void *data, int silent)
+static void setup(struct super_block *sb)
{
- int res;
-
- lock_super(sb);
- res = fat_fill_super(sb, data, silent, &msdos_dir_inode_operations, 0);
- if (res) {
- unlock_super(sb);
- return res;
- }
-
+ sb->s_d_op = &msdos_dentry_operations;
sb->s_flags |= MS_NOATIME;
- d_set_d_op(sb->s_root, &msdos_dentry_operations);
- unlock_super(sb);
- return 0;
+}
+
+static int msdos_fill_super(struct super_block *sb, void *data, int silent)
+{
+ return fat_fill_super(sb, data, silent, &msdos_dir_inode_operations,
+ 0, setup);
}
static struct dentry *msdos_mount(struct file_system_type *fs_type,
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index e3ffc5e12332..f88f752babd9 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -772,13 +772,10 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
out:
unlock_super(sb);
- d_set_d_op(dentry, sb->s_root->d_op);
dentry->d_time = dentry->d_parent->d_inode->i_version;
dentry = d_splice_alias(inode, dentry);
- if (dentry) {
- d_set_d_op(dentry, sb->s_root->d_op);
+ if (dentry)
dentry->d_time = dentry->d_parent->d_inode->i_version;
- }
return dentry;
error:
@@ -1066,24 +1063,18 @@ static const struct inode_operations vfat_dir_inode_operations = {
.getattr = fat_getattr,
};
-static int vfat_fill_super(struct super_block *sb, void *data, int silent)
+static void setup(struct super_block *sb)
{
- int res;
-
- lock_super(sb);
- res = fat_fill_super(sb, data, silent, &vfat_dir_inode_operations, 1);
- if (res) {
- unlock_super(sb);
- return res;
- }
-
if (MSDOS_SB(sb)->options.name_check != 's')
- d_set_d_op(sb->s_root, &vfat_ci_dentry_ops);
+ sb->s_d_op = &vfat_ci_dentry_ops;
else
- d_set_d_op(sb->s_root, &vfat_dentry_ops);
+ sb->s_d_op = &vfat_dentry_ops;
+}
- unlock_super(sb);
- return 0;
+static int vfat_fill_super(struct super_block *sb, void *data, int silent)
+{
+ return fat_fill_super(sb, data, silent, &vfat_dir_inode_operations,
+ 1, setup);
}
static struct dentry *vfat_mount(struct file_system_type *fs_type,
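fat_fill_super() now takes a flavour-specific setup() callback so msdos and vfat can install their dentry operations as sb->s_d_op before the root dentry exists; every dentry created afterwards inherits them, which is what lets the per-dentry d_set_d_op() calls in the lookup paths go away. A toy sketch of the callback shape, with invented names:

    #include <stdio.h>

    struct super {
            const char *d_op_name;   /* stands in for sb->s_d_op */
    };

    static void vfat_setup(struct super *sb)
    {
            sb->d_op_name = "vfat_ci_dentry_ops";
    }

    static int fill_super_common(struct super *sb, void (*setup)(struct super *))
    {
            setup(sb);   /* flavour hook runs before the root dentry is made */
            printf("root dentry inherits: %s\n", sb->d_op_name);
            return 0;
    }

    int main(void)
    {
            struct super sb = { 0 };

            return fill_super_common(&sb, vfat_setup);
    }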
diff --git a/fs/file_table.c b/fs/file_table.c
index c3dee381f1b4..c3e89adf53c0 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -311,7 +311,7 @@ struct file *fget_light(unsigned int fd, int *fput_needed)
struct files_struct *files = current->files;
*fput_needed = 0;
- if (likely((atomic_read(&files->count) == 1))) {
+ if (atomic_read(&files->count) == 1) {
file = fcheck_files(files, fd);
} else {
rcu_read_lock();
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 3d06ccc953aa..59c6e4956786 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -84,13 +84,9 @@ static inline struct inode *wb_inode(struct list_head *head)
return list_entry(head, struct inode, i_wb_list);
}
-static void bdi_queue_work(struct backing_dev_info *bdi,
- struct wb_writeback_work *work)
+/* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. */
+static void bdi_wakeup_flusher(struct backing_dev_info *bdi)
{
- trace_writeback_queue(bdi, work);
-
- spin_lock_bh(&bdi->wb_lock);
- list_add_tail(&work->list, &bdi->work_list);
if (bdi->wb.task) {
wake_up_process(bdi->wb.task);
} else {
@@ -98,15 +94,26 @@ static void bdi_queue_work(struct backing_dev_info *bdi,
* The bdi thread isn't there, wake up the forker thread which
* will create and run it.
*/
- trace_writeback_nothread(bdi, work);
wake_up_process(default_backing_dev_info.wb.task);
}
+}
+
+static void bdi_queue_work(struct backing_dev_info *bdi,
+ struct wb_writeback_work *work)
+{
+ trace_writeback_queue(bdi, work);
+
+ spin_lock_bh(&bdi->wb_lock);
+ list_add_tail(&work->list, &bdi->work_list);
+ if (!bdi->wb.task)
+ trace_writeback_nothread(bdi, work);
+ bdi_wakeup_flusher(bdi);
spin_unlock_bh(&bdi->wb_lock);
}
static void
__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
- bool range_cyclic, bool for_background)
+ bool range_cyclic)
{
struct wb_writeback_work *work;
@@ -126,7 +133,6 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
work->sync_mode = WB_SYNC_NONE;
work->nr_pages = nr_pages;
work->range_cyclic = range_cyclic;
- work->for_background = for_background;
bdi_queue_work(bdi, work);
}
@@ -144,7 +150,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
*/
void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
{
- __bdi_start_writeback(bdi, nr_pages, true, false);
+ __bdi_start_writeback(bdi, nr_pages, true);
}
/**
@@ -152,13 +158,21 @@ void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
* @bdi: the backing device to write from
*
* Description:
- * This does WB_SYNC_NONE background writeback. The IO is only
- * started when this function returns, we make no guarentees on
- * completion. Caller need not hold sb s_umount semaphore.
+ * This makes sure WB_SYNC_NONE background writeback happens. When
+ * this function returns, it is only guaranteed that for given BDI
+ * some IO is happening if we are over background dirty threshold.
+ * Caller need not hold sb s_umount semaphore.
*/
void bdi_start_background_writeback(struct backing_dev_info *bdi)
{
- __bdi_start_writeback(bdi, LONG_MAX, true, true);
+ /*
+ * We just wake up the flusher thread. It will perform background
+ * writeback as soon as there is no other work to do.
+ */
+ trace_writeback_wake_background(bdi);
+ spin_lock_bh(&bdi->wb_lock);
+ bdi_wakeup_flusher(bdi);
+ spin_unlock_bh(&bdi->wb_lock);
}
/*
@@ -616,6 +630,7 @@ static long wb_writeback(struct bdi_writeback *wb,
};
unsigned long oldest_jif;
long wrote = 0;
+ long write_chunk;
struct inode *inode;
if (wbc.for_kupdate) {
@@ -628,6 +643,24 @@ static long wb_writeback(struct bdi_writeback *wb,
wbc.range_end = LLONG_MAX;
}
+ /*
+ * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
+ * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
+ * here avoids calling into writeback_inodes_wb() more than once.
+ *
+ * The intended call sequence for WB_SYNC_ALL writeback is:
+ *
+ * wb_writeback()
+ * __writeback_inodes_sb() <== called only once
+ * write_cache_pages() <== called once for each inode
+ * (quickly) tag currently dirty pages
+ * (maybe slowly) sync all tagged pages
+ */
+ if (wbc.sync_mode == WB_SYNC_NONE)
+ write_chunk = MAX_WRITEBACK_PAGES;
+ else
+ write_chunk = LONG_MAX;
+
wbc.wb_start = jiffies; /* livelock avoidance */
for (;;) {
/*
@@ -637,6 +670,16 @@ static long wb_writeback(struct bdi_writeback *wb,
break;
/*
+ * Background writeout and kupdate-style writeback may
+ * run forever. Stop them if there is other work to do
+ * so that e.g. sync can proceed. They'll be restarted
+ * after the other works are all done.
+ */
+ if ((work->for_background || work->for_kupdate) &&
+ !list_empty(&wb->bdi->work_list))
+ break;
+
+ /*
* For background writeout, stop when we are below the
* background dirty threshold
*/
@@ -644,7 +687,7 @@ static long wb_writeback(struct bdi_writeback *wb,
break;
wbc.more_io = 0;
- wbc.nr_to_write = MAX_WRITEBACK_PAGES;
+ wbc.nr_to_write = write_chunk;
wbc.pages_skipped = 0;
trace_wbc_writeback_start(&wbc, wb->bdi);
@@ -654,8 +697,8 @@ static long wb_writeback(struct bdi_writeback *wb,
writeback_inodes_wb(wb, &wbc);
trace_wbc_writeback_written(&wbc, wb->bdi);
- work->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
- wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
+ work->nr_pages -= write_chunk - wbc.nr_to_write;
+ wrote += write_chunk - wbc.nr_to_write;
/*
* If we consumed everything, see if we have more
@@ -670,7 +713,7 @@ static long wb_writeback(struct bdi_writeback *wb,
/*
* Did we write something? Try for more
*/
- if (wbc.nr_to_write < MAX_WRITEBACK_PAGES)
+ if (wbc.nr_to_write < write_chunk)
continue;
/*
* Nothing written. Wait for some inode to
@@ -718,6 +761,23 @@ static unsigned long get_nr_dirty_pages(void)
get_nr_dirty_inodes();
}
+static long wb_check_background_flush(struct bdi_writeback *wb)
+{
+ if (over_bground_thresh()) {
+
+ struct wb_writeback_work work = {
+ .nr_pages = LONG_MAX,
+ .sync_mode = WB_SYNC_NONE,
+ .for_background = 1,
+ .range_cyclic = 1,
+ };
+
+ return wb_writeback(wb, &work);
+ }
+
+ return 0;
+}
+
static long wb_check_old_data_flush(struct bdi_writeback *wb)
{
unsigned long expired;
@@ -787,6 +847,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
* Check for periodic writeback, kupdated() style
*/
wrote += wb_check_old_data_flush(wb);
+ wrote += wb_check_background_flush(wb);
clear_bit(BDI_writeback_running, &wb->bdi->state);
return wrote;
@@ -873,7 +934,7 @@ void wakeup_flusher_threads(long nr_pages)
list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
if (!bdi_has_dirty_io(bdi))
continue;
- __bdi_start_writeback(bdi, nr_pages, false, false);
+ __bdi_start_writeback(bdi, nr_pages, false);
}
rcu_read_unlock();
}
@@ -1164,7 +1225,7 @@ EXPORT_SYMBOL(writeback_inodes_sb_nr_if_idle);
* @sb: the superblock
*
* This function writes and waits on any dirty inode belonging to this
- * super_block. The number of pages synced is returned.
+ * super_block.
*/
void sync_inodes_sb(struct super_block *sb)
{
@@ -1242,11 +1303,11 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc)
EXPORT_SYMBOL(sync_inode);
/**
- * sync_inode - write an inode to disk
+ * sync_inode_metadata - write an inode to disk
* @inode: the inode to sync
* @wait: wait for I/O to complete.
*
- * Write an inode to disk and adjust it's dirty state after completion.
+ * Write an inode to disk and adjust its dirty state after completion.
*
* Note: only writes the actual inode, no associated data or other metadata.
*/
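The wb_writeback() change above picks the write chunk by sync mode: WB_SYNC_NONE keeps MAX_WRITEBACK_PAGES batches, while WB_SYNC_ALL writes with nr_to_write = LONG_MAX in a single pass, relying on write_cache_pages() tag-then-sync to avoid livelock; the progress accounting now subtracts from the chunk actually requested rather than a fixed constant. A trivial sketch of that accounting, reusing the kernel's constant name illustratively:

    #include <stdio.h>
    #include <limits.h>

    #define MAX_WRITEBACK_PAGES 1024

    int main(void)
    {
            int sync_all = 1;   /* a WB_SYNC_ALL pass */
            long write_chunk = sync_all ? LONG_MAX : MAX_WRITEBACK_PAGES;
            long nr_to_write = write_chunk;

            nr_to_write -= 300;   /* pretend 300 pages got written */
            printf("wrote %ld pages this pass\n", write_chunk - nr_to_write);
            return 0;
    }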
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index 68ca487bedb1..78b519c13536 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -4,6 +4,19 @@
#include <linux/path.h>
#include <linux/slab.h>
#include <linux/fs_struct.h>
+#include "internal.h"
+
+static inline void path_get_longterm(struct path *path)
+{
+ path_get(path);
+ mnt_make_longterm(path->mnt);
+}
+
+static inline void path_put_longterm(struct path *path)
+{
+ mnt_make_shortterm(path->mnt);
+ path_put(path);
+}
/*
* Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values.
@@ -17,11 +30,11 @@ void set_fs_root(struct fs_struct *fs, struct path *path)
write_seqcount_begin(&fs->seq);
old_root = fs->root;
fs->root = *path;
- path_get_long(path);
+ path_get_longterm(path);
write_seqcount_end(&fs->seq);
spin_unlock(&fs->lock);
if (old_root.dentry)
- path_put_long(&old_root);
+ path_put_longterm(&old_root);
}
/*
@@ -36,12 +49,12 @@ void set_fs_pwd(struct fs_struct *fs, struct path *path)
write_seqcount_begin(&fs->seq);
old_pwd = fs->pwd;
fs->pwd = *path;
- path_get_long(path);
+ path_get_longterm(path);
write_seqcount_end(&fs->seq);
spin_unlock(&fs->lock);
if (old_pwd.dentry)
- path_put_long(&old_pwd);
+ path_put_longterm(&old_pwd);
}
void chroot_fs_refs(struct path *old_root, struct path *new_root)
@@ -59,13 +72,13 @@ void chroot_fs_refs(struct path *old_root, struct path *new_root)
write_seqcount_begin(&fs->seq);
if (fs->root.dentry == old_root->dentry
&& fs->root.mnt == old_root->mnt) {
- path_get_long(new_root);
+ path_get_longterm(new_root);
fs->root = *new_root;
count++;
}
if (fs->pwd.dentry == old_root->dentry
&& fs->pwd.mnt == old_root->mnt) {
- path_get_long(new_root);
+ path_get_longterm(new_root);
fs->pwd = *new_root;
count++;
}
@@ -76,13 +89,13 @@ void chroot_fs_refs(struct path *old_root, struct path *new_root)
} while_each_thread(g, p);
read_unlock(&tasklist_lock);
while (count--)
- path_put_long(old_root);
+ path_put_longterm(old_root);
}
void free_fs_struct(struct fs_struct *fs)
{
- path_put_long(&fs->root);
- path_put_long(&fs->pwd);
+ path_put_longterm(&fs->root);
+ path_put_longterm(&fs->pwd);
kmem_cache_free(fs_cachep, fs);
}
@@ -118,9 +131,9 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old)
spin_lock(&old->lock);
fs->root = old->root;
- path_get_long(&fs->root);
+ path_get_longterm(&fs->root);
fs->pwd = old->pwd;
- path_get_long(&fs->pwd);
+ path_get_longterm(&fs->pwd);
spin_unlock(&old->lock);
}
return fs;
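The new path_get_longterm()/path_put_longterm() helpers pair the ordinary path refcount with the mnt_make_longterm()/mnt_make_shortterm() bookkeeping, so every fs_struct site takes and drops both halves together. A toy sketch of the paired-wrapper idiom, with illustrative counters only:

    #include <stdio.h>

    static int refcount, longterm;

    static inline void get_longterm(void) { refcount++; longterm++; }
    static inline void put_longterm(void) { longterm--; refcount--; }

    int main(void)
    {
            get_longterm();   /* both halves taken together */
            printf("ref=%d longterm=%d\n", refcount, longterm);
            put_longterm();   /* and dropped together */
            printf("ref=%d longterm=%d\n", refcount, longterm);
            return 0;
    }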
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index b9f34eaede09..48a18f184d50 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -101,7 +101,7 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
object->n_ops++;
object->n_exclusive++; /* reads and writes must wait */
- if (object->n_ops > 0) {
+ if (object->n_ops > 1) {
atomic_inc(&op->usage);
list_add_tail(&op->pend_link, &object->pending_ops);
fscache_stat(&fscache_n_op_pend);
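The fscache change is an off-by-one: fscache_submit_exclusive_op() increments object->n_ops for the operation being submitted before the test, so `n_ops > 0` was always true and every exclusive op landed on the pending list; `n_ops > 1` asks the intended question, namely whether anything else is still in flight. In miniature:

    #include <stdio.h>

    int main(void)
    {
            int n_ops = 0;

            n_ops++;   /* count the operation being submitted, as the kernel does */
            printf("old test (always pends): %d\n", n_ops > 0);
            printf("new test (runs now):     %d\n", n_ops > 1);
            return 0;
    }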
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 042af7346ec1..bfed8447ed80 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -350,7 +350,6 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
}
entry = newent ? newent : entry;
- d_set_d_op(entry, &fuse_dentry_operations);
if (outarg_valid)
fuse_change_entry_timeout(entry, &outarg);
else
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index f62b32cffea9..9e3f68cc1bd1 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -617,10 +617,8 @@ static struct dentry *fuse_get_dentry(struct super_block *sb,
goto out_iput;
entry = d_obtain_alias(inode);
- if (!IS_ERR(entry) && get_node_id(inode) != FUSE_ROOT_ID) {
- d_set_d_op(entry, &fuse_dentry_operations);
+ if (!IS_ERR(entry) && get_node_id(inode) != FUSE_ROOT_ID)
fuse_invalidate_entry_cache(entry);
- }
return entry;
@@ -719,10 +717,8 @@ static struct dentry *fuse_get_parent(struct dentry *child)
}
parent = d_obtain_alias(inode);
- if (!IS_ERR(parent) && get_node_id(inode) != FUSE_ROOT_ID) {
- d_set_d_op(parent, &fuse_dentry_operations);
+ if (!IS_ERR(parent) && get_node_id(inode) != FUSE_ROOT_ID)
fuse_invalidate_entry_cache(parent);
- }
return parent;
}
@@ -989,6 +985,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
iput(root);
goto err_put_conn;
}
+ /* only now - we want root dentry with NULL ->d_op */
+ sb->s_d_op = &fuse_dentry_operations;
init_req = fuse_request_alloc();
if (!init_req)
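fuse sets sb->s_d_op only after allocating the root, because s_d_op is copied into every dentry created from that point on and fuse wants the root left with a NULL ->d_op, as the comment in the hunk says. A toy illustration of installing a creation-time default after the one exception already exists (names invented):

    #include <stdio.h>

    struct sb { const char *default_ops; };   /* stands in for s_d_op */
    struct dentry { const char *ops; };

    static struct dentry make_dentry(const struct sb *sb)
    {
            struct dentry d = { .ops = sb->default_ops };  /* inherited at creation */
            return d;
    }

    int main(void)
    {
            struct sb sb = { .default_ops = NULL };
            struct dentry root = make_dentry(&sb);         /* NULL ->d_op, as wanted */

            sb.default_ops = "fuse_dentry_operations";     /* installed only now */
            struct dentry child = make_dentry(&sb);

            printf("root:  %s\n", root.ops ? root.ops : "(null)");
            printf("child: %s\n", child.ops);
            return 0;
    }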
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index 97012ecff560..9023db8184f9 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -126,12 +126,7 @@ static int gfs2_get_name(struct dentry *parent, char *name,
static struct dentry *gfs2_get_parent(struct dentry *child)
{
- struct dentry *dentry;
-
- dentry = d_obtain_alias(gfs2_lookupi(child->d_inode, &gfs2_qdotdot, 1));
- if (!IS_ERR(dentry))
- d_set_d_op(dentry, &gfs2_dops);
- return dentry;
+ return d_obtain_alias(gfs2_lookupi(child->d_inode, &gfs2_qdotdot, 1));
}
static struct dentry *gfs2_get_dentry(struct super_block *sb,
@@ -139,7 +134,6 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
{
struct gfs2_sbd *sdp = sb->s_fs_info;
struct inode *inode;
- struct dentry *dentry;
inode = gfs2_ilookup(sb, inum->no_addr);
if (inode) {
@@ -156,10 +150,7 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
return ERR_CAST(inode);
out_inode:
- dentry = d_obtain_alias(inode);
- if (!IS_ERR(dentry))
- d_set_d_op(dentry, &gfs2_dops);
- return dentry;
+ return d_obtain_alias(inode);
}
static struct dentry *gfs2_fh_to_dentry(struct super_block *sb, struct fid *fid,
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index fca6689e12e6..7cfdcb913363 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -19,6 +19,8 @@
#include <linux/fs.h>
#include <linux/gfs2_ondisk.h>
#include <linux/ext2_fs.h>
+#include <linux/falloc.h>
+#include <linux/swap.h>
#include <linux/crc32.h>
#include <linux/writeback.h>
#include <asm/uaccess.h>
@@ -610,6 +612,260 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
return generic_file_aio_write(iocb, iov, nr_segs, pos);
}
+static void empty_write_end(struct page *page, unsigned from,
+ unsigned to)
+{
+ struct gfs2_inode *ip = GFS2_I(page->mapping->host);
+
+ page_zero_new_buffers(page, from, to);
+ flush_dcache_page(page);
+ mark_page_accessed(page);
+
+ if (!gfs2_is_writeback(ip))
+ gfs2_page_add_databufs(ip, page, from, to);
+
+ block_commit_write(page, from, to);
+}
+
+static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
+{
+ unsigned start, end, next;
+ struct buffer_head *bh, *head;
+ int error;
+
+ if (!page_has_buffers(page)) {
+ error = __block_write_begin(page, from, to - from, gfs2_block_map);
+ if (unlikely(error))
+ return error;
+
+ empty_write_end(page, from, to);
+ return 0;
+ }
+
+ bh = head = page_buffers(page);
+ next = end = 0;
+ while (next < from) {
+ next += bh->b_size;
+ bh = bh->b_this_page;
+ }
+ start = next;
+ do {
+ next += bh->b_size;
+ if (buffer_mapped(bh)) {
+ if (end) {
+ error = __block_write_begin(page, start, end - start,
+ gfs2_block_map);
+ if (unlikely(error))
+ return error;
+ empty_write_end(page, start, end);
+ end = 0;
+ }
+ start = next;
+ }
+ else
+ end = next;
+ bh = bh->b_this_page;
+ } while (next < to);
+
+ if (end) {
+ error = __block_write_begin(page, start, end - start, gfs2_block_map);
+ if (unlikely(error))
+ return error;
+ empty_write_end(page, start, end);
+ }
+
+ return 0;
+}
+
+static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
+ int mode)
+{
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct buffer_head *dibh;
+ int error;
+ u64 start = offset >> PAGE_CACHE_SHIFT;
+ unsigned int start_offset = offset & ~PAGE_CACHE_MASK;
+ u64 end = (offset + len - 1) >> PAGE_CACHE_SHIFT;
+ pgoff_t curr;
+ struct page *page;
+ unsigned int end_offset = (offset + len) & ~PAGE_CACHE_MASK;
+ unsigned int from, to;
+
+ if (!end_offset)
+ end_offset = PAGE_CACHE_SIZE;
+
+ error = gfs2_meta_inode_buffer(ip, &dibh);
+ if (unlikely(error))
+ goto out;
+
+ gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+
+ if (gfs2_is_stuffed(ip)) {
+ error = gfs2_unstuff_dinode(ip, NULL);
+ if (unlikely(error))
+ goto out;
+ }
+
+ curr = start;
+ offset = start << PAGE_CACHE_SHIFT;
+ from = start_offset;
+ to = PAGE_CACHE_SIZE;
+ while (curr <= end) {
+ page = grab_cache_page_write_begin(inode->i_mapping, curr,
+ AOP_FLAG_NOFS);
+ if (unlikely(!page)) {
+ error = -ENOMEM;
+ goto out;
+ }
+
+ if (curr == end)
+ to = end_offset;
+ error = write_empty_blocks(page, from, to);
+ if (!error && offset + to > inode->i_size &&
+ !(mode & FALLOC_FL_KEEP_SIZE)) {
+ i_size_write(inode, offset + to);
+ }
+ unlock_page(page);
+ page_cache_release(page);
+ if (error)
+ goto out;
+ curr++;
+ offset += PAGE_CACHE_SIZE;
+ from = 0;
+ }
+
+ gfs2_dinode_out(ip, dibh->b_data);
+ mark_inode_dirty(inode);
+
+ brelse(dibh);
+
+out:
+ return error;
+}
+
+static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len,
+ unsigned int *data_blocks, unsigned int *ind_blocks)
+{
+ const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ unsigned int max_blocks = ip->i_alloc->al_rgd->rd_free_clone;
+ unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1);
+
+ for (tmp = max_data; tmp > sdp->sd_diptrs;) {
+ tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs);
+ max_data -= tmp;
+ }
+ /* This calculation isn't the exact reverse of gfs2_write_calc_reserve,
+ so it might end up with fewer data blocks */
+ if (max_data <= *data_blocks)
+ return;
+ *data_blocks = max_data;
+ *ind_blocks = max_blocks - max_data;
+ *len = ((loff_t)max_data - 3) << sdp->sd_sb.sb_bsize_shift;
+ if (*len > max) {
+ *len = max;
+ gfs2_write_calc_reserv(ip, max, data_blocks, ind_blocks);
+ }
+}
+
+static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
+ loff_t len)
+{
+ struct inode *inode = file->f_path.dentry->d_inode;
+ struct gfs2_sbd *sdp = GFS2_SB(inode);
+ struct gfs2_inode *ip = GFS2_I(inode);
+ unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
+ loff_t bytes, max_bytes;
+ struct gfs2_alloc *al;
+ int error;
+ loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift;
+ next = (next + 1) << sdp->sd_sb.sb_bsize_shift;
+
+ /* We only support the FALLOC_FL_KEEP_SIZE mode */
+ if (mode & ~FALLOC_FL_KEEP_SIZE)
+ return -EOPNOTSUPP;
+
+ offset = (offset >> sdp->sd_sb.sb_bsize_shift) <<
+ sdp->sd_sb.sb_bsize_shift;
+
+ len = next - offset;
+ bytes = sdp->sd_max_rg_data * sdp->sd_sb.sb_bsize / 2;
+ if (!bytes)
+ bytes = UINT_MAX;
+
+ gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
+ error = gfs2_glock_nq(&ip->i_gh);
+ if (unlikely(error))
+ goto out_uninit;
+
+ if (!gfs2_write_alloc_required(ip, offset, len))
+ goto out_unlock;
+
+ while (len > 0) {
+ if (len < bytes)
+ bytes = len;
+ al = gfs2_alloc_get(ip);
+ if (!al) {
+ error = -ENOMEM;
+ goto out_unlock;
+ }
+
+ error = gfs2_quota_lock_check(ip);
+ if (error)
+ goto out_alloc_put;
+
+retry:
+ gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks);
+
+ al->al_requested = data_blocks + ind_blocks;
+ error = gfs2_inplace_reserve(ip);
+ if (error) {
+ if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) {
+ bytes >>= 1;
+ goto retry;
+ }
+ goto out_qunlock;
+ }
+ max_bytes = bytes;
+ calc_max_reserv(ip, len, &max_bytes, &data_blocks, &ind_blocks);
+ al->al_requested = data_blocks + ind_blocks;
+
+ rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA +
+ RES_RG_HDR + gfs2_rg_blocks(al);
+ if (gfs2_is_jdata(ip))
+ rblocks += data_blocks ? data_blocks : 1;
+
+ error = gfs2_trans_begin(sdp, rblocks,
+ PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
+ if (error)
+ goto out_trans_fail;
+
+ error = fallocate_chunk(inode, offset, max_bytes, mode);
+ gfs2_trans_end(sdp);
+
+ if (error)
+ goto out_trans_fail;
+
+ len -= max_bytes;
+ offset += max_bytes;
+ gfs2_inplace_release(ip);
+ gfs2_quota_unlock(ip);
+ gfs2_alloc_put(ip);
+ }
+ goto out_unlock;
+
+out_trans_fail:
+ gfs2_inplace_release(ip);
+out_qunlock:
+ gfs2_quota_unlock(ip);
+out_alloc_put:
+ gfs2_alloc_put(ip);
+out_unlock:
+ gfs2_glock_dq(&ip->i_gh);
+out_uninit:
+ gfs2_holder_uninit(&ip->i_gh);
+ return error;
+}
+
#ifdef CONFIG_GFS2_FS_LOCKING_DLM
/**
@@ -765,6 +1021,7 @@ const struct file_operations gfs2_file_fops = {
.splice_read = generic_file_splice_read,
.splice_write = generic_file_splice_write,
.setlease = gfs2_setlease,
+ .fallocate = gfs2_fallocate,
};
const struct file_operations gfs2_dir_fops = {
@@ -794,6 +1051,7 @@ const struct file_operations gfs2_file_fops_nolock = {
.splice_read = generic_file_splice_read,
.splice_write = generic_file_splice_write,
.setlease = generic_setlease,
+ .fallocate = gfs2_fallocate,
};
const struct file_operations gfs2_dir_fops_nolock = {
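gfs2_fallocate() and its helpers move essentially verbatim from ops_inode.c into file.c (the matching removals follow below), picking up the struct file signature and the same FALLOC_FL_KEEP_SIZE mode check as ext4. Worth noting is the reservation loop: on ENOSPC it halves the requested chunk and retries until it reaches the block size, so a large preallocation degrades gracefully instead of failing outright. A userspace sketch of that retry policy, with an invented reservation function:

    #include <stdio.h>

    #define BSIZE 4096

    /* pretend reservation that only succeeds for requests <= 64 KiB */
    static int reserve(long bytes) { return bytes <= 65536 ? 0 : -1; }

    int main(void)
    {
            long bytes = 1L << 20;   /* initial 1 MiB ask */

            while (reserve(bytes) != 0 && bytes > BSIZE)
                    bytes >>= 1;     /* the "goto retry" halving path */

            printf("reserved chunk: %ld bytes\n", bytes);
            return 0;
    }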
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 2232b3c780bd..7aa7d4f8984a 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -74,16 +74,14 @@ static struct inode *gfs2_iget(struct super_block *sb, u64 no_addr)
}
/**
- * GFS2 lookup code fills in vfs inode contents based on info obtained
- * from directory entry inside gfs2_inode_lookup(). This has caused issues
- * with NFS code path since its get_dentry routine doesn't have the relevant
- * directory entry when gfs2_inode_lookup() is invoked. Part of the code
- * segment inside gfs2_inode_lookup code needs to get moved around.
+ * gfs2_set_iop - Sets inode operations
+ * @inode: The inode with correct i_mode filled in
*
- * Clears I_NEW as well.
- **/
+ * GFS2 lookup code fills in vfs inode contents based on info obtained
+ * from directory entry inside gfs2_inode_lookup().
+ */
-void gfs2_set_iop(struct inode *inode)
+static void gfs2_set_iop(struct inode *inode)
{
struct gfs2_sbd *sdp = GFS2_SB(inode);
umode_t mode = inode->i_mode;
@@ -106,8 +104,6 @@ void gfs2_set_iop(struct inode *inode)
inode->i_op = &gfs2_file_iops;
init_special_inode(inode, inode->i_mode, inode->i_rdev);
}
-
- unlock_new_inode(inode);
}
/**
@@ -119,10 +115,8 @@ void gfs2_set_iop(struct inode *inode)
* Returns: A VFS inode, or an error
*/
-struct inode *gfs2_inode_lookup(struct super_block *sb,
- unsigned int type,
- u64 no_addr,
- u64 no_formal_ino)
+struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
+ u64 no_addr, u64 no_formal_ino)
{
struct inode *inode;
struct gfs2_inode *ip;
@@ -152,51 +146,37 @@ struct inode *gfs2_inode_lookup(struct super_block *sb,
error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh);
if (unlikely(error))
goto fail_iopen;
- ip->i_iopen_gh.gh_gl->gl_object = ip;
+ ip->i_iopen_gh.gh_gl->gl_object = ip;
gfs2_glock_put(io_gl);
io_gl = NULL;
- if ((type == DT_UNKNOWN) && (no_formal_ino == 0))
- goto gfs2_nfsbypass;
-
- inode->i_mode = DT2IF(type);
-
- /*
- * We must read the inode in order to work out its type in
- * this case. Note that this doesn't happen often as we normally
- * know the type beforehand. This code path only occurs during
- * unlinked inode recovery (where it is safe to do this glock,
- * which is not true in the general case).
- */
if (type == DT_UNKNOWN) {
- struct gfs2_holder gh;
- error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
- if (unlikely(error))
- goto fail_glock;
- /* Inode is now uptodate */
- gfs2_glock_dq_uninit(&gh);
+ /* Inode glock must be locked already */
+ error = gfs2_inode_refresh(GFS2_I(inode));
+ if (error)
+ goto fail_refresh;
+ } else {
+ inode->i_mode = DT2IF(type);
}
gfs2_set_iop(inode);
+ unlock_new_inode(inode);
}
-gfs2_nfsbypass:
return inode;
-fail_glock:
- gfs2_glock_dq(&ip->i_iopen_gh);
+
+fail_refresh:
+ ip->i_iopen_gh.gh_gl->gl_object = NULL;
+ gfs2_glock_dq_uninit(&ip->i_iopen_gh);
fail_iopen:
if (io_gl)
gfs2_glock_put(io_gl);
fail_put:
- if (inode->i_state & I_NEW)
- ip->i_gl->gl_object = NULL;
+ ip->i_gl->gl_object = NULL;
gfs2_glock_put(ip->i_gl);
fail:
- if (inode->i_state & I_NEW)
- iget_failed(inode);
- else
- iput(inode);
+ iget_failed(inode);
return ERR_PTR(error);
}
@@ -221,14 +201,6 @@ struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
if (IS_ERR(inode))
goto fail;
- error = gfs2_inode_refresh(GFS2_I(inode));
- if (error)
- goto fail_iput;
-
- /* Pick up the works we bypass in gfs2_inode_lookup */
- if (inode->i_state & I_NEW)
- gfs2_set_iop(inode);
-
/* Two extra checks for NFS only */
if (no_formal_ino) {
error = -ESTALE;
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 732a183efdb3..3e00a66e7cbd 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -96,7 +96,6 @@ err:
return -EIO;
}
-extern void gfs2_set_iop(struct inode *inode);
extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type,
u64 no_addr, u64 no_formal_ino);
extern struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 2aeabd4218cc..777927ce6f79 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -440,7 +440,6 @@ static int gfs2_lookup_root(struct super_block *sb, struct dentry **dptr,
iput(inode);
return -ENOMEM;
}
- d_set_d_op(dentry, &gfs2_dops);
*dptr = dentry;
return 0;
}
@@ -1106,6 +1105,7 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
sb->s_magic = GFS2_MAGIC;
sb->s_op = &gfs2_super_ops;
+ sb->s_d_op = &gfs2_dops;
sb->s_export_op = &gfs2_export_ops;
sb->s_xattr = gfs2_xattr_handlers;
sb->s_qcop = &gfs2_quotactl_ops;
@@ -1268,7 +1268,7 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags,
{
struct block_device *bdev;
struct super_block *s;
- fmode_t mode = FMODE_READ;
+ fmode_t mode = FMODE_READ | FMODE_EXCL;
int error;
struct gfs2_args args;
struct gfs2_sbd *sdp;
@@ -1276,7 +1276,7 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags,
if (!(flags & MS_RDONLY))
mode |= FMODE_WRITE;
- bdev = open_bdev_exclusive(dev_name, mode, fs_type);
+ bdev = blkdev_get_by_path(dev_name, mode, fs_type);
if (IS_ERR(bdev))
return ERR_CAST(bdev);
@@ -1298,7 +1298,7 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags,
goto error_bdev;
if (s->s_root)
- close_bdev_exclusive(bdev, mode);
+ blkdev_put(bdev, mode);
memset(&args, 0, sizeof(args));
args.ar_quota = GFS2_QUOTA_DEFAULT;
@@ -1342,7 +1342,7 @@ error_super:
deactivate_locked_super(s);
return ERR_PTR(error);
error_bdev:
- close_bdev_exclusive(bdev, mode);
+ blkdev_put(bdev, mode);
return ERR_PTR(error);
}
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 1501db4f0e6d..d8b26ac2e20b 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -18,8 +18,6 @@
#include <linux/gfs2_ondisk.h>
#include <linux/crc32.h>
#include <linux/fiemap.h>
-#include <linux/swap.h>
-#include <linux/falloc.h>
#include <asm/uaccess.h>
#include "gfs2.h"
@@ -106,8 +104,6 @@ static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry,
{
struct inode *inode = NULL;
- d_set_d_op(dentry, &gfs2_dops);
-
inode = gfs2_lookupi(dir, &dentry->d_name, 0);
if (inode && IS_ERR(inode))
return ERR_CAST(inode);
@@ -1259,257 +1255,6 @@ static int gfs2_removexattr(struct dentry *dentry, const char *name)
return ret;
}
-static void empty_write_end(struct page *page, unsigned from,
- unsigned to)
-{
- struct gfs2_inode *ip = GFS2_I(page->mapping->host);
-
- page_zero_new_buffers(page, from, to);
- flush_dcache_page(page);
- mark_page_accessed(page);
-
- if (!gfs2_is_writeback(ip))
- gfs2_page_add_databufs(ip, page, from, to);
-
- block_commit_write(page, from, to);
-}
-
-
-static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
-{
- unsigned start, end, next;
- struct buffer_head *bh, *head;
- int error;
-
- if (!page_has_buffers(page)) {
- error = __block_write_begin(page, from, to - from, gfs2_block_map);
- if (unlikely(error))
- return error;
-
- empty_write_end(page, from, to);
- return 0;
- }
-
- bh = head = page_buffers(page);
- next = end = 0;
- while (next < from) {
- next += bh->b_size;
- bh = bh->b_this_page;
- }
- start = next;
- do {
- next += bh->b_size;
- if (buffer_mapped(bh)) {
- if (end) {
- error = __block_write_begin(page, start, end - start,
- gfs2_block_map);
- if (unlikely(error))
- return error;
- empty_write_end(page, start, end);
- end = 0;
- }
- start = next;
- }
- else
- end = next;
- bh = bh->b_this_page;
- } while (next < to);
-
- if (end) {
- error = __block_write_begin(page, start, end - start, gfs2_block_map);
- if (unlikely(error))
- return error;
- empty_write_end(page, start, end);
- }
-
- return 0;
-}
-
-static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
- int mode)
-{
- struct gfs2_inode *ip = GFS2_I(inode);
- struct buffer_head *dibh;
- int error;
- u64 start = offset >> PAGE_CACHE_SHIFT;
- unsigned int start_offset = offset & ~PAGE_CACHE_MASK;
- u64 end = (offset + len - 1) >> PAGE_CACHE_SHIFT;
- pgoff_t curr;
- struct page *page;
- unsigned int end_offset = (offset + len) & ~PAGE_CACHE_MASK;
- unsigned int from, to;
-
- if (!end_offset)
- end_offset = PAGE_CACHE_SIZE;
-
- error = gfs2_meta_inode_buffer(ip, &dibh);
- if (unlikely(error))
- goto out;
-
- gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-
- if (gfs2_is_stuffed(ip)) {
- error = gfs2_unstuff_dinode(ip, NULL);
- if (unlikely(error))
- goto out;
- }
-
- curr = start;
- offset = start << PAGE_CACHE_SHIFT;
- from = start_offset;
- to = PAGE_CACHE_SIZE;
- while (curr <= end) {
- page = grab_cache_page_write_begin(inode->i_mapping, curr,
- AOP_FLAG_NOFS);
- if (unlikely(!page)) {
- error = -ENOMEM;
- goto out;
- }
-
- if (curr == end)
- to = end_offset;
- error = write_empty_blocks(page, from, to);
- if (!error && offset + to > inode->i_size &&
- !(mode & FALLOC_FL_KEEP_SIZE)) {
- i_size_write(inode, offset + to);
- }
- unlock_page(page);
- page_cache_release(page);
- if (error)
- goto out;
- curr++;
- offset += PAGE_CACHE_SIZE;
- from = 0;
- }
-
- gfs2_dinode_out(ip, dibh->b_data);
- mark_inode_dirty(inode);
-
- brelse(dibh);
-
-out:
- return error;
-}
-
-static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len,
- unsigned int *data_blocks, unsigned int *ind_blocks)
-{
- const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- unsigned int max_blocks = ip->i_alloc->al_rgd->rd_free_clone;
- unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1);
-
- for (tmp = max_data; tmp > sdp->sd_diptrs;) {
- tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs);
- max_data -= tmp;
- }
- /* This calculation isn't the exact reverse of gfs2_write_calc_reserve,
- so it might end up with fewer data blocks */
- if (max_data <= *data_blocks)
- return;
- *data_blocks = max_data;
- *ind_blocks = max_blocks - max_data;
- *len = ((loff_t)max_data - 3) << sdp->sd_sb.sb_bsize_shift;
- if (*len > max) {
- *len = max;
- gfs2_write_calc_reserv(ip, max, data_blocks, ind_blocks);
- }
-}
-
-static long gfs2_fallocate(struct inode *inode, int mode, loff_t offset,
- loff_t len)
-{
- struct gfs2_sbd *sdp = GFS2_SB(inode);
- struct gfs2_inode *ip = GFS2_I(inode);
- unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
- loff_t bytes, max_bytes;
- struct gfs2_alloc *al;
- int error;
- loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift;
- next = (next + 1) << sdp->sd_sb.sb_bsize_shift;
-
- offset = (offset >> sdp->sd_sb.sb_bsize_shift) <<
- sdp->sd_sb.sb_bsize_shift;
-
- len = next - offset;
- bytes = sdp->sd_max_rg_data * sdp->sd_sb.sb_bsize / 2;
- if (!bytes)
- bytes = UINT_MAX;
-
- gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
- error = gfs2_glock_nq(&ip->i_gh);
- if (unlikely(error))
- goto out_uninit;
-
- if (!gfs2_write_alloc_required(ip, offset, len))
- goto out_unlock;
-
- while (len > 0) {
- if (len < bytes)
- bytes = len;
- al = gfs2_alloc_get(ip);
- if (!al) {
- error = -ENOMEM;
- goto out_unlock;
- }
-
- error = gfs2_quota_lock_check(ip);
- if (error)
- goto out_alloc_put;
-
-retry:
- gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks);
-
- al->al_requested = data_blocks + ind_blocks;
- error = gfs2_inplace_reserve(ip);
- if (error) {
- if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) {
- bytes >>= 1;
- goto retry;
- }
- goto out_qunlock;
- }
- max_bytes = bytes;
- calc_max_reserv(ip, len, &max_bytes, &data_blocks, &ind_blocks);
- al->al_requested = data_blocks + ind_blocks;
-
- rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA +
- RES_RG_HDR + gfs2_rg_blocks(al);
- if (gfs2_is_jdata(ip))
- rblocks += data_blocks ? data_blocks : 1;
-
- error = gfs2_trans_begin(sdp, rblocks,
- PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
- if (error)
- goto out_trans_fail;
-
- error = fallocate_chunk(inode, offset, max_bytes, mode);
- gfs2_trans_end(sdp);
-
- if (error)
- goto out_trans_fail;
-
- len -= max_bytes;
- offset += max_bytes;
- gfs2_inplace_release(ip);
- gfs2_quota_unlock(ip);
- gfs2_alloc_put(ip);
- }
- goto out_unlock;
-
-out_trans_fail:
- gfs2_inplace_release(ip);
-out_qunlock:
- gfs2_quota_unlock(ip);
-out_alloc_put:
- gfs2_alloc_put(ip);
-out_unlock:
- gfs2_glock_dq(&ip->i_gh);
-out_uninit:
- gfs2_holder_uninit(&ip->i_gh);
- return error;
-}
-
-
static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len)
{
@@ -1560,7 +1305,6 @@ const struct inode_operations gfs2_file_iops = {
.getxattr = gfs2_getxattr,
.listxattr = gfs2_listxattr,
.removexattr = gfs2_removexattr,
- .fallocate = gfs2_fallocate,
.fiemap = gfs2_fiemap,
};
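/*
 * gfs2_fallocate() is not simply deleted above: in the same release the
 * ->fallocate hook moves from struct inode_operations to struct
 * file_operations (taking a struct file * rather than an inode), and the
 * gfs2 implementation is rehomed in fs/gfs2/file.c. A hedged sketch of
 * the new hook shape, with hypothetical "myfs" names:
 */
static long myfs_fallocate(struct file *file, int mode, loff_t offset,
			   loff_t len)
{
	/* preallocate [offset, offset + len) on file->f_path.dentry's inode */
	return 0;
}

static const struct file_operations myfs_file_ops = {
	.fallocate	= myfs_fallocate,
	/* .read, .write, .llseek, ... */
};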
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 16c2ecac7eb7..ec73ed70bae1 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1336,6 +1336,7 @@ static void gfs2_evict_inode(struct inode *inode)
if (error)
goto out_truncate;
+ ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
gfs2_glock_dq_wait(&ip->i_iopen_gh);
gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh);
error = gfs2_glock_nq(&ip->i_iopen_gh);
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index ea4aefe7c652..afa66aaa2237 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -25,8 +25,6 @@ static struct dentry *hfs_lookup(struct inode *dir, struct dentry *dentry,
struct inode *inode = NULL;
int res;
- d_set_d_op(dentry, &hfs_dentry_operations);
-
hfs_find_init(HFS_SB(dir->i_sb)->cat_tree, &fd);
hfs_cat_build_key(dir->i_sb, fd.search_key, dir->i_ino, &dentry->d_name);
res = hfs_brec_read(&fd, &rec, sizeof(rec));
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 0bef62aa4f42..1b55f704fb22 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -429,13 +429,12 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent)
if (!root_inode)
goto bail_no_root;
+ sb->s_d_op = &hfs_dentry_operations;
res = -ENOMEM;
sb->s_root = d_alloc_root(root_inode);
if (!sb->s_root)
goto bail_iput;
- d_set_d_op(sb->s_root, &hfs_dentry_operations);
-
/* everything's okay */
return 0;
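/*
 * The pattern recurring through this patch: rather than calling
 * d_set_d_op() on each dentry in ->lookup() and once more on s_root, a
 * filesystem now sets sb->s_d_op in its fill_super(), and d_alloc()
 * (and, per the hunks here, d_alloc_root()) installs those ops on every
 * new dentry automatically. Minimal sketch with hypothetical names:
 */
static int myfs_fill_super(struct super_block *sb, void *data, int silent)
{
	sb->s_op = &myfs_super_ops;
	sb->s_d_op = &myfs_dentry_ops;	/* picked up by d_alloc() from here on */

	/* ...read the root inode, then d_alloc_root(): no d_set_d_op() needed */
	return 0;
}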
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index f896dc843026..4df5059c25da 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -37,7 +37,6 @@ static struct dentry *hfsplus_lookup(struct inode *dir, struct dentry *dentry,
sb = dir->i_sb;
- d_set_d_op(dentry, &hfsplus_dentry_operations);
dentry->d_fsdata = NULL;
hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, &dentry->d_name);
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 6ee6ad20acf2..9a3b4795f43c 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -444,13 +444,13 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
err = PTR_ERR(root);
goto cleanup;
}
+ sb->s_d_op = &hfsplus_dentry_operations;
sb->s_root = d_alloc_root(root);
if (!sb->s_root) {
iput(root);
err = -ENOMEM;
goto cleanup;
}
- d_set_d_op(sb->s_root, &hfsplus_dentry_operations);
str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1;
str.name = HFSP_HIDDENDIR_NAME;
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index d3244d949a4e..2638c834ed28 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -612,7 +612,6 @@ struct dentry *hostfs_lookup(struct inode *ino, struct dentry *dentry,
goto out_put;
d_add(dentry, inode);
- d_set_d_op(dentry, &hostfs_dentry_ops);
return NULL;
out_put:
@@ -922,6 +921,7 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent)
sb->s_blocksize_bits = 10;
sb->s_magic = HOSTFS_SUPER_MAGIC;
sb->s_op = &hostfs_sbops;
+ sb->s_d_op = &hostfs_dentry_ops;
sb->s_maxbytes = MAX_LFS_FILESIZE;
/* NULL is printed as <NULL> by sprintf: avoid that. */
diff --git a/fs/hpfs/dentry.c b/fs/hpfs/dentry.c
index 32c13a94e1e9..05d4816e4e77 100644
--- a/fs/hpfs/dentry.c
+++ b/fs/hpfs/dentry.c
@@ -58,12 +58,7 @@ static int hpfs_compare_dentry(const struct dentry *parent,
return 0;
}
-static const struct dentry_operations hpfs_dentry_operations = {
+const struct dentry_operations hpfs_dentry_operations = {
.d_hash = hpfs_hash_dentry,
.d_compare = hpfs_compare_dentry,
};
-
-void hpfs_set_dentry_operations(struct dentry *dentry)
-{
- d_set_d_op(dentry, &hpfs_dentry_operations);
-}
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index 2338130cceba..d32f63a569f7 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -298,7 +298,6 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name
end:
end_add:
- hpfs_set_dentry_operations(dentry);
unlock_kernel();
d_add(dentry, result);
return NULL;
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index 2fee17d0d9ab..1c43dbea55e8 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -233,7 +233,7 @@ void hpfs_mark_4buffers_dirty(struct quad_buffer_head *);
/* dentry.c */
-void hpfs_set_dentry_operations(struct dentry *);
+extern const struct dentry_operations hpfs_dentry_operations;
/* dir.c */
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index 56f0da1cfd10..1ae35baa539e 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -281,7 +281,7 @@ int hpfs_setattr(struct dentry *dentry, struct iattr *attr)
attr->ia_size != i_size_read(inode)) {
error = vmtruncate(inode, attr->ia_size);
if (error)
- return error;
+ goto out_unlock;
}
setattr_copy(inode, attr);
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 49935ba78db8..b30426b1fc97 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -550,6 +550,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
/* Fill superblock stuff */
s->s_magic = HPFS_SUPER_MAGIC;
s->s_op = &hpfs_sops;
+ s->s_d_op = &hpfs_dentry_operations;
sbi->sb_root = superblock->root;
sbi->sb_fs_size = superblock->n_sectors;
@@ -651,7 +652,6 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
iput(root);
goto bail0;
}
- hpfs_set_dentry_operations(s->s_root);
/*
* find the root directory's . pointer & finish filling in the inode
diff --git a/fs/internal.h b/fs/internal.h
index 9687c2ee2735..0663568b1247 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -70,6 +70,10 @@ extern void mnt_set_mountpoint(struct vfsmount *, struct dentry *,
extern void release_mounts(struct list_head *);
extern void umount_tree(struct vfsmount *, int, struct list_head *);
extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int);
+extern int finish_automount(struct vfsmount *, struct path *);
+
+extern void mnt_make_longterm(struct vfsmount *);
+extern void mnt_make_shortterm(struct vfsmount *);
extern void __init mnt_init(void);
diff --git a/fs/ioctl.c b/fs/ioctl.c
index d6cc16476620..a59635e295fa 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -86,7 +86,7 @@ int fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, u64 logical,
u64 phys, u64 len, u32 flags)
{
struct fiemap_extent extent;
- struct fiemap_extent *dest = fieinfo->fi_extents_start;
+ struct fiemap_extent __user *dest = fieinfo->fi_extents_start;
/* only count the extents */
if (fieinfo->fi_extents_max == 0) {
@@ -173,6 +173,7 @@ static int fiemap_check_ranges(struct super_block *sb,
static int ioctl_fiemap(struct file *filp, unsigned long arg)
{
struct fiemap fiemap;
+ struct fiemap __user *ufiemap = (struct fiemap __user *) arg;
struct fiemap_extent_info fieinfo = { 0, };
struct inode *inode = filp->f_path.dentry->d_inode;
struct super_block *sb = inode->i_sb;
@@ -182,8 +183,7 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
if (!inode->i_op->fiemap)
return -EOPNOTSUPP;
- if (copy_from_user(&fiemap, (struct fiemap __user *)arg,
- sizeof(struct fiemap)))
+ if (copy_from_user(&fiemap, ufiemap, sizeof(fiemap)))
return -EFAULT;
if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS)
@@ -196,7 +196,7 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
fieinfo.fi_flags = fiemap.fm_flags;
fieinfo.fi_extents_max = fiemap.fm_extent_count;
- fieinfo.fi_extents_start = (struct fiemap_extent *)(arg + sizeof(fiemap));
+ fieinfo.fi_extents_start = ufiemap->fm_extents;
if (fiemap.fm_extent_count != 0 &&
!access_ok(VERIFY_WRITE, fieinfo.fi_extents_start,
@@ -209,7 +209,7 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
error = inode->i_op->fiemap(inode, &fieinfo, fiemap.fm_start, len);
fiemap.fm_flags = fieinfo.fi_flags;
fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped;
- if (copy_to_user((char *)arg, &fiemap, sizeof(fiemap)))
+ if (copy_to_user(ufiemap, &fiemap, sizeof(fiemap)))
error = -EFAULT;
return error;
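/*
 * The fs/ioctl.c hunks are sparse-annotation fixes: a pointer into
 * userspace memory carries __user, so static checkers reject direct
 * dereferences and only the uaccess helpers (copy_{from,to}_user(),
 * access_ok(), ...) may touch it. The pattern, mirroring the fiemap
 * code above (wrapper name hypothetical):
 */
static int fetch_and_store(unsigned long arg)
{
	struct fiemap __user *ufiemap = (struct fiemap __user *)arg;
	struct fiemap fiemap;

	if (copy_from_user(&fiemap, ufiemap, sizeof(fiemap)))
		return -EFAULT;
	/* work on the kernel copy; never dereference ufiemap directly */
	if (copy_to_user(ufiemap, &fiemap, sizeof(fiemap)))
		return -EFAULT;
	return 0;
}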
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 844a7903c72f..a0f3833c0dbf 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -939,17 +939,18 @@ root_found:
goto out_iput;
}
- /* get the root dentry */
- s->s_root = d_alloc_root(inode);
- if (!(s->s_root))
- goto out_no_root;
-
table = 0;
if (joliet_level)
table += 2;
if (opt.check == 'r')
table++;
- d_set_d_op(s->s_root, &isofs_dentry_ops[table]);
+
+ s->s_d_op = &isofs_dentry_ops[table];
+
+ /* get the root dentry */
+ s->s_root = d_alloc_root(inode);
+ if (!(s->s_root))
+ goto out_no_root;
kfree(opt.iocharset);
diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c
index 679a849c3b27..4fb3e8074fd4 100644
--- a/fs/isofs/namei.c
+++ b/fs/isofs/namei.c
@@ -172,8 +172,6 @@ struct dentry *isofs_lookup(struct inode *dir, struct dentry *dentry, struct nam
struct inode *inode;
struct page *page;
- d_set_d_op(dentry, dir->i_sb->s_root->d_op);
-
page = alloc_page(GFP_USER);
if (!page)
return ERR_PTR(-ENOMEM);
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 846a3f314111..5b2e4c30a2a1 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -207,7 +207,7 @@ repeat_locked:
* the committing transaction. Really, we only need to give it
* committing_transaction->t_outstanding_credits plus "enough" for
* the log control blocks.
- * Also, this test is inconsitent with the matching one in
+ * Also, this test is inconsistent with the matching one in
* journal_extend().
*/
if (__log_space_left(journal) < jbd_space_needed(journal)) {
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 394893242ae3..faad2bd787c7 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -251,7 +251,7 @@ repeat:
* the committing transaction. Really, we only need to give it
* committing_transaction->t_outstanding_credits plus "enough" for
* the log control blocks.
- * Also, this test is inconsitent with the matching one in
+ * Also, this test is inconsistent with the matching one in
* jbd2_journal_extend().
*/
if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) {
diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c
index 85c6be2db02f..3005ec4520ad 100644
--- a/fs/jffs2/build.c
+++ b/fs/jffs2/build.c
@@ -336,14 +336,13 @@ int jffs2_do_mount_fs(struct jffs2_sb_info *c)
size = sizeof(struct jffs2_eraseblock) * c->nr_blocks;
#ifndef __ECOS
if (jffs2_blocks_use_vmalloc(c))
- c->blocks = vmalloc(size);
+ c->blocks = vzalloc(size);
else
#endif
- c->blocks = kmalloc(size, GFP_KERNEL);
+ c->blocks = kzalloc(size, GFP_KERNEL);
if (!c->blocks)
return -ENOMEM;
- memset(c->blocks, 0, size);
for (i=0; i<c->nr_blocks; i++) {
INIT_LIST_HEAD(&c->blocks[i].list);
c->blocks[i].offset = i * c->sector_size;
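/*
 * vzalloc() is vmalloc() plus zeroing, the counterpart of kzalloc() for
 * kmalloc(): with either allocator returning memory already cleared, the
 * follow-up memset() becomes redundant and is dropped in the hunk above.
 * Usage shape (hypothetical helper):
 */
static void *alloc_blocks_array(size_t size, bool use_vmalloc)
{
	/* both paths return NULL on failure, zero-filled memory on success */
	if (use_vmalloc)
		return vzalloc(size);
	return kzalloc(size, GFP_KERNEL);
}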
diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h
index f864005de64c..0bc6a6c80a56 100644
--- a/fs/jffs2/jffs2_fs_sb.h
+++ b/fs/jffs2/jffs2_fs_sb.h
@@ -144,4 +144,4 @@ struct jffs2_sb_info {
void *os_priv;
};
-#endif /* _JFFS2_FB_SB */
+#endif /* _JFFS2_FS_SB */
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index 9b572ca40a49..4f9cc0482949 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -151,7 +151,7 @@ static int do_verify_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_dat
JFFS2_ERROR("node CRC failed at %#08x, read=%#08x, calc=%#08x\n",
offset, je32_to_cpu(rx.hdr_crc), crc);
xd->flags |= JFFS2_XFLAGS_INVALID;
- return EIO;
+ return -EIO;
}
totlen = PAD(sizeof(rx) + rx.name_len + 1 + je16_to_cpu(rx.value_len));
if (je16_to_cpu(rx.magic) != JFFS2_MAGIC_BITMASK
@@ -167,7 +167,7 @@ static int do_verify_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_dat
je32_to_cpu(rx.xid), xd->xid,
je32_to_cpu(rx.version), xd->version);
xd->flags |= JFFS2_XFLAGS_INVALID;
- return EIO;
+ return -EIO;
}
xd->xprefix = rx.xprefix;
xd->name_len = rx.name_len;
@@ -230,7 +230,7 @@ static int do_load_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum
ref_offset(xd->node), xd->data_crc, crc);
kfree(data);
xd->flags |= JFFS2_XFLAGS_INVALID;
- return EIO;
+ return -EIO;
}
xd->flags |= JFFS2_XFLAGS_HOT;
@@ -268,7 +268,7 @@ static int load_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *x
if (xd->xname)
return 0;
if (xd->flags & JFFS2_XFLAGS_INVALID)
- return EIO;
+ return -EIO;
if (unlikely(is_xattr_datum_unchecked(c, xd)))
rc = do_verify_xattr_datum(c, xd);
if (!rc)
@@ -460,7 +460,7 @@ static int verify_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref
if (crc != je32_to_cpu(rr.node_crc)) {
JFFS2_ERROR("node CRC failed at %#08x, read=%#08x, calc=%#08x\n",
offset, je32_to_cpu(rr.node_crc), crc);
- return EIO;
+ return -EIO;
}
if (je16_to_cpu(rr.magic) != JFFS2_MAGIC_BITMASK
|| je16_to_cpu(rr.nodetype) != JFFS2_NODETYPE_XREF
@@ -470,7 +470,7 @@ static int verify_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref
offset, je16_to_cpu(rr.magic), JFFS2_MAGIC_BITMASK,
je16_to_cpu(rr.nodetype), JFFS2_NODETYPE_XREF,
je32_to_cpu(rr.totlen), PAD(sizeof(rr)));
- return EIO;
+ return -EIO;
}
ref->ino = je32_to_cpu(rr.ino);
ref->xid = je32_to_cpu(rr.xid);
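/*
 * The EIO -> -EIO conversions follow the kernel convention that errors
 * are negative errno values; a positive EIO slips past `if (rc < 0)`
 * checks in callers and can be mistaken for success. The pattern, with a
 * hypothetical CRC check standing in for the jffs2 helpers:
 */
static int verify_node(u32 stored_crc, u32 calc_crc)
{
	if (stored_crc != calc_crc)
		return -EIO;	/* negative: unambiguously an error */
	return 0;		/* zero: success */
}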
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index e1b8493b9aaa..278e3fb40b71 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -1120,16 +1120,13 @@ int lmLogOpen(struct super_block *sb)
* file systems to log may have n-to-1 relationship;
*/
- bdev = open_by_devnum(sbi->logdev, FMODE_READ|FMODE_WRITE);
+ bdev = blkdev_get_by_dev(sbi->logdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
+ log);
if (IS_ERR(bdev)) {
rc = -PTR_ERR(bdev);
goto free;
}
- if ((rc = bd_claim(bdev, log))) {
- goto close;
- }
-
log->bdev = bdev;
memcpy(log->uuid, sbi->loguuid, sizeof(log->uuid));
@@ -1137,7 +1134,7 @@ int lmLogOpen(struct super_block *sb)
* initialize log:
*/
if ((rc = lmLogInit(log)))
- goto unclaim;
+ goto close;
list_add(&log->journal_list, &jfs_external_logs);
@@ -1163,11 +1160,8 @@ journal_found:
list_del(&log->journal_list);
lbmLogShutdown(log);
- unclaim:
- bd_release(bdev);
-
close: /* close external log device */
- blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
+ blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
free: /* free log descriptor */
mutex_unlock(&jfs_log_mutex);
@@ -1512,8 +1506,7 @@ int lmLogClose(struct super_block *sb)
bdev = log->bdev;
rc = lmLogShutdown(log);
- bd_release(bdev);
- blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
+ blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
kfree(log);
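/*
 * The jfs log hunks apply the same block-device API migration by device
 * number: open_by_devnum() + bd_claim() collapse into a single
 * blkdev_get_by_dev() call where FMODE_EXCL plus the holder argument
 * performs the claim, so the unwind path loses its separate bd_release()
 * step. Sketch (hypothetical wrapper):
 */
static struct block_device *open_log_bdev(dev_t devt, void *holder)
{
	const fmode_t mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL;

	/* open + exclusive claim in one call; ERR_PTR() on failure.
	 * Teardown is a single blkdev_put(bdev, mode) - no bd_release(). */
	return blkdev_get_by_dev(devt, mode, holder);
}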
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 4414e3a42264..81ead850ddb6 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -1465,9 +1465,6 @@ static struct dentry *jfs_lookup(struct inode *dip, struct dentry *dentry, struc
jfs_info("jfs_lookup: name = %s", name);
- if (JFS_SBI(dip->i_sb)->mntflag & JFS_OS2)
- d_set_d_op(dentry, &jfs_ci_dentry_operations);
-
if ((name[0] == '.') && (len == 1))
inum = dip->i_ino;
else if (strcmp(name, "..") == 0)
@@ -1492,12 +1489,7 @@ static struct dentry *jfs_lookup(struct inode *dip, struct dentry *dentry, struc
return ERR_CAST(ip);
}
- dentry = d_splice_alias(ip, dentry);
-
- if (dentry && (JFS_SBI(dip->i_sb)->mntflag & JFS_OS2))
- d_set_d_op(dentry, &jfs_ci_dentry_operations);
-
- return dentry;
+ return d_splice_alias(ip, dentry);
}
static struct inode *jfs_nfs_get_inode(struct super_block *sb,
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 3150d766e0d4..eeca48a031ab 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -515,6 +515,9 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_magic = JFS_SUPER_MAGIC;
+ if (sbi->mntflag & JFS_OS2)
+ sb->s_d_op = &jfs_ci_dentry_operations;
+
inode = jfs_iget(sb, ROOT_I);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
@@ -524,9 +527,6 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
if (!sb->s_root)
goto out_no_root;
- if (sbi->mntflag & JFS_OS2)
- d_set_d_op(sb->s_root, &jfs_ci_dentry_operations);
-
/* logical blocks are represented by 40 bits in pxd_t, etc. */
sb->s_maxbytes = ((u64) sb->s_blocksize) << 40;
#if BITS_PER_LONG == 32
diff --git a/fs/libfs.c b/fs/libfs.c
index 889311e3d06b..c88eab55aec9 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -217,7 +217,8 @@ static const struct super_operations simple_super_operations = {
* will never be mountable)
*/
struct dentry *mount_pseudo(struct file_system_type *fs_type, char *name,
- const struct super_operations *ops, unsigned long magic)
+ const struct super_operations *ops,
+ const struct dentry_operations *dops, unsigned long magic)
{
struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL);
struct dentry *dentry;
@@ -254,6 +255,7 @@ struct dentry *mount_pseudo(struct file_system_type *fs_type, char *name,
dentry->d_parent = dentry;
d_instantiate(dentry, root);
s->s_root = dentry;
+ s->s_d_op = dops;
s->s_flags |= MS_ACTIVE;
return dget(s->s_root);
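/*
 * mount_pseudo() grows a dentry_operations parameter so that pseudo
 * filesystems can install default d_ops through s_d_op at mount time;
 * callers wanting the old behaviour pass NULL. A hypothetical caller
 * supplying its own ops would now look like:
 */
static struct dentry *myfs_mount(struct file_system_type *fs_type, int flags,
				 const char *dev_name, void *data)
{
	return mount_pseudo(fs_type, "myfs:", &myfs_super_ops,
			    &myfs_dentry_ops, MYFS_SUPER_MAGIC);
}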
diff --git a/fs/locks.c b/fs/locks.c
index 08415b2a6d36..0f3998291f78 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -444,15 +444,9 @@ static void lease_release_private_callback(struct file_lock *fl)
fl->fl_file->f_owner.signum = 0;
}
-static int lease_mylease_callback(struct file_lock *fl, struct file_lock *try)
-{
- return fl->fl_file == try->fl_file;
-}
-
static const struct lock_manager_operations lease_manager_ops = {
.fl_break = lease_break_callback,
.fl_release_private = lease_release_private_callback,
- .fl_mylease = lease_mylease_callback,
.fl_change = lease_modify,
};
@@ -1405,7 +1399,7 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
for (before = &inode->i_flock;
((fl = *before) != NULL) && IS_LEASE(fl);
before = &fl->fl_next) {
- if (lease->fl_lmops->fl_mylease(fl, lease))
+ if (fl->fl_file == filp)
my_before = before;
else if (fl->fl_type == (F_INPROGRESS | F_UNLCK))
/*
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index 92ca6fbe09bd..723bc5bca09a 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -300,7 +300,7 @@ static int bdev_write_sb(struct super_block *sb, struct page *page)
static void bdev_put_device(struct logfs_super *s)
{
- close_bdev_exclusive(s->s_bdev, FMODE_READ|FMODE_WRITE);
+ blkdev_put(s->s_bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
}
static int bdev_can_write_buf(struct super_block *sb, u64 ofs)
@@ -325,13 +325,14 @@ int logfs_get_sb_bdev(struct logfs_super *p, struct file_system_type *type,
{
struct block_device *bdev;
- bdev = open_bdev_exclusive(devname, FMODE_READ|FMODE_WRITE, type);
+ bdev = blkdev_get_by_path(devname, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
+ type);
if (IS_ERR(bdev))
return PTR_ERR(bdev);
if (MAJOR(bdev->bd_dev) == MTD_BLOCK_MAJOR) {
int mtdnr = MINOR(bdev->bd_dev);
- close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE);
+ blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
return logfs_get_sb_mtd(p, mtdnr);
}
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index 1b9e07728a9f..ce7337ddfdbf 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -23,8 +23,6 @@ static struct dentry *minix_lookup(struct inode * dir, struct dentry *dentry, st
struct inode * inode = NULL;
ino_t ino;
- d_set_d_op(dentry, dir->i_sb->s_root->d_op);
-
if (dentry->d_name.len > minix_sb(dir->i_sb)->s_namelen)
return ERR_PTR(-ENAMETOOLONG);
diff --git a/fs/mpage.c b/fs/mpage.c
index fd56ca2ea556..d78455a81ec9 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -40,7 +40,7 @@
* status of that page is hard. See end_buffer_async_read() for the details.
* There is no point in duplicating all that complexity.
*/
-static void mpage_end_io_read(struct bio *bio, int err)
+static void mpage_end_io(struct bio *bio, int err)
{
const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
@@ -50,44 +50,29 @@ static void mpage_end_io_read(struct bio *bio, int err)
if (--bvec >= bio->bi_io_vec)
prefetchw(&bvec->bv_page->flags);
-
- if (uptodate) {
- SetPageUptodate(page);
- } else {
- ClearPageUptodate(page);
- SetPageError(page);
- }
- unlock_page(page);
- } while (bvec >= bio->bi_io_vec);
- bio_put(bio);
-}
-
-static void mpage_end_io_write(struct bio *bio, int err)
-{
- const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
- struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
-
- do {
- struct page *page = bvec->bv_page;
-
- if (--bvec >= bio->bi_io_vec)
- prefetchw(&bvec->bv_page->flags);
-
- if (!uptodate){
- SetPageError(page);
- if (page->mapping)
- set_bit(AS_EIO, &page->mapping->flags);
+ if (bio_data_dir(bio) == READ) {
+ if (uptodate) {
+ SetPageUptodate(page);
+ } else {
+ ClearPageUptodate(page);
+ SetPageError(page);
+ }
+ unlock_page(page);
+ } else { /* bio_data_dir(bio) == WRITE */
+ if (!uptodate) {
+ SetPageError(page);
+ if (page->mapping)
+ set_bit(AS_EIO, &page->mapping->flags);
+ }
+ end_page_writeback(page);
}
- end_page_writeback(page);
} while (bvec >= bio->bi_io_vec);
bio_put(bio);
}
static struct bio *mpage_bio_submit(int rw, struct bio *bio)
{
- bio->bi_end_io = mpage_end_io_read;
- if (rw == WRITE)
- bio->bi_end_io = mpage_end_io_write;
+ bio->bi_end_io = mpage_end_io;
submit_bio(rw, bio);
return NULL;
}
diff --git a/fs/namei.c b/fs/namei.c
index 24ece10470b6..7d77f24d32a9 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -368,18 +368,6 @@ void path_get(struct path *path)
EXPORT_SYMBOL(path_get);
/**
- * path_get_long - get a long reference to a path
- * @path: path to get the reference to
- *
- * Given a path increment the reference count to the dentry and the vfsmount.
- */
-void path_get_long(struct path *path)
-{
- mntget_long(path->mnt);
- dget(path->dentry);
-}
-
-/**
* path_put - put a reference to a path
* @path: path to put the reference to
*
@@ -393,18 +381,6 @@ void path_put(struct path *path)
EXPORT_SYMBOL(path_put);
/**
- * path_put_long - put a long reference to a path
- * @path: path to put the reference to
- *
- * Given a path decrement the reference count to the dentry and the vfsmount.
- */
-void path_put_long(struct path *path)
-{
- dput(path->dentry);
- mntput_long(path->mnt);
-}
-
-/**
* nameidata_drop_rcu - drop this nameidata out of rcu-walk
* @nd: nameidata pathwalk data to drop
* Returns: 0 on success, -ECHILD on failure
@@ -479,6 +455,14 @@ static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry
struct fs_struct *fs = current->fs;
struct dentry *parent = nd->path.dentry;
+ /*
+ * It can be possible to revalidate the dentry that we started
+ * It is possible to revalidate the dentry that we started
+ * the path walk with. force_reval_path may also revalidate the
+ * dentry already committed to the nameidata.
+ */
+ if (unlikely(parent == dentry))
+ return nameidata_drop_rcu(nd);
+
BUG_ON(!(nd->flags & LOOKUP_RCU));
if (nd->root.mnt) {
spin_lock(&fs->lock);
@@ -583,6 +567,13 @@ void release_open_intent(struct nameidata *nd)
fput(nd->intent.open.file);
}
+/*
+ * Call d_revalidate and handle filesystems that request rcu-walk
+ * to be dropped. This may be called and return in rcu-walk mode,
+ * regardless of success or error. If -ECHILD is returned, the caller
+ * must return -ECHILD back up the path walk stack so path walk may
+ * be restarted in ref-walk mode.
+ */
static int d_revalidate(struct dentry *dentry, struct nameidata *nd)
{
int status;
@@ -673,6 +664,9 @@ force_reval_path(struct path *path, struct nameidata *nd)
return 0;
if (!status) {
+ /* Don't d_invalidate in rcu-walk mode */
+ if (nameidata_drop_rcu(nd))
+ return -ECHILD;
d_invalidate(dentry);
status = -ESTALE;
}
@@ -761,7 +755,8 @@ static void path_put_conditional(struct path *path, struct nameidata *nd)
mntput(path->mnt);
}
-static inline void path_to_nameidata(struct path *path, struct nameidata *nd)
+static inline void path_to_nameidata(const struct path *path,
+ struct nameidata *nd)
{
if (!(nd->flags & LOOKUP_RCU)) {
dput(nd->path.dentry);
@@ -773,20 +768,16 @@ static inline void path_to_nameidata(struct path *path, struct nameidata *nd)
}
static __always_inline int
-__do_follow_link(struct path *path, struct nameidata *nd, void **p)
+__do_follow_link(const struct path *link, struct nameidata *nd, void **p)
{
int error;
- struct dentry *dentry = path->dentry;
+ struct dentry *dentry = link->dentry;
- touch_atime(path->mnt, dentry);
+ touch_atime(link->mnt, dentry);
nd_set_link(nd, NULL);
- if (path->mnt != nd->path.mnt) {
- path_to_nameidata(path, nd);
- nd->inode = nd->path.dentry->d_inode;
- dget(dentry);
- }
- mntget(path->mnt);
+ if (link->mnt == nd->path.mnt)
+ mntget(link->mnt);
nd->last_type = LAST_BIND;
*p = dentry->d_inode->i_op->follow_link(dentry, nd);
@@ -877,54 +868,148 @@ int follow_up(struct path *path)
}
/*
- * serialization is taken care of in namespace.c
+ * Perform an automount
+ * - return -EISDIR to tell follow_managed() to stop and return the path we
+ * were called with.
*/
-static void __follow_mount_rcu(struct nameidata *nd, struct path *path,
- struct inode **inode)
+static int follow_automount(struct path *path, unsigned flags,
+ bool *need_mntput)
{
- while (d_mountpoint(path->dentry)) {
- struct vfsmount *mounted;
- mounted = __lookup_mnt(path->mnt, path->dentry, 1);
- if (!mounted)
- return;
- path->mnt = mounted;
- path->dentry = mounted->mnt_root;
- nd->seq = read_seqcount_begin(&path->dentry->d_seq);
- *inode = path->dentry->d_inode;
+ struct vfsmount *mnt;
+ int err;
+
+ if (!path->dentry->d_op || !path->dentry->d_op->d_automount)
+ return -EREMOTE;
+
+ /* We don't want to mount if someone supplied AT_NO_AUTOMOUNT
+ * and this is the terminal part of the path.
+ */
+ if ((flags & LOOKUP_NO_AUTOMOUNT) && !(flags & LOOKUP_CONTINUE))
+ return -EISDIR; /* we actually want to stop here */
+
+ /* We want to mount if someone is trying to open/create a file of any
+ * type under the mountpoint, wants to traverse through the mountpoint
+ * or wants to open the mounted directory.
+ *
+ * We don't want to mount if someone's just doing a stat and they've
+ * set AT_SYMLINK_NOFOLLOW - unless they're stat'ing a directory and
+ * appended a '/' to the name.
+ */
+ if (!(flags & LOOKUP_FOLLOW) &&
+ !(flags & (LOOKUP_CONTINUE | LOOKUP_DIRECTORY |
+ LOOKUP_OPEN | LOOKUP_CREATE)))
+ return -EISDIR;
+
+ current->total_link_count++;
+ if (current->total_link_count >= 40)
+ return -ELOOP;
+
+ mnt = path->dentry->d_op->d_automount(path);
+ if (IS_ERR(mnt)) {
+ /*
+ * The filesystem is allowed to return -EISDIR here to indicate
+ * it doesn't want to automount. For instance, autofs would do
+ * this so that its userspace daemon can mount on this dentry.
+ *
+ * However, we can only permit this if it's a terminal point in
+ * the path being looked up; if it wasn't then the remainder of
+ * the path is inaccessible and we should say so.
+ */
+ if (PTR_ERR(mnt) == -EISDIR && (flags & LOOKUP_CONTINUE))
+ return -EREMOTE;
+ return PTR_ERR(mnt);
}
-}
-static int __follow_mount(struct path *path)
-{
- int res = 0;
- while (d_mountpoint(path->dentry)) {
- struct vfsmount *mounted = lookup_mnt(path);
- if (!mounted)
- break;
+ if (!mnt) /* mount collision */
+ return 0;
+
+ err = finish_automount(mnt, path);
+
+ switch (err) {
+ case -EBUSY:
+ /* Someone else made a mount here whilst we were busy */
+ return 0;
+ case 0:
dput(path->dentry);
- if (res)
+ if (*need_mntput)
mntput(path->mnt);
- path->mnt = mounted;
- path->dentry = dget(mounted->mnt_root);
- res = 1;
+ path->mnt = mnt;
+ path->dentry = dget(mnt->mnt_root);
+ *need_mntput = true;
+ return 0;
+ default:
+ return err;
}
- return res;
+
}
-static void follow_mount(struct path *path)
+/*
+ * Handle a dentry that is managed in some way.
+ * - Flagged for transit management (autofs)
+ * - Flagged as mountpoint
+ * - Flagged as automount point
+ *
+ * This may only be called in refwalk mode.
+ *
+ * Serialization is taken care of in namespace.c
+ */
+static int follow_managed(struct path *path, unsigned flags)
{
- while (d_mountpoint(path->dentry)) {
- struct vfsmount *mounted = lookup_mnt(path);
- if (!mounted)
- break;
- dput(path->dentry);
- mntput(path->mnt);
- path->mnt = mounted;
- path->dentry = dget(mounted->mnt_root);
+ unsigned managed;
+ bool need_mntput = false;
+ int ret;
+
+ /* Given that we're not holding a lock here, we retain the value in a
+ * local variable for each dentry as we look at it so that we don't see
+ * the components of that value change under us */
+ while (managed = ACCESS_ONCE(path->dentry->d_flags),
+ managed &= DCACHE_MANAGED_DENTRY,
+ unlikely(managed != 0)) {
+ /* Allow the filesystem to manage the transit without i_mutex
+ * being held. */
+ if (managed & DCACHE_MANAGE_TRANSIT) {
+ BUG_ON(!path->dentry->d_op);
+ BUG_ON(!path->dentry->d_op->d_manage);
+ ret = path->dentry->d_op->d_manage(path->dentry,
+ false, false);
+ if (ret < 0)
+ return ret == -EISDIR ? 0 : ret;
+ }
+
+ /* Transit to a mounted filesystem. */
+ if (managed & DCACHE_MOUNTED) {
+ struct vfsmount *mounted = lookup_mnt(path);
+ if (mounted) {
+ dput(path->dentry);
+ if (need_mntput)
+ mntput(path->mnt);
+ path->mnt = mounted;
+ path->dentry = dget(mounted->mnt_root);
+ need_mntput = true;
+ continue;
+ }
+
+ /* Something is mounted on this dentry in another
+ * namespace and/or whatever was mounted there in this
+ * namespace got unmounted before we managed to get the
+ * vfsmount_lock */
+ }
+
+ /* Handle an automount point */
+ if (managed & DCACHE_NEED_AUTOMOUNT) {
+ ret = follow_automount(path, flags, &need_mntput);
+ if (ret < 0)
+ return ret == -EISDIR ? 0 : ret;
+ continue;
+ }
+
+ /* We didn't change the current path point */
+ break;
}
+ return 0;
}
-int follow_down(struct path *path)
+int follow_down_one(struct path *path)
{
struct vfsmount *mounted;
@@ -939,13 +1024,41 @@ int follow_down(struct path *path)
return 0;
}
+/*
+ * Skip to the top of the mountpoint pile in rcu-walk mode. We abort the
+ * rcu-walk if we meet a managed dentry and we're not walking to "..".
+ * Returns true to continue, false to abort.
+ */
+static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
+ struct inode **inode, bool reverse_transit)
+{
+ while (d_mountpoint(path->dentry)) {
+ struct vfsmount *mounted;
+ if (unlikely(path->dentry->d_flags & DCACHE_MANAGE_TRANSIT) &&
+ !reverse_transit &&
+ path->dentry->d_op->d_manage(path->dentry, false, true) < 0)
+ return false;
+ mounted = __lookup_mnt(path->mnt, path->dentry, 1);
+ if (!mounted)
+ break;
+ path->mnt = mounted;
+ path->dentry = mounted->mnt_root;
+ nd->seq = read_seqcount_begin(&path->dentry->d_seq);
+ *inode = path->dentry->d_inode;
+ }
+
+ if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
+ return reverse_transit;
+ return true;
+}
+
static int follow_dotdot_rcu(struct nameidata *nd)
{
struct inode *inode = nd->inode;
set_root_rcu(nd);
- while(1) {
+ while (1) {
if (nd->path.dentry == nd->root.dentry &&
nd->path.mnt == nd->root.mnt) {
break;
@@ -968,12 +1081,80 @@ static int follow_dotdot_rcu(struct nameidata *nd)
nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
inode = nd->path.dentry->d_inode;
}
- __follow_mount_rcu(nd, &nd->path, &inode);
+ __follow_mount_rcu(nd, &nd->path, &inode, true);
nd->inode = inode;
return 0;
}
+/*
+ * Follow down to the covering mount currently visible to userspace. At each
+ * point, the filesystem owning that dentry may be queried as to whether the
+ * caller is permitted to proceed or not.
+ *
+ * Care must be taken as namespace_sem may be held (indicated by mounting_here
+ * being true).
+ */
+int follow_down(struct path *path, bool mounting_here)
+{
+ unsigned managed;
+ int ret;
+
+ while (managed = ACCESS_ONCE(path->dentry->d_flags),
+ unlikely(managed & DCACHE_MANAGED_DENTRY)) {
+ /* Allow the filesystem to manage the transit without i_mutex
+ * being held.
+ *
+ * We indicate to the filesystem if someone is trying to mount
+ * something here. This gives autofs the chance to deny anyone
+ * other than its daemon the right to mount on its
+ * superstructure.
+ *
+ * The filesystem may sleep at this point.
+ */
+ if (managed & DCACHE_MANAGE_TRANSIT) {
+ BUG_ON(!path->dentry->d_op);
+ BUG_ON(!path->dentry->d_op->d_manage);
+ ret = path->dentry->d_op->d_manage(
+ path->dentry, mounting_here, false);
+ if (ret < 0)
+ return ret == -EISDIR ? 0 : ret;
+ }
+
+ /* Transit to a mounted filesystem. */
+ if (managed & DCACHE_MOUNTED) {
+ struct vfsmount *mounted = lookup_mnt(path);
+ if (!mounted)
+ break;
+ dput(path->dentry);
+ mntput(path->mnt);
+ path->mnt = mounted;
+ path->dentry = dget(mounted->mnt_root);
+ continue;
+ }
+
+ /* Don't handle automount points here */
+ break;
+ }
+ return 0;
+}
+
+/*
+ * Skip to top of mountpoint pile in refwalk mode for follow_dotdot()
+ */
+static void follow_mount(struct path *path)
+{
+ while (d_mountpoint(path->dentry)) {
+ struct vfsmount *mounted = lookup_mnt(path);
+ if (!mounted)
+ break;
+ dput(path->dentry);
+ mntput(path->mnt);
+ path->mnt = mounted;
+ path->dentry = dget(mounted->mnt_root);
+ }
+}
+
static void follow_dotdot(struct nameidata *nd)
{
set_root(nd);
@@ -1038,12 +1219,14 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
struct vfsmount *mnt = nd->path.mnt;
struct dentry *dentry, *parent = nd->path.dentry;
struct inode *dir;
+ int err;
+
/*
* See if the low-level filesystem might want
* to use its own hash..
*/
if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
- int err = parent->d_op->d_hash(parent, nd->inode, name);
+ err = parent->d_op->d_hash(parent, nd->inode, name);
if (err < 0)
return err;
}
@@ -1070,22 +1253,30 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
nd->seq = seq;
if (dentry->d_flags & DCACHE_OP_REVALIDATE)
goto need_revalidate;
+done2:
path->mnt = mnt;
path->dentry = dentry;
- __follow_mount_rcu(nd, path, inode);
- } else {
- dentry = __d_lookup(parent, name);
- if (!dentry)
- goto need_lookup;
+ if (likely(__follow_mount_rcu(nd, path, inode, false)))
+ return 0;
+ if (nameidata_drop_rcu(nd))
+ return -ECHILD;
+ /* fallthru */
+ }
+ dentry = __d_lookup(parent, name);
+ if (!dentry)
+ goto need_lookup;
found:
- if (dentry->d_flags & DCACHE_OP_REVALIDATE)
- goto need_revalidate;
+ if (dentry->d_flags & DCACHE_OP_REVALIDATE)
+ goto need_revalidate;
done:
- path->mnt = mnt;
- path->dentry = dentry;
- __follow_mount(path);
- *inode = path->dentry->d_inode;
- }
+ path->mnt = mnt;
+ path->dentry = dentry;
+ err = follow_managed(path, nd->flags);
+ if (unlikely(err < 0)) {
+ path_put_conditional(path, nd);
+ return err;
+ }
+ *inode = path->dentry->d_inode;
return 0;
need_lookup:
@@ -1124,6 +1315,8 @@ need_revalidate:
goto need_lookup;
if (IS_ERR(dentry))
goto fail;
+ if (nd->flags & LOOKUP_RCU)
+ goto done2;
goto done;
fail:
@@ -1131,17 +1324,6 @@ fail:
}
/*
- * This is a temporary kludge to deal with "automount" symlinks; proper
- * solution is to trigger them on follow_mount(), so that do_lookup()
- * would DTRT. To be killed before 2.6.34-final.
- */
-static inline int follow_on_final(struct inode *inode, unsigned lookup_flags)
-{
- return inode && unlikely(inode->i_op->follow_link) &&
- ((lookup_flags & LOOKUP_FOLLOW) || S_ISDIR(inode->i_mode));
-}
-
-/*
* Name resolution.
* This is the basic name resolution function, turning a pathname into
* the final dentry. We expect 'base' to be positive and a directory.
@@ -1279,7 +1461,8 @@ last_component:
err = do_lookup(nd, &this, &next, &inode);
if (err)
break;
- if (follow_on_final(inode, lookup_flags)) {
+ if (inode && unlikely(inode->i_op->follow_link) &&
+ (lookup_flags & LOOKUP_FOLLOW)) {
if (nameidata_dentry_drop_rcu_maybe(nd, next.dentry))
return -ECHILD;
BUG_ON(inode != next.dentry->d_inode);
@@ -1950,8 +2133,9 @@ int may_open(struct path *path, int acc_mode, int flag)
return break_lease(inode, flag);
}
-static int handle_truncate(struct path *path)
+static int handle_truncate(struct file *filp)
{
+ struct path *path = &filp->f_path;
struct inode *inode = path->dentry->d_inode;
int error = get_write_access(inode);
if (error)
@@ -1965,7 +2149,7 @@ static int handle_truncate(struct path *path)
if (!error) {
error = do_truncate(path->dentry, 0,
ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
- NULL);
+ filp);
}
put_write_access(inode);
return error;
@@ -2063,7 +2247,7 @@ static struct file *finish_open(struct nameidata *nd,
}
if (!IS_ERR(filp)) {
if (will_truncate) {
- error = handle_truncate(&nd->path);
+ error = handle_truncate(filp);
if (error) {
fput(filp);
filp = ERR_PTR(error);
@@ -2104,11 +2288,13 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
dir = nd->path.dentry;
case LAST_DOT:
if (need_reval_dot(dir)) {
- error = d_revalidate(nd->path.dentry, nd);
- if (!error)
- error = -ESTALE;
- if (error < 0)
+ int status = d_revalidate(nd->path.dentry, nd);
+ if (!status)
+ status = -ESTALE;
+ if (status < 0) {
+ error = status;
goto exit;
+ }
}
/* fallthrough */
case LAST_ROOT:
@@ -2178,11 +2364,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
if (open_flag & O_EXCL)
goto exit_dput;
- if (__follow_mount(path)) {
- error = -ELOOP;
- if (open_flag & O_NOFOLLOW)
- goto exit_dput;
- }
+ error = follow_managed(path, nd->flags);
+ if (error < 0)
+ goto exit_dput;
error = -ENOENT;
if (!path->dentry->d_inode)
@@ -2327,11 +2511,11 @@ reval:
nd.flags = flags;
filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
while (unlikely(!filp)) { /* trailing symlink */
- struct path holder;
+ struct path link = path;
+ struct inode *linki = link.dentry->d_inode;
void *cookie;
error = -ELOOP;
- /* S_ISDIR part is a temporary automount kludge */
- if (!(nd.flags & LOOKUP_FOLLOW) && !S_ISDIR(nd.inode->i_mode))
+ if (!(nd.flags & LOOKUP_FOLLOW))
goto exit_dput;
if (count++ == 32)
goto exit_dput;
@@ -2347,23 +2531,22 @@ reval:
* just set LAST_BIND.
*/
nd.flags |= LOOKUP_PARENT;
- error = security_inode_follow_link(path.dentry, &nd);
+ error = security_inode_follow_link(link.dentry, &nd);
if (error)
goto exit_dput;
- error = __do_follow_link(&path, &nd, &cookie);
+ error = __do_follow_link(&link, &nd, &cookie);
if (unlikely(error)) {
- if (!IS_ERR(cookie) && nd.inode->i_op->put_link)
- nd.inode->i_op->put_link(path.dentry, &nd, cookie);
+ if (!IS_ERR(cookie) && linki->i_op->put_link)
+ linki->i_op->put_link(link.dentry, &nd, cookie);
/* nd.path had been dropped */
- nd.path = path;
+ nd.path = link;
goto out_path;
}
- holder = path;
nd.flags &= ~LOOKUP_PARENT;
filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
- if (nd.inode->i_op->put_link)
- nd.inode->i_op->put_link(holder.dentry, &nd, cookie);
- path_put(&holder);
+ if (linki->i_op->put_link)
+ linki->i_op->put_link(link.dentry, &nd, cookie);
+ path_put(&link);
}
out:
if (nd.root.mnt)
@@ -3391,6 +3574,7 @@ const struct inode_operations page_symlink_inode_operations = {
};
EXPORT_SYMBOL(user_path_at);
+EXPORT_SYMBOL(follow_down_one);
EXPORT_SYMBOL(follow_down);
EXPORT_SYMBOL(follow_up);
EXPORT_SYMBOL(get_write_access); /* binfmt_aout */
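/*
 * follow_managed() above is driven by the DCACHE_MANAGED_DENTRY flag set
 * and two new dentry_operations hooks. A filesystem opts in roughly as
 * below (hedged sketch; per the comments above, autofs is the intended
 * consumer of both hooks):
 */
static struct vfsmount *myfs_d_automount(struct path *path)
{
	/* return the vfsmount to graft; NULL on mount collision;
	 * ERR_PTR(-EISDIR) to decline and leave the dentry to userspace */
	return ERR_PTR(-EISDIR);
}

static int myfs_d_manage(struct dentry *dentry, bool mounting_here,
			 bool rcu_walk)
{
	/* 0 lets the walker transit; -EISDIR stops it here; in rcu-walk
	 * mode, return an error (e.g. -ECHILD) rather than sleeping */
	return 0;
}

static const struct dentry_operations myfs_dentry_ops = {
	.d_automount	= myfs_d_automount,
	.d_manage	= myfs_d_manage,
};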
diff --git a/fs/namespace.c b/fs/namespace.c
index 3ddfd9046c44..7b0b95371696 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -183,7 +183,7 @@ static inline void mnt_dec_count(struct vfsmount *mnt)
unsigned int mnt_get_count(struct vfsmount *mnt)
{
#ifdef CONFIG_SMP
- unsigned int count = atomic_read(&mnt->mnt_longrefs);
+ unsigned int count = 0;
int cpu;
for_each_possible_cpu(cpu) {
@@ -217,7 +217,7 @@ struct vfsmount *alloc_vfsmnt(const char *name)
if (!mnt->mnt_pcp)
goto out_free_devname;
- atomic_set(&mnt->mnt_longrefs, 1);
+ this_cpu_add(mnt->mnt_pcp->mnt_count, 1);
#else
mnt->mnt_count = 1;
mnt->mnt_writers = 0;
@@ -611,6 +611,21 @@ static void attach_mnt(struct vfsmount *mnt, struct path *path)
list_add_tail(&mnt->mnt_child, &path->mnt->mnt_mounts);
}
+static inline void __mnt_make_longterm(struct vfsmount *mnt)
+{
+#ifdef CONFIG_SMP
+ atomic_inc(&mnt->mnt_longterm);
+#endif
+}
+
+/* needs vfsmount lock for write */
+static inline void __mnt_make_shortterm(struct vfsmount *mnt)
+{
+#ifdef CONFIG_SMP
+ atomic_dec(&mnt->mnt_longterm);
+#endif
+}
+
/*
* vfsmount lock must be held for write
*/
@@ -624,8 +639,11 @@ static void commit_tree(struct vfsmount *mnt)
BUG_ON(parent == mnt);
list_add_tail(&head, &mnt->mnt_list);
- list_for_each_entry(m, &head, mnt_list)
+ list_for_each_entry(m, &head, mnt_list) {
m->mnt_ns = n;
+ __mnt_make_longterm(m);
+ }
+
list_splice(&head, n->list.prev);
list_add_tail(&mnt->mnt_hash, mount_hashtable +
@@ -734,51 +752,30 @@ static inline void mntfree(struct vfsmount *mnt)
deactivate_super(sb);
}
-#ifdef CONFIG_SMP
-static inline void __mntput(struct vfsmount *mnt, int longrefs)
+static void mntput_no_expire(struct vfsmount *mnt)
{
- if (!longrefs) {
put_again:
- br_read_lock(vfsmount_lock);
- if (likely(atomic_read(&mnt->mnt_longrefs))) {
- mnt_dec_count(mnt);
- br_read_unlock(vfsmount_lock);
- return;
- }
+#ifdef CONFIG_SMP
+ br_read_lock(vfsmount_lock);
+ if (likely(atomic_read(&mnt->mnt_longterm))) {
+ mnt_dec_count(mnt);
br_read_unlock(vfsmount_lock);
- } else {
- BUG_ON(!atomic_read(&mnt->mnt_longrefs));
- if (atomic_add_unless(&mnt->mnt_longrefs, -1, 1))
- return;
+ return;
}
+ br_read_unlock(vfsmount_lock);
br_write_lock(vfsmount_lock);
- if (!longrefs)
- mnt_dec_count(mnt);
- else
- atomic_dec(&mnt->mnt_longrefs);
+ mnt_dec_count(mnt);
if (mnt_get_count(mnt)) {
br_write_unlock(vfsmount_lock);
return;
}
- if (unlikely(mnt->mnt_pinned)) {
- mnt_add_count(mnt, mnt->mnt_pinned + 1);
- mnt->mnt_pinned = 0;
- br_write_unlock(vfsmount_lock);
- acct_auto_close_mnt(mnt);
- goto put_again;
- }
- br_write_unlock(vfsmount_lock);
- mntfree(mnt);
-}
#else
-static inline void __mntput(struct vfsmount *mnt, int longrefs)
-{
-put_again:
mnt_dec_count(mnt);
if (likely(mnt_get_count(mnt)))
return;
br_write_lock(vfsmount_lock);
+#endif
if (unlikely(mnt->mnt_pinned)) {
mnt_add_count(mnt, mnt->mnt_pinned + 1);
mnt->mnt_pinned = 0;
@@ -789,12 +786,6 @@ put_again:
br_write_unlock(vfsmount_lock);
mntfree(mnt);
}
-#endif
-
-static void mntput_no_expire(struct vfsmount *mnt)
-{
- __mntput(mnt, 0);
-}
void mntput(struct vfsmount *mnt)
{
@@ -802,7 +793,7 @@ void mntput(struct vfsmount *mnt)
/* avoid cacheline pingpong, hope gcc doesn't get "smart" */
if (unlikely(mnt->mnt_expiry_mark))
mnt->mnt_expiry_mark = 0;
- __mntput(mnt, 0);
+ mntput_no_expire(mnt);
}
}
EXPORT_SYMBOL(mntput);
@@ -815,33 +806,6 @@ struct vfsmount *mntget(struct vfsmount *mnt)
}
EXPORT_SYMBOL(mntget);
-void mntput_long(struct vfsmount *mnt)
-{
-#ifdef CONFIG_SMP
- if (mnt) {
- /* avoid cacheline pingpong, hope gcc doesn't get "smart" */
- if (unlikely(mnt->mnt_expiry_mark))
- mnt->mnt_expiry_mark = 0;
- __mntput(mnt, 1);
- }
-#else
- mntput(mnt);
-#endif
-}
-EXPORT_SYMBOL(mntput_long);
-
-struct vfsmount *mntget_long(struct vfsmount *mnt)
-{
-#ifdef CONFIG_SMP
- if (mnt)
- atomic_inc(&mnt->mnt_longrefs);
- return mnt;
-#else
- return mntget(mnt);
-#endif
-}
-EXPORT_SYMBOL(mntget_long);
-
void mnt_pin(struct vfsmount *mnt)
{
br_write_lock(vfsmount_lock);
@@ -1216,7 +1180,7 @@ void release_mounts(struct list_head *head)
dput(dentry);
mntput(m);
}
- mntput_long(mnt);
+ mntput(mnt);
}
}
@@ -1226,19 +1190,21 @@ void release_mounts(struct list_head *head)
*/
void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill)
{
+ LIST_HEAD(tmp_list);
struct vfsmount *p;
for (p = mnt; p; p = next_mnt(p, mnt))
- list_move(&p->mnt_hash, kill);
+ list_move(&p->mnt_hash, &tmp_list);
if (propagate)
- propagate_umount(kill);
+ propagate_umount(&tmp_list);
- list_for_each_entry(p, kill, mnt_hash) {
+ list_for_each_entry(p, &tmp_list, mnt_hash) {
list_del_init(&p->mnt_expire);
list_del_init(&p->mnt_list);
__touch_mnt_namespace(p->mnt_ns);
p->mnt_ns = NULL;
+ __mnt_make_shortterm(p);
list_del_init(&p->mnt_child);
if (p->mnt_parent != p) {
p->mnt_parent->mnt_ghosts++;
@@ -1246,6 +1212,7 @@ void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill)
}
change_mnt_propagation(p, MS_PRIVATE);
}
+ list_splice(&tmp_list, kill);
}
static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts);
@@ -1844,9 +1811,10 @@ static int do_move_mount(struct path *path, char *old_name)
return err;
down_write(&namespace_sem);
- while (d_mountpoint(path->dentry) &&
- follow_down(path))
- ;
+ err = follow_down(path, true);
+ if (err < 0)
+ goto out;
+
err = -EINVAL;
if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt))
goto out;
@@ -1904,6 +1872,8 @@ out:
return err;
}
+static int do_add_mount(struct vfsmount *, struct path *, int);
+
/*
* create a new mount for userspace and request it to be added into the
* namespace's tree
@@ -1912,6 +1882,7 @@ static int do_new_mount(struct path *path, char *type, int flags,
int mnt_flags, char *name, void *data)
{
struct vfsmount *mnt;
+ int err;
if (!type)
return -EINVAL;
@@ -1924,15 +1895,47 @@ static int do_new_mount(struct path *path, char *type, int flags,
if (IS_ERR(mnt))
return PTR_ERR(mnt);
- return do_add_mount(mnt, path, mnt_flags, NULL);
+ err = do_add_mount(mnt, path, mnt_flags);
+ if (err)
+ mntput(mnt);
+ return err;
+}
+
+int finish_automount(struct vfsmount *m, struct path *path)
+{
+ int err;
+	/* The new mount record should have at least 2 refs to prevent it from
+	 * being expired before we get a chance to add it
+ */
+ BUG_ON(mnt_get_count(m) < 2);
+
+ if (m->mnt_sb == path->mnt->mnt_sb &&
+ m->mnt_root == path->dentry) {
+ err = -ELOOP;
+ goto fail;
+ }
+
+ err = do_add_mount(m, path, path->mnt->mnt_flags | MNT_SHRINKABLE);
+ if (!err)
+ return 0;
+fail:
+ /* remove m from any expiration list it may be on */
+ if (!list_empty(&m->mnt_expire)) {
+ down_write(&namespace_sem);
+ br_write_lock(vfsmount_lock);
+ list_del_init(&m->mnt_expire);
+ br_write_unlock(vfsmount_lock);
+ up_write(&namespace_sem);
+ }
+ mntput(m);
+ mntput(m);
+ return err;
}
/*
* add a mount into a namespace's mount tree
- * - provide the option of adding the new mount to an expiration list
*/
-int do_add_mount(struct vfsmount *newmnt, struct path *path,
- int mnt_flags, struct list_head *fslist)
+static int do_add_mount(struct vfsmount *newmnt, struct path *path, int mnt_flags)
{
int err;
@@ -1940,9 +1943,10 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path,
down_write(&namespace_sem);
/* Something was mounted here while we slept */
- while (d_mountpoint(path->dentry) &&
- follow_down(path))
- ;
+ err = follow_down(path, true);
+ if (err < 0)
+ goto unlock;
+
err = -EINVAL;
if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt))
goto unlock;
@@ -1958,22 +1962,29 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path,
goto unlock;
newmnt->mnt_flags = mnt_flags;
- if ((err = graft_tree(newmnt, path)))
- goto unlock;
-
- if (fslist) /* add to the specified expiration list */
- list_add_tail(&newmnt->mnt_expire, fslist);
-
- up_write(&namespace_sem);
- return 0;
+ err = graft_tree(newmnt, path);
unlock:
up_write(&namespace_sem);
- mntput_long(newmnt);
return err;
}
-EXPORT_SYMBOL_GPL(do_add_mount);
+/**
+ * mnt_set_expiry - Put a mount on an expiration list
+ * @mnt: The mount to list.
+ * @expiry_list: The list to add the mount to.
+ */
+void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list)
+{
+ down_write(&namespace_sem);
+ br_write_lock(vfsmount_lock);
+
+ list_add_tail(&mnt->mnt_expire, expiry_list);
+
+ br_write_unlock(vfsmount_lock);
+ up_write(&namespace_sem);
+}
+EXPORT_SYMBOL(mnt_set_expiry);
/*
* process a list of expirable mountpoints with the intent of discarding any
@@ -2262,6 +2273,22 @@ static struct mnt_namespace *alloc_mnt_ns(void)
return new_ns;
}
+void mnt_make_longterm(struct vfsmount *mnt)
+{
+ __mnt_make_longterm(mnt);
+}
+
+void mnt_make_shortterm(struct vfsmount *mnt)
+{
+#ifdef CONFIG_SMP
+ if (atomic_add_unless(&mnt->mnt_longterm, -1, 1))
+ return;
+ br_write_lock(vfsmount_lock);
+ atomic_dec(&mnt->mnt_longterm);
+ br_write_unlock(vfsmount_lock);
+#endif
+}
+
/*
* Allocate a new namespace structure and populate it with contents
* copied from the namespace of the passed in task structure.
@@ -2299,14 +2326,19 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
q = new_ns->root;
while (p) {
q->mnt_ns = new_ns;
+ __mnt_make_longterm(q);
if (fs) {
if (p == fs->root.mnt) {
+ fs->root.mnt = mntget(q);
+ __mnt_make_longterm(q);
+ mnt_make_shortterm(p);
rootmnt = p;
- fs->root.mnt = mntget_long(q);
}
if (p == fs->pwd.mnt) {
+ fs->pwd.mnt = mntget(q);
+ __mnt_make_longterm(q);
+ mnt_make_shortterm(p);
pwdmnt = p;
- fs->pwd.mnt = mntget_long(q);
}
}
p = next_mnt(p, mnt_ns->root);
@@ -2315,9 +2347,9 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
up_write(&namespace_sem);
if (rootmnt)
- mntput_long(rootmnt);
+ mntput(rootmnt);
if (pwdmnt)
- mntput_long(pwdmnt);
+ mntput(pwdmnt);
return new_ns;
}
@@ -2350,6 +2382,7 @@ struct mnt_namespace *create_mnt_ns(struct vfsmount *mnt)
new_ns = alloc_mnt_ns();
if (!IS_ERR(new_ns)) {
mnt->mnt_ns = new_ns;
+ __mnt_make_longterm(mnt);
new_ns->root = mnt;
list_add(&new_ns->list, &new_ns->root->mnt_list);
}
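/*
 * With do_add_mount()'s expiration-list parameter gone and the function
 * made static, an automounting filesystem splits the job: its
 * ->d_automount() creates the mount, takes the extra reference that
 * finish_automount() asserts (at least 2 refs), and queues expiry itself
 * via mnt_set_expiry(); follow_automount() then grafts it with
 * finish_automount(). Hedged sketch with hypothetical names:
 */
static struct vfsmount *myfs_trigger_automount(struct path *path)
{
	struct vfsmount *mnt = vfs_kern_mount(&myfs_fs_type, 0, "myfs", NULL);

	if (IS_ERR(mnt))
		return mnt;
	mntget(mnt);	/* finish_automount() expects at least two refs */
	mnt_set_expiry(mnt, &myfs_expiry_list);	/* replaces the old fslist */
	return mnt;	/* the VFS completes the graft via finish_automount() */
}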
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 28f136d4aaec..f6946bb5cb55 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -21,9 +21,7 @@
#include <asm/uaccess.h>
#include <asm/byteorder.h>
-#include <linux/ncp_fs.h>
-
-#include "ncplib_kernel.h"
+#include "ncp_fs.h"
static void ncp_read_volume_list(struct file *, void *, filldir_t,
struct ncp_cache_control *);
@@ -82,7 +80,7 @@ static int ncp_compare_dentry(const struct dentry *, const struct inode *,
unsigned int, const char *, const struct qstr *);
static int ncp_delete_dentry(const struct dentry *);
-static const struct dentry_operations ncp_dentry_operations =
+const struct dentry_operations ncp_dentry_operations =
{
.d_revalidate = ncp_lookup_validate,
.d_hash = ncp_hash_dentry,
@@ -90,14 +88,6 @@ static const struct dentry_operations ncp_dentry_operations =
.d_delete = ncp_delete_dentry,
};
-const struct dentry_operations ncp_root_dentry_operations =
-{
- .d_hash = ncp_hash_dentry,
- .d_compare = ncp_compare_dentry,
- .d_delete = ncp_delete_dentry,
-};
-
-
#define ncp_namespace(i) (NCP_SERVER(i)->name_space[NCP_FINFO(i)->volNumber])
static inline int ncp_preserve_entry_case(struct inode *i, __u32 nscreator)
@@ -309,6 +299,9 @@ ncp_lookup_validate(struct dentry *dentry, struct nameidata *nd)
int res, val = 0, len;
__u8 __name[NCP_MAXPATHLEN + 1];
+ if (dentry == dentry->d_sb->s_root)
+ return 1;
+
if (nd->flags & LOOKUP_RCU)
return -ECHILD;
@@ -637,7 +630,6 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
entry->ino = iunique(dir->i_sb, 2);
inode = ncp_iget(dir->i_sb, entry);
if (inode) {
- d_set_d_op(newdent, &ncp_dentry_operations);
d_instantiate(newdent, inode);
if (!hashed)
d_rehash(newdent);
@@ -893,7 +885,6 @@ static struct dentry *ncp_lookup(struct inode *dir, struct dentry *dentry, struc
if (inode) {
ncp_new_dentry(dentry);
add_entry:
- d_set_d_op(dentry, &ncp_dentry_operations);
d_add(dentry, inode);
error = 0;
}
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index cb50aaf981df..0ed65e0c3dfe 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -18,8 +18,7 @@
#include <linux/vmalloc.h>
#include <linux/sched.h>
-#include <linux/ncp_fs.h>
-#include "ncplib_kernel.h"
+#include "ncp_fs.h"
static int ncp_fsync(struct file *file, int datasync)
{
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 9b39a5dd4131..00a1d1c3d3a4 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -31,11 +31,9 @@
#include <linux/seq_file.h>
#include <linux/namei.h>
-#include <linux/ncp_fs.h>
-
#include <net/sock.h>
-#include "ncplib_kernel.h"
+#include "ncp_fs.h"
#include "getopt.h"
#define NCP_DEFAULT_FILE_MODE 0600
@@ -544,6 +542,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
sb->s_blocksize_bits = 10;
sb->s_magic = NCP_SUPER_MAGIC;
sb->s_op = &ncp_sops;
+ sb->s_d_op = &ncp_dentry_operations;
sb->s_bdi = &server->bdi;
server = NCP_SBP(sb);
@@ -723,7 +722,6 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
sb->s_root = d_alloc_root(root_inode);
if (!sb->s_root)
goto out_no_root;
- d_set_d_op(sb->s_root, &ncp_root_dentry_operations);
return 0;
out_no_root:
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index d40a547e3377..790e92a9ec63 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -20,11 +20,9 @@
#include <linux/vmalloc.h>
#include <linux/sched.h>
-#include <linux/ncp_fs.h>
-
#include <asm/uaccess.h>
-#include "ncplib_kernel.h"
+#include "ncp_fs.h"
/* maximum limit for ncp_objectname_ioctl */
#define NCP_OBJECT_NAME_MAX_LEN 4096
diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c
index 56f5b3a0e1ee..a7c07b44b100 100644
--- a/fs/ncpfs/mmap.c
+++ b/fs/ncpfs/mmap.c
@@ -16,12 +16,12 @@
#include <linux/mman.h>
#include <linux/string.h>
#include <linux/fcntl.h>
-#include <linux/ncp_fs.h>
-#include "ncplib_kernel.h"
#include <asm/uaccess.h>
#include <asm/system.h>
+#include "ncp_fs.h"
+
/*
* Fill in the supplied page for mmap
* XXX: how are we excluding truncate/invalidate here? Maybe need to lock
diff --git a/fs/ncpfs/ncp_fs.h b/fs/ncpfs/ncp_fs.h
new file mode 100644
index 000000000000..31831afe1c3b
--- /dev/null
+++ b/fs/ncpfs/ncp_fs.h
@@ -0,0 +1,98 @@
+#include <linux/ncp_fs.h>
+#include "ncp_fs_i.h"
+#include "ncp_fs_sb.h"
+
+/* define because it is easy to change PRINTK to {*}PRINTK */
+#define PRINTK(format, args...) printk(KERN_DEBUG format , ## args)
+
+#undef NCPFS_PARANOIA
+#ifdef NCPFS_PARANOIA
+#define PPRINTK(format, args...) PRINTK(format , ## args)
+#else
+#define PPRINTK(format, args...)
+#endif
+
+#ifndef DEBUG_NCP
+#define DEBUG_NCP 0
+#endif
+#if DEBUG_NCP > 0
+#define DPRINTK(format, args...) PRINTK(format , ## args)
+#else
+#define DPRINTK(format, args...)
+#endif
+#if DEBUG_NCP > 1
+#define DDPRINTK(format, args...) PRINTK(format , ## args)
+#else
+#define DDPRINTK(format, args...)
+#endif
+
+#define NCP_MAX_RPC_TIMEOUT (6*HZ)
+
+
+struct ncp_entry_info {
+ struct nw_info_struct i;
+ ino_t ino;
+ int opened;
+ int access;
+ unsigned int volume;
+ __u8 file_handle[6];
+};
+
+static inline struct ncp_server *NCP_SBP(const struct super_block *sb)
+{
+ return sb->s_fs_info;
+}
+
+#define NCP_SERVER(inode) NCP_SBP((inode)->i_sb)
+static inline struct ncp_inode_info *NCP_FINFO(const struct inode *inode)
+{
+ return container_of(inode, struct ncp_inode_info, vfs_inode);
+}
+
+/* linux/fs/ncpfs/inode.c */
+int ncp_notify_change(struct dentry *, struct iattr *);
+struct inode *ncp_iget(struct super_block *, struct ncp_entry_info *);
+void ncp_update_inode(struct inode *, struct ncp_entry_info *);
+void ncp_update_inode2(struct inode *, struct ncp_entry_info *);
+
+/* linux/fs/ncpfs/dir.c */
+extern const struct inode_operations ncp_dir_inode_operations;
+extern const struct file_operations ncp_dir_operations;
+extern const struct dentry_operations ncp_dentry_operations;
+int ncp_conn_logged_in(struct super_block *);
+int ncp_date_dos2unix(__le16 time, __le16 date);
+void ncp_date_unix2dos(int unix_date, __le16 * time, __le16 * date);
+
+/* linux/fs/ncpfs/ioctl.c */
+long ncp_ioctl(struct file *, unsigned int, unsigned long);
+long ncp_compat_ioctl(struct file *, unsigned int, unsigned long);
+
+/* linux/fs/ncpfs/sock.c */
+int ncp_request2(struct ncp_server *server, int function,
+ void* reply, int max_reply_size);
+static inline int ncp_request(struct ncp_server *server, int function) {
+ return ncp_request2(server, function, server->packet, server->packet_size);
+}
+int ncp_connect(struct ncp_server *server);
+int ncp_disconnect(struct ncp_server *server);
+void ncp_lock_server(struct ncp_server *server);
+void ncp_unlock_server(struct ncp_server *server);
+
+/* linux/fs/ncpfs/symlink.c */
+#if defined(CONFIG_NCPFS_EXTRAS) || defined(CONFIG_NCPFS_NFS_NS)
+extern const struct address_space_operations ncp_symlink_aops;
+int ncp_symlink(struct inode*, struct dentry*, const char*);
+#endif
+
+/* linux/fs/ncpfs/file.c */
+extern const struct inode_operations ncp_file_inode_operations;
+extern const struct file_operations ncp_file_operations;
+int ncp_make_open(struct inode *, int);
+
+/* linux/fs/ncpfs/mmap.c */
+int ncp_mmap(struct file *, struct vm_area_struct *);
+
+/* linux/fs/ncpfs/ncplib_kernel.c */
+int ncp_make_closed(struct inode *);
+
+#include "ncplib_kernel.h"
diff --git a/fs/ncpfs/ncp_fs_i.h b/fs/ncpfs/ncp_fs_i.h
new file mode 100644
index 000000000000..4b0bec477846
--- /dev/null
+++ b/fs/ncpfs/ncp_fs_i.h
@@ -0,0 +1,29 @@
+/*
+ * ncp_fs_i.h
+ *
+ * Copyright (C) 1995 Volker Lendecke
+ *
+ */
+
+#ifndef _LINUX_NCP_FS_I
+#define _LINUX_NCP_FS_I
+
+/*
+ * This is the ncpfs part of the inode structure. This must contain
+ * all the information we need to work with an inode after creation.
+ */
+struct ncp_inode_info {
+ __le32 dirEntNum;
+ __le32 DosDirNum;
+ __u8 volNumber;
+ __le32 nwattr;
+ struct mutex open_mutex;
+ atomic_t opened;
+ int access;
+ int flags;
+#define NCPI_KLUDGE_SYMLINK 0x0001
+ __u8 file_handle[6];
+ struct inode vfs_inode;
+};
+
+#endif /* _LINUX_NCP_FS_I */
diff --git a/fs/ncpfs/ncp_fs_sb.h b/fs/ncpfs/ncp_fs_sb.h
new file mode 100644
index 000000000000..4af803f13516
--- /dev/null
+++ b/fs/ncpfs/ncp_fs_sb.h
@@ -0,0 +1,176 @@
+/*
+ * ncp_fs_sb.h
+ *
+ * Copyright (C) 1995, 1996 by Volker Lendecke
+ *
+ */
+
+#ifndef _NCP_FS_SB
+#define _NCP_FS_SB
+
+#include <linux/types.h>
+#include <linux/ncp_mount.h>
+#include <linux/net.h>
+#include <linux/mutex.h>
+#include <linux/backing-dev.h>
+#include <linux/workqueue.h>
+
+#define NCP_DEFAULT_OPTIONS 0 /* 2 for packet signatures */
+
+struct sock;
+
+struct ncp_mount_data_kernel {
+ unsigned long flags; /* NCP_MOUNT_* flags */
+ unsigned int int_flags; /* internal flags */
+#define NCP_IMOUNT_LOGGEDIN_POSSIBLE 0x0001
+ __kernel_uid32_t mounted_uid; /* Who may umount() this filesystem? */
+ struct pid *wdog_pid; /* Who cares for our watchdog packets? */
+ unsigned int ncp_fd; /* The socket to the ncp port */
+ unsigned int time_out; /* How long should I wait after
sending an NCP request? */
+ unsigned int retry_count; /* And how often should I retry? */
+ unsigned char mounted_vol[NCP_VOLNAME_LEN + 1];
+ __kernel_uid32_t uid;
+ __kernel_gid32_t gid;
+ __kernel_mode_t file_mode;
+ __kernel_mode_t dir_mode;
+ int info_fd;
+};
+
+struct ncp_server {
+
+ struct ncp_mount_data_kernel m; /* Nearly all of the mount data is of
+ interest for us later, so we store
+ it completely. */
+
+ __u8 name_space[NCP_NUMBER_OF_VOLUMES + 2];
+
+ struct file *ncp_filp; /* File pointer to ncp socket */
+ struct socket *ncp_sock;/* ncp socket */
+ struct file *info_filp;
+ struct socket *info_sock;
+
+ u8 sequence;
+ u8 task;
+ u16 connection; /* Remote connection number */
+
+ u8 completion; /* Status message from server */
+ u8 conn_status; /* Bit 4 = 1 ==> Server going down, no
+ requests allowed anymore.
+ Bit 0 = 1 ==> Server is down. */
+
+ int buffer_size; /* Negotiated bufsize */
+
+ int reply_size; /* Size of last reply */
+
+ int packet_size;
+ unsigned char *packet; /* Here we prepare requests and
+ receive replies */
+ unsigned char *txbuf; /* Storage for current request */
+ unsigned char *rxbuf; /* Storage for reply to current request */
+
+ int lock; /* To prevent mismatch in protocols. */
+ struct mutex mutex;
+
+ int current_size; /* for packet preparation */
+ int has_subfunction;
+ int ncp_reply_size;
+
+ int root_setuped;
+ struct mutex root_setup_lock;
+
+ /* info for packet signing */
+ int sign_wanted; /* 1=Server needs signed packets */
+ int sign_active; /* 0=don't do signing, 1=do */
+ char sign_root[8]; /* generated from password and encr. key */
+ char sign_last[16];
+
+ /* Authentication info: NDS or BINDERY, username */
+ struct {
+ int auth_type;
+ size_t object_name_len;
+ void* object_name;
+ int object_type;
+ } auth;
+ /* Password info */
+ struct {
+ size_t len;
+ void* data;
+ } priv;
+ struct rw_semaphore auth_rwsem;
+
+ /* nls info: codepage for volume and charset for I/O */
+ struct nls_table *nls_vol;
+ struct nls_table *nls_io;
+
+ /* maximum age in jiffies */
+ atomic_t dentry_ttl;
+
+ /* miscellaneous */
+ unsigned int flags;
+
+ spinlock_t requests_lock; /* Lock accesses to tx.requests, tx.creq and rcv.creq when STREAM mode */
+
+ void (*data_ready)(struct sock* sk, int len);
+ void (*error_report)(struct sock* sk);
+ void (*write_space)(struct sock* sk); /* STREAM mode only */
+ struct {
+ struct work_struct tq; /* STREAM/DGRAM: data/error ready */
+ struct ncp_request_reply* creq; /* STREAM/DGRAM: awaiting reply from this request */
+ struct mutex creq_mutex; /* DGRAM only: lock accesses to rcv.creq */
+
+ unsigned int state; /* STREAM only: receiver state */
+ struct {
+ __u32 magic __packed;
+ __u32 len __packed;
+ __u16 type __packed;
+ __u16 p1 __packed;
+ __u16 p2 __packed;
+ __u16 p3 __packed;
+ __u16 type2 __packed;
+ } buf; /* STREAM only: temporary buffer */
+ unsigned char* ptr; /* STREAM only: pointer to data */
+ size_t len; /* STREAM only: length of data to receive */
+ } rcv;
+ struct {
+ struct list_head requests; /* STREAM only: queued requests */
+ struct work_struct tq; /* STREAM only: transmitter ready */
+ struct ncp_request_reply* creq; /* STREAM only: currently transmitted entry */
+ } tx;
+ struct timer_list timeout_tm; /* DGRAM only: timeout timer */
+ struct work_struct timeout_tq; /* DGRAM only: associated queue, we run timers from process context */
+ int timeout_last; /* DGRAM only: current timeout length */
+ int timeout_retries; /* DGRAM only: retries left */
+ struct {
+ size_t len;
+ __u8 data[128];
+ } unexpected_packet;
+ struct backing_dev_info bdi;
+};
+
+extern void ncp_tcp_rcv_proc(struct work_struct *work);
+extern void ncp_tcp_tx_proc(struct work_struct *work);
+extern void ncpdgram_rcv_proc(struct work_struct *work);
+extern void ncpdgram_timeout_proc(struct work_struct *work);
+extern void ncpdgram_timeout_call(unsigned long server);
+extern void ncp_tcp_data_ready(struct sock* sk, int len);
+extern void ncp_tcp_write_space(struct sock* sk);
+extern void ncp_tcp_error_report(struct sock* sk);
+
+#define NCP_FLAG_UTF8 1
+
+#define NCP_CLR_FLAG(server, flag) ((server)->flags &= ~(flag))
+#define NCP_SET_FLAG(server, flag) ((server)->flags |= (flag))
+#define NCP_IS_FLAG(server, flag) ((server)->flags & (flag))
+
+static inline int ncp_conn_valid(struct ncp_server *server)
+{
+ return ((server->conn_status & 0x11) == 0);
+}
+
+static inline void ncp_invalidate_conn(struct ncp_server *server)
+{
+ server->conn_status |= 0x01;
+}
+
+#endif
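
ncp_conn_valid() masks conn_status with 0x11, so either bit 4 ("server going down") or bit 0 ("server is down") invalidates the connection, and ncp_invalidate_conn() forces bit 0 on. The mask arithmetic, as a runnable check:

    #include <assert.h>

    static int conn_valid(unsigned char status) { return (status & 0x11) == 0; }

    int main(void)
    {
    	assert(conn_valid(0x00));	/* healthy */
    	assert(!conn_valid(0x01));	/* bit 0: server is down */
    	assert(!conn_valid(0x10));	/* bit 4: server going down */
    	assert(conn_valid(0x02));	/* unrelated bits are ignored */
    	return 0;
    }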
diff --git a/fs/ncpfs/ncplib_kernel.c b/fs/ncpfs/ncplib_kernel.c
index a95615a0b6ac..981a95617fc9 100644
--- a/fs/ncpfs/ncplib_kernel.c
+++ b/fs/ncpfs/ncplib_kernel.c
@@ -11,7 +11,7 @@
-#include "ncplib_kernel.h"
+#include "ncp_fs.h"
static inline void assert_server_locked(struct ncp_server *server)
{
diff --git a/fs/ncpfs/ncplib_kernel.h b/fs/ncpfs/ncplib_kernel.h
index 1220df75ff22..09881e6aa5ad 100644
--- a/fs/ncpfs/ncplib_kernel.h
+++ b/fs/ncpfs/ncplib_kernel.h
@@ -32,8 +32,6 @@
#include <linux/ctype.h>
#endif /* CONFIG_NCPFS_NLS */
-#include <linux/ncp_fs.h>
-
#define NCP_MIN_SYMLINK_SIZE 8
#define NCP_MAX_SYMLINK_SIZE 512
diff --git a/fs/ncpfs/ncpsign_kernel.c b/fs/ncpfs/ncpsign_kernel.c
index d8b2d7e6910b..08907599dcd2 100644
--- a/fs/ncpfs/ncpsign_kernel.c
+++ b/fs/ncpfs/ncpsign_kernel.c
@@ -11,6 +11,7 @@
#include <linux/string.h>
#include <linux/ncp.h>
#include <linux/bitops.h>
+#include "ncp_fs.h"
#include "ncpsign_kernel.h"
/* i386: 32-bit, little endian, handles mis-alignment */
diff --git a/fs/ncpfs/ncpsign_kernel.h b/fs/ncpfs/ncpsign_kernel.h
index 6451a68381cc..d9a1438bb1f6 100644
--- a/fs/ncpfs/ncpsign_kernel.h
+++ b/fs/ncpfs/ncpsign_kernel.h
@@ -8,8 +8,6 @@
#ifndef _NCPSIGN_KERNEL_H
#define _NCPSIGN_KERNEL_H
-#include <linux/ncp_fs.h>
-
#ifdef CONFIG_NCPFS_PACKET_SIGNING
void __sign_packet(struct ncp_server *server, const char *data, size_t size, __u32 totalsize, void *sign_buff);
int sign_verify_reply(struct ncp_server *server, const char *data, size_t size, __u32 totalsize, const void *sign_buff);
diff --git a/fs/ncpfs/sock.c b/fs/ncpfs/sock.c
index 668bd267346e..3a1587222c8a 100644
--- a/fs/ncpfs/sock.c
+++ b/fs/ncpfs/sock.c
@@ -28,7 +28,7 @@
#include <linux/poll.h>
#include <linux/file.h>
-#include <linux/ncp_fs.h>
+#include "ncp_fs.h"
#include "ncpsign_kernel.h"
diff --git a/fs/ncpfs/symlink.c b/fs/ncpfs/symlink.c
index c634fd17b337..661f861d80c6 100644
--- a/fs/ncpfs/symlink.c
+++ b/fs/ncpfs/symlink.c
@@ -25,13 +25,11 @@
#include <linux/errno.h>
#include <linux/fs.h>
-#include <linux/ncp_fs.h>
#include <linux/time.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/stat.h>
-#include "ncplib_kernel.h"
-
+#include "ncp_fs.h"
/* these magic numbers must appear in the symlink file -- this makes it a bit
more resilient against the magic attributes being set on random files. */
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index abe4f0c8dc5f..2c3eb33b904d 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -439,7 +439,6 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
if (dentry == NULL)
return;
- d_set_d_op(dentry, NFS_PROTO(dir)->dentry_ops);
inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr);
if (IS_ERR(inode))
goto out;
@@ -971,7 +970,7 @@ int nfs_lookup_verify_inode(struct inode *inode, struct nameidata *nd)
{
struct nfs_server *server = NFS_SERVER(inode);
- if (test_bit(NFS_INO_MOUNTPOINT, &NFS_I(inode)->flags))
+ if (IS_AUTOMOUNT(inode))
return 0;
if (nd != NULL) {
/* VFS wants an on-the-wire revalidation */
@@ -1174,6 +1173,7 @@ const struct dentry_operations nfs_dentry_operations = {
.d_revalidate = nfs_lookup_revalidate,
.d_delete = nfs_dentry_delete,
.d_iput = nfs_dentry_iput,
+ .d_automount = nfs_d_automount,
};
static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
@@ -1193,8 +1193,6 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
if (dentry->d_name.len > NFS_SERVER(dir)->namelen)
goto out;
- d_set_d_op(dentry, NFS_PROTO(dir)->dentry_ops);
-
/*
* If we're doing an exclusive create, optimize away the lookup
* but don't hash the dentry.
@@ -1249,6 +1247,7 @@ const struct dentry_operations nfs4_dentry_operations = {
.d_revalidate = nfs_open_revalidate,
.d_delete = nfs_dentry_delete,
.d_iput = nfs_dentry_iput,
+ .d_automount = nfs_d_automount,
};
/*
@@ -1338,7 +1337,6 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
res = ERR_PTR(-ENAMETOOLONG);
goto out;
}
- d_set_d_op(dentry, NFS_PROTO(dir)->dentry_ops);
/* Let vfs_create() deal with O_EXCL. Instantiate, but don't hash
* the dentry. */
@@ -1410,11 +1408,15 @@ no_open:
static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
{
struct dentry *parent = NULL;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode;
struct inode *dir;
struct nfs_open_context *ctx;
int openflags, ret = 0;
+ if (nd->flags & LOOKUP_RCU)
+ return -ECHILD;
+
+ inode = dentry->d_inode;
if (!is_atomic_open(nd) || d_mountpoint(dentry))
goto no_open;
@@ -1583,6 +1585,7 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode,
{
struct iattr attr;
int error;
+ int open_flags = 0;
dfprintk(VFS, "NFS: create(%s/%ld), %s\n",
dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
@@ -1590,7 +1593,10 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode,
attr.ia_mode = mode;
attr.ia_valid = ATTR_MODE;
- error = NFS_PROTO(dir)->create(dir, dentry, &attr, 0, NULL);
+ if ((nd->flags & LOOKUP_CREATE) != 0)
+ open_flags = nd->intent.open.flags;
+
+ error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, NULL);
if (error != 0)
goto out_err;
return 0;
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index 5596c6a2881e..b5ffe8fa291f 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -119,9 +119,6 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh)
}
security_d_instantiate(ret, inode);
-
- if (ret->d_op == NULL)
- d_set_d_op(ret, server->nfs_client->rpc_ops->dentry_ops);
out:
nfs_free_fattr(fsinfo.fattr);
return ret;
@@ -227,9 +224,6 @@ struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh)
security_d_instantiate(ret, inode);
- if (ret->d_op == NULL)
- d_set_d_op(ret, server->nfs_client->rpc_ops->dentry_ops);
-
out:
nfs_free_fattr(fattr);
dprintk("<-- nfs4_get_root()\n");
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index ce00b704452c..d8512423ba72 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -300,7 +300,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
else
inode->i_op = &nfs_mountpoint_inode_operations;
inode->i_fop = NULL;
- set_bit(NFS_INO_MOUNTPOINT, &nfsi->flags);
+ inode->i_flags |= S_AUTOMOUNT;
}
} else if (S_ISLNK(inode->i_mode))
inode->i_op = &nfs_symlink_inode_operations;
@@ -1208,7 +1208,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
/* Update the fsid? */
if (S_ISDIR(inode->i_mode) && (fattr->valid & NFS_ATTR_FATTR_FSID) &&
!nfs_fsid_equal(&server->fsid, &fattr->fsid) &&
- !test_bit(NFS_INO_MOUNTPOINT, &nfsi->flags))
+ !IS_AUTOMOUNT(inode))
server->fsid = fattr->fsid;
/*
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index bfa3a34af801..4644f04b4b46 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -252,6 +252,7 @@ extern char *nfs_path(const char *base,
const struct dentry *droot,
const struct dentry *dentry,
char *buffer, ssize_t buflen);
+extern struct vfsmount *nfs_d_automount(struct path *path);
/* getroot.c */
extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *);
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 74aaf3963c10..f32b8603dca8 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -97,9 +97,8 @@ Elong:
}
/*
- * nfs_follow_mountpoint - handle crossing a mountpoint on the server
- * @dentry - dentry of mountpoint
- * @nd - nameidata info
+ * nfs_d_automount - Handle crossing a mountpoint on the server
+ * @path - The mountpoint
*
* When we encounter a mountpoint on the server, we want to set up
* a mountpoint on the client too, to prevent inode numbers from
@@ -109,87 +108,65 @@ Elong:
* situation, and that different filesystems may want to use
* different security flavours.
*/
-static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
+struct vfsmount *nfs_d_automount(struct path *path)
{
struct vfsmount *mnt;
- struct nfs_server *server = NFS_SERVER(dentry->d_inode);
+ struct nfs_server *server = NFS_SERVER(path->dentry->d_inode);
struct dentry *parent;
struct nfs_fh *fh = NULL;
struct nfs_fattr *fattr = NULL;
int err;
- dprintk("--> nfs_follow_mountpoint()\n");
+ dprintk("--> nfs_d_automount()\n");
- err = -ESTALE;
- if (IS_ROOT(dentry))
- goto out_err;
+ mnt = ERR_PTR(-ESTALE);
+ if (IS_ROOT(path->dentry))
+ goto out_nofree;
- err = -ENOMEM;
+ mnt = ERR_PTR(-ENOMEM);
fh = nfs_alloc_fhandle();
fattr = nfs_alloc_fattr();
if (fh == NULL || fattr == NULL)
- goto out_err;
+ goto out;
dprintk("%s: enter\n", __func__);
- dput(nd->path.dentry);
- nd->path.dentry = dget(dentry);
- /* Look it up again */
- parent = dget_parent(nd->path.dentry);
+ /* Look it up again to get its attributes */
+ parent = dget_parent(path->dentry);
err = server->nfs_client->rpc_ops->lookup(parent->d_inode,
- &nd->path.dentry->d_name,
+ &path->dentry->d_name,
fh, fattr);
dput(parent);
- if (err != 0)
- goto out_err;
+ if (err != 0) {
+ mnt = ERR_PTR(err);
+ goto out;
+ }
if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)
- mnt = nfs_do_refmount(nd->path.mnt, nd->path.dentry);
+ mnt = nfs_do_refmount(path->mnt, path->dentry);
else
- mnt = nfs_do_submount(nd->path.mnt, nd->path.dentry, fh,
- fattr);
- err = PTR_ERR(mnt);
+ mnt = nfs_do_submount(path->mnt, path->dentry, fh, fattr);
if (IS_ERR(mnt))
- goto out_err;
+ goto out;
- mntget(mnt);
- err = do_add_mount(mnt, &nd->path, nd->path.mnt->mnt_flags|MNT_SHRINKABLE,
- &nfs_automount_list);
- if (err < 0) {
- mntput(mnt);
- if (err == -EBUSY)
- goto out_follow;
- goto out_err;
- }
- path_put(&nd->path);
- nd->path.mnt = mnt;
- nd->path.dentry = dget(mnt->mnt_root);
+ dprintk("%s: done, success\n", __func__);
+ mntget(mnt); /* prevent immediate expiration */
+ mnt_set_expiry(mnt, &nfs_automount_list);
schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout);
+
out:
nfs_free_fattr(fattr);
nfs_free_fhandle(fh);
- dprintk("%s: done, returned %d\n", __func__, err);
-
- dprintk("<-- nfs_follow_mountpoint() = %d\n", err);
- return ERR_PTR(err);
-out_err:
- path_put(&nd->path);
- goto out;
-out_follow:
- while (d_mountpoint(nd->path.dentry) &&
- follow_down(&nd->path))
- ;
- err = 0;
- goto out;
+out_nofree:
+ dprintk("<-- nfs_follow_mountpoint() = %p\n", mnt);
+ return mnt;
}
const struct inode_operations nfs_mountpoint_inode_operations = {
- .follow_link = nfs_follow_mountpoint,
.getattr = nfs_getattr,
};
const struct inode_operations nfs_referral_inode_operations = {
- .follow_link = nfs_follow_mountpoint,
};
static void nfs_expire_automounts(struct work_struct *work)
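
The namespace.c conversion is the core of this series: instead of abusing ->follow_link() and rewriting the nameidata by hand, the filesystem now implements ->d_automount(), returns a vfsmount (or an ERR_PTR), and lets the VFS splice it into the tree. The contract, as a minimal sketch (build_submount() and expiry_list are placeholders for the fs-specific parts above):

    struct vfsmount *sample_d_automount(struct path *path)
    {
    	struct vfsmount *mnt = build_submount(path);	/* fs-specific */

    	if (IS_ERR(mnt))
    		return mnt;		/* the VFS reports the error */
    	mntget(mnt);			/* hold off immediate expiry */
    	mnt_set_expiry(mnt, &expiry_list);
    	return mnt;			/* the VFS completes the mount */
    }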
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 0f9ea73e7789..b68c8607770f 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2202,6 +2202,7 @@ static int nfs_set_super(struct super_block *s, void *data)
s->s_flags = sb_mntdata->mntflags;
s->s_fs_info = server;
+ s->s_d_op = server->nfs_client->rpc_ops->dentry_ops;
ret = set_anon_super(s, server);
if (ret == 0)
server->s_dev = s->s_dev;
diff --git a/fs/nfsd/acl.h b/fs/nfsd/acl.h
new file mode 100644
index 000000000000..34e5c40af5ef
--- /dev/null
+++ b/fs/nfsd/acl.h
@@ -0,0 +1,59 @@
+/*
+ * Common NFSv4 ACL handling definitions.
+ *
+ * Copyright (c) 2002 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Marius Aamodt Eriksen <marius@umich.edu>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef LINUX_NFS4_ACL_H
+#define LINUX_NFS4_ACL_H
+
+#include <linux/posix_acl.h>
+
+/* Maximum ACL we'll accept from client; chosen (somewhat arbitrarily) to
+ * fit in a page: */
+#define NFS4_ACL_MAX 170
+
+struct nfs4_acl *nfs4_acl_new(int);
+int nfs4_acl_get_whotype(char *, u32);
+int nfs4_acl_write_who(int who, char *p);
+int nfs4_acl_permission(struct nfs4_acl *acl, uid_t owner, gid_t group,
+ uid_t who, u32 mask);
+
+#define NFS4_ACL_TYPE_DEFAULT 0x01
+#define NFS4_ACL_DIR 0x02
+#define NFS4_ACL_OWNER 0x04
+
+struct nfs4_acl *nfs4_acl_posix_to_nfsv4(struct posix_acl *,
+ struct posix_acl *, unsigned int flags);
+int nfs4_acl_nfsv4_to_posix(struct nfs4_acl *, struct posix_acl **,
+ struct posix_acl **, unsigned int flags);
+
+#endif /* LINUX_NFS4_ACL_H */
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index c0fcb7ab7f6d..8b31e5f8795d 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -1,4 +1,3 @@
-#define MSNFS /* HACK HACK */
/*
* NFS exporting and validation.
*
@@ -1444,9 +1443,6 @@ static struct flags {
{ NFSEXP_NOSUBTREECHECK, {"no_subtree_check", ""}},
{ NFSEXP_NOAUTHNLM, {"insecure_locks", ""}},
{ NFSEXP_V4ROOT, {"v4root", ""}},
-#ifdef MSNFS
- { NFSEXP_MSNFS, {"msnfs", ""}},
-#endif
{ 0, {"", ""}}
};
diff --git a/fs/nfsd/idmap.h b/fs/nfsd/idmap.h
new file mode 100644
index 000000000000..2f3be1321534
--- /dev/null
+++ b/fs/nfsd/idmap.h
@@ -0,0 +1,62 @@
+/*
+ * Mapping of UID to name and vice versa.
+ *
+ * Copyright (c) 2002, 2003 The Regents of the University of
+ * Michigan. All rights reserved.
+ *
+ * Marius Aamodt Eriksen <marius@umich.edu>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef LINUX_NFSD_IDMAP_H
+#define LINUX_NFSD_IDMAP_H
+
+#include <linux/in.h>
+#include <linux/sunrpc/svc.h>
+
+/* XXX from linux/nfs_idmap.h */
+#define IDMAP_NAMESZ 128
+
+#ifdef CONFIG_NFSD_V4
+int nfsd_idmap_init(void);
+void nfsd_idmap_shutdown(void);
+#else
+static inline int nfsd_idmap_init(void)
+{
+ return 0;
+}
+static inline void nfsd_idmap_shutdown(void)
+{
+}
+#endif
+
+__be32 nfsd_map_name_to_uid(struct svc_rqst *, const char *, size_t, __u32 *);
+__be32 nfsd_map_name_to_gid(struct svc_rqst *, const char *, size_t, __u32 *);
+int nfsd_map_uid_to_name(struct svc_rqst *, __u32, char *);
+int nfsd_map_gid_to_name(struct svc_rqst *, __u32, char *);
+
+#endif /* LINUX_NFSD_IDMAP_H */
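
idmap.h follows the usual kernel idiom for optional subsystems: when CONFIG_NFSD_V4 is off, the init/shutdown hooks collapse into static inline no-ops, so callers never need an #ifdef of their own. A self-contained illustration of the idiom:

    #include <stdio.h>

    /* #define HAVE_FEATURE 1 */
    #ifdef HAVE_FEATURE
    int feature_init(void) { puts("feature up"); return 0; }
    #else
    static inline int feature_init(void) { return 0; }	/* no-op stub */
    #endif

    int main(void)
    {
    	return feature_init();	/* compiles either way, no #ifdef here */
    }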
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 5b7e3021e06b..2247fc91d5e9 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -151,10 +151,10 @@ nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp,
__be32 nfserr;
u32 max_blocksize = svc_max_payload(rqstp);
- dprintk("nfsd: READ(3) %s %lu bytes at %lu\n",
+ dprintk("nfsd: READ(3) %s %lu bytes at %Lu\n",
SVCFH_fmt(&argp->fh),
(unsigned long) argp->count,
- (unsigned long) argp->offset);
+ (unsigned long long) argp->offset);
/* Obtain buffer pointer for payload.
* 1 (status) + 22 (post_op_attr) + 1 (count) + 1 (eof)
@@ -191,10 +191,10 @@ nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp,
__be32 nfserr;
unsigned long cnt = argp->len;
- dprintk("nfsd: WRITE(3) %s %d bytes at %ld%s\n",
+ dprintk("nfsd: WRITE(3) %s %d bytes at %Lu%s\n",
SVCFH_fmt(&argp->fh),
argp->len,
- (unsigned long) argp->offset,
+ (unsigned long long) argp->offset,
argp->stable? " stable" : "");
fh_copy(&resp->fh, &argp->fh);
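
The dprintk changes above fix a truncation bug: argp->offset is 64-bit, and casting it to unsigned long drops the high 32 bits on 32-bit builds (printk's %Lu is equivalent to the standard %llu). A runnable demonstration:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
    	uint64_t offset = 0x100000000ULL + 42;	/* an offset past 4 GiB */

    	/* Broken where unsigned long is 32 bits: prints 42. */
    	printf("truncated: %lu\n", (unsigned long) offset);
    	/* Correct everywhere: */
    	printf("full:      %llu\n", (unsigned long long) offset);
    	return 0;
    }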
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index e48052615159..ad88f1c0a4c3 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -36,7 +36,7 @@
#include <linux/slab.h>
#include <linux/nfs_fs.h>
-#include <linux/nfs4_acl.h>
+#include "acl.h"
/* mode bit translations: */
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 21a63da305ff..3be975e18919 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -628,10 +628,8 @@ static int max_cb_time(void)
return max(nfsd4_lease/10, (time_t)1) * HZ;
}
-/* Reference counting, callback cleanup, etc., all look racy as heck.
- * And why is cl_cb_set an atomic? */
-int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
+static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn, struct nfsd4_session *ses)
{
struct rpc_timeout timeparms = {
.to_initval = max_cb_time(),
@@ -641,6 +639,7 @@ int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
.net = &init_net,
.address = (struct sockaddr *) &conn->cb_addr,
.addrsize = conn->cb_addrlen,
+ .saddress = (struct sockaddr *) &conn->cb_saddr,
.timeout = &timeparms,
.program = &cb_program,
.version = 0,
@@ -657,6 +656,10 @@ int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
args.protocol = XPRT_TRANSPORT_TCP;
clp->cl_cb_ident = conn->cb_ident;
} else {
+ if (!conn->cb_xprt)
+ return -EINVAL;
+ clp->cl_cb_conn.cb_xprt = conn->cb_xprt;
+ clp->cl_cb_session = ses;
args.bc_xprt = conn->cb_xprt;
args.prognumber = clp->cl_cb_session->se_cb_prog;
args.protocol = XPRT_TRANSPORT_BC_TCP;
@@ -679,14 +682,20 @@ static void warn_no_callback_path(struct nfs4_client *clp, int reason)
(int)clp->cl_name.len, clp->cl_name.data, reason);
}
+static void nfsd4_mark_cb_down(struct nfs4_client *clp, int reason)
+{
+ clp->cl_cb_state = NFSD4_CB_DOWN;
+ warn_no_callback_path(clp, reason);
+}
+
static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata)
{
struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null);
if (task->tk_status)
- warn_no_callback_path(clp, task->tk_status);
+ nfsd4_mark_cb_down(clp, task->tk_status);
else
- atomic_set(&clp->cl_cb_set, 1);
+ clp->cl_cb_state = NFSD4_CB_UP;
}
static const struct rpc_call_ops nfsd4_cb_probe_ops = {
@@ -709,6 +718,11 @@ int set_callback_cred(void)
static struct workqueue_struct *callback_wq;
+static void run_nfsd4_cb(struct nfsd4_callback *cb)
+{
+ queue_work(callback_wq, &cb->cb_work);
+}
+
static void do_probe_callback(struct nfs4_client *clp)
{
struct nfsd4_callback *cb = &clp->cl_cb_null;
@@ -723,7 +737,7 @@ static void do_probe_callback(struct nfs4_client *clp)
cb->cb_ops = &nfsd4_cb_probe_ops;
- queue_work(callback_wq, &cb->cb_work);
+ run_nfsd4_cb(cb);
}
/*
@@ -732,14 +746,21 @@ static void do_probe_callback(struct nfs4_client *clp)
*/
void nfsd4_probe_callback(struct nfs4_client *clp)
{
+ /* XXX: atomicity? Also, should we be using cl_cb_flags? */
+ clp->cl_cb_state = NFSD4_CB_UNKNOWN;
set_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_cb_flags);
do_probe_callback(clp);
}
-void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
+void nfsd4_probe_callback_sync(struct nfs4_client *clp)
{
- BUG_ON(atomic_read(&clp->cl_cb_set));
+ nfsd4_probe_callback(clp);
+ flush_workqueue(callback_wq);
+}
+void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
+{
+ clp->cl_cb_state = NFSD4_CB_UNKNOWN;
spin_lock(&clp->cl_lock);
memcpy(&clp->cl_cb_conn, conn, sizeof(struct nfs4_cb_conn));
spin_unlock(&clp->cl_lock);
@@ -750,24 +771,14 @@ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
* If the slot is available, then mark it busy. Otherwise, set the
* thread for sleeping on the callback RPC wait queue.
*/
-static int nfsd41_cb_setup_sequence(struct nfs4_client *clp,
- struct rpc_task *task)
+static bool nfsd41_cb_get_slot(struct nfs4_client *clp, struct rpc_task *task)
{
- u32 *ptr = (u32 *)clp->cl_cb_session->se_sessionid.data;
- int status = 0;
-
- dprintk("%s: %u:%u:%u:%u\n", __func__,
- ptr[0], ptr[1], ptr[2], ptr[3]);
-
if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
rpc_sleep_on(&clp->cl_cb_waitq, task, NULL);
dprintk("%s slot is busy\n", __func__);
- status = -EAGAIN;
- goto out;
+ return false;
}
-out:
- dprintk("%s status=%d\n", __func__, status);
- return status;
+ return true;
}
/*
@@ -780,20 +791,19 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
struct nfs4_client *clp = dp->dl_client;
u32 minorversion = clp->cl_minorversion;
- int status = 0;
cb->cb_minorversion = minorversion;
if (minorversion) {
- status = nfsd41_cb_setup_sequence(clp, task);
- if (status) {
- if (status != -EAGAIN) {
- /* terminate rpc task */
- task->tk_status = status;
- task->tk_action = NULL;
- }
+ if (!nfsd41_cb_get_slot(clp, task))
return;
- }
}
+ spin_lock(&clp->cl_lock);
+ if (list_empty(&cb->cb_per_client)) {
+ /* This is the first call, not a restart */
+ cb->cb_done = false;
+ list_add(&cb->cb_per_client, &clp->cl_callbacks);
+ }
+ spin_unlock(&clp->cl_lock);
rpc_call_start(task);
}
@@ -829,15 +839,18 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
nfsd4_cb_done(task, calldata);
- if (current_rpc_client == NULL) {
- /* We're shutting down; give up. */
- /* XXX: err, or is it ok just to fall through
- * and rpc_restart_call? */
+ if (current_rpc_client != task->tk_client) {
+ /* We're shutting down or changing cl_cb_client; leave
+ * it to nfsd4_process_cb_update to restart the call if
+ * necessary. */
return;
}
+ if (cb->cb_done)
+ return;
switch (task->tk_status) {
case 0:
+ cb->cb_done = true;
return;
case -EBADHANDLE:
case -NFS4ERR_BAD_STATEID:
@@ -846,32 +859,30 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
break;
default:
/* Network partition? */
- atomic_set(&clp->cl_cb_set, 0);
- warn_no_callback_path(clp, task->tk_status);
- if (current_rpc_client != task->tk_client) {
- /* queue a callback on the new connection: */
- atomic_inc(&dp->dl_count);
- nfsd4_cb_recall(dp);
- return;
- }
+ nfsd4_mark_cb_down(clp, task->tk_status);
}
if (dp->dl_retries--) {
rpc_delay(task, 2*HZ);
task->tk_status = 0;
rpc_restart_call_prepare(task);
return;
- } else {
- atomic_set(&clp->cl_cb_set, 0);
- warn_no_callback_path(clp, task->tk_status);
}
+ nfsd4_mark_cb_down(clp, task->tk_status);
+ cb->cb_done = true;
}
static void nfsd4_cb_recall_release(void *calldata)
{
struct nfsd4_callback *cb = calldata;
+ struct nfs4_client *clp = cb->cb_clp;
struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
- nfs4_put_delegation(dp);
+ if (cb->cb_done) {
+ spin_lock(&clp->cl_lock);
+ list_del(&cb->cb_per_client);
+ spin_unlock(&clp->cl_lock);
+ nfs4_put_delegation(dp);
+ }
}
static const struct rpc_call_ops nfsd4_cb_recall_ops = {
@@ -906,16 +917,33 @@ void nfsd4_shutdown_callback(struct nfs4_client *clp)
flush_workqueue(callback_wq);
}
-void nfsd4_release_cb(struct nfsd4_callback *cb)
+static void nfsd4_release_cb(struct nfsd4_callback *cb)
{
if (cb->cb_ops->rpc_release)
cb->cb_ops->rpc_release(cb);
}
-void nfsd4_process_cb_update(struct nfsd4_callback *cb)
+/* requires cl_lock: */
+static struct nfsd4_conn * __nfsd4_find_backchannel(struct nfs4_client *clp)
+{
+ struct nfsd4_session *s;
+ struct nfsd4_conn *c;
+
+ list_for_each_entry(s, &clp->cl_sessions, se_perclnt) {
+ list_for_each_entry(c, &s->se_conns, cn_persession) {
+ if (c->cn_flags & NFS4_CDFC4_BACK)
+ return c;
+ }
+ }
+ return NULL;
+}
+
+static void nfsd4_process_cb_update(struct nfsd4_callback *cb)
{
struct nfs4_cb_conn conn;
struct nfs4_client *clp = cb->cb_clp;
+ struct nfsd4_session *ses = NULL;
+ struct nfsd4_conn *c;
int err;
/*
@@ -926,6 +954,10 @@ void nfsd4_process_cb_update(struct nfsd4_callback *cb)
rpc_shutdown_client(clp->cl_cb_client);
clp->cl_cb_client = NULL;
}
+ if (clp->cl_cb_conn.cb_xprt) {
+ svc_xprt_put(clp->cl_cb_conn.cb_xprt);
+ clp->cl_cb_conn.cb_xprt = NULL;
+ }
if (test_bit(NFSD4_CLIENT_KILL, &clp->cl_cb_flags))
return;
spin_lock(&clp->cl_lock);
@@ -936,11 +968,22 @@ void nfsd4_process_cb_update(struct nfsd4_callback *cb)
BUG_ON(!clp->cl_cb_flags);
clear_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_cb_flags);
memcpy(&conn, &cb->cb_clp->cl_cb_conn, sizeof(struct nfs4_cb_conn));
+ c = __nfsd4_find_backchannel(clp);
+ if (c) {
+ svc_xprt_get(c->cn_xprt);
+ conn.cb_xprt = c->cn_xprt;
+ ses = c->cn_session;
+ }
spin_unlock(&clp->cl_lock);
- err = setup_callback_client(clp, &conn);
- if (err)
+ err = setup_callback_client(clp, &conn, ses);
+ if (err) {
warn_no_callback_path(clp, err);
+ return;
+ }
+ /* Yay, the callback channel's back! Restart any callbacks: */
+ list_for_each_entry(cb, &clp->cl_callbacks, cb_per_client)
+ run_nfsd4_cb(cb);
}
void nfsd4_do_callback_rpc(struct work_struct *w)
@@ -965,10 +1008,11 @@ void nfsd4_do_callback_rpc(struct work_struct *w)
void nfsd4_cb_recall(struct nfs4_delegation *dp)
{
struct nfsd4_callback *cb = &dp->dl_recall;
+ struct nfs4_client *clp = dp->dl_client;
dp->dl_retries = 1;
cb->cb_op = dp;
- cb->cb_clp = dp->dl_client;
+ cb->cb_clp = clp;
cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL];
cb->cb_msg.rpc_argp = cb;
cb->cb_msg.rpc_resp = cb;
@@ -977,5 +1021,8 @@ void nfsd4_cb_recall(struct nfs4_delegation *dp)
cb->cb_ops = &nfsd4_cb_recall_ops;
dp->dl_retries = 1;
- queue_work(callback_wq, &dp->dl_recall.cb_work);
+ INIT_LIST_HEAD(&cb->cb_per_client);
+ cb->cb_done = true;
+
+ run_nfsd4_cb(&dp->dl_recall);
}
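
The callback rework threads every in-flight callback onto clp->cl_callbacks and tracks completion in cb_done, so that once a replacement backchannel is set up, nfsd4_process_cb_update() can requeue whatever never finished. The two halves of that pattern, condensed from the hunks above:

    /* first dispatch: remember the callback on its client */
    spin_lock(&clp->cl_lock);
    if (list_empty(&cb->cb_per_client)) {
    	cb->cb_done = false;
    	list_add(&cb->cb_per_client, &clp->cl_callbacks);
    }
    spin_unlock(&clp->cl_lock);

    /* after a new rpc client comes up: restart anything unfinished */
    list_for_each_entry(cb, &clp->cl_callbacks, cb_per_client)
    	run_nfsd4_cb(cb);	/* the queue_work() wrapper from above */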
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index f0695e815f0e..6d2c397d458b 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -33,10 +33,11 @@
*/
#include <linux/module.h>
-#include <linux/nfsd_idmap.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/slab.h>
+#include "idmap.h"
+#include "nfsd.h"
/*
* Cache entry
@@ -514,7 +515,7 @@ rqst_authname(struct svc_rqst *rqstp)
return clp->name;
}
-static int
+static __be32
idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen,
uid_t *id)
{
@@ -524,15 +525,15 @@ idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen
int ret;
if (namelen + 1 > sizeof(key.name))
- return -EINVAL;
+ return nfserr_badowner;
memcpy(key.name, name, namelen);
key.name[namelen] = '\0';
strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname));
ret = idmap_lookup(rqstp, nametoid_lookup, &key, &nametoid_cache, &item);
if (ret == -ENOENT)
- ret = -ESRCH; /* nfserr_badname */
+ return nfserr_badowner;
if (ret)
- return ret;
+ return nfserrno(ret);
*id = item->id;
cache_put(&item->h, &nametoid_cache);
return 0;
@@ -560,14 +561,14 @@ idmap_id_to_name(struct svc_rqst *rqstp, int type, uid_t id, char *name)
return ret;
}
-int
+__be32
nfsd_map_name_to_uid(struct svc_rqst *rqstp, const char *name, size_t namelen,
__u32 *id)
{
return idmap_name_to_id(rqstp, IDMAP_TYPE_USER, name, namelen, id);
}
-int
+__be32
nfsd_map_name_to_gid(struct svc_rqst *rqstp, const char *name, size_t namelen,
__u32 *id)
{
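
Changing the idmap entry points from int to __be32 keeps wire-order NFS status codes out of the errno namespace: a failed lookup now yields nfserr_badowner directly instead of a negative errno the XDR layer would have to translate. The convention, sketched (cache_lookup() is a hypothetical stand-in for the idmap cache machinery):

    static __be32 map_name_to_id(const char *name, u32 *id)
    {
    	int ret = cache_lookup(name, id);	/* hypothetical helper */

    	if (ret == -ENOENT)
    		return nfserr_badowner;	/* wire-order protocol status */
    	if (ret)
    		return nfserrno(ret);	/* translate remaining errnos */
    	return nfs_ok;
    }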
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 0cdfd022bb7b..db52546143d1 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -604,9 +604,7 @@ nfsd4_link(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
return status;
}
-static __be32
-nfsd4_lookupp(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
- void *arg)
+static __be32 nfsd4_do_lookupp(struct svc_rqst *rqstp, struct svc_fh *fh)
{
struct svc_fh tmp_fh;
__be32 ret;
@@ -615,13 +613,19 @@ nfsd4_lookupp(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
ret = exp_pseudoroot(rqstp, &tmp_fh);
if (ret)
return ret;
- if (tmp_fh.fh_dentry == cstate->current_fh.fh_dentry) {
+ if (tmp_fh.fh_dentry == fh->fh_dentry) {
fh_put(&tmp_fh);
return nfserr_noent;
}
fh_put(&tmp_fh);
- return nfsd_lookup(rqstp, &cstate->current_fh,
- "..", 2, &cstate->current_fh);
+ return nfsd_lookup(rqstp, fh, "..", 2, fh);
+}
+
+static __be32
+nfsd4_lookupp(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ void *arg)
+{
+ return nfsd4_do_lookupp(rqstp, &cstate->current_fh);
}
static __be32
@@ -769,10 +773,36 @@ nfsd4_secinfo(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
} else
secinfo->si_exp = exp;
dput(dentry);
+ if (cstate->minorversion)
+ /* See rfc 5661 section 2.6.3.1.1.8 */
+ fh_put(&cstate->current_fh);
return err;
}
static __be32
+nfsd4_secinfo_no_name(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ struct nfsd4_secinfo_no_name *sin)
+{
+ __be32 err;
+
+ switch (sin->sin_style) {
+ case NFS4_SECINFO_STYLE4_CURRENT_FH:
+ break;
+ case NFS4_SECINFO_STYLE4_PARENT:
+ err = nfsd4_do_lookupp(rqstp, &cstate->current_fh);
+ if (err)
+ return err;
+ break;
+ default:
+ return nfserr_inval;
+ }
+ exp_get(cstate->current_fh.fh_export);
+ sin->sin_exp = cstate->current_fh.fh_export;
+ fh_put(&cstate->current_fh);
+ return nfs_ok;
+}
+
+static __be32
nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd4_setattr *setattr)
{
@@ -974,8 +1004,8 @@ static const char *nfsd4_op_name(unsigned opnum);
* Also note, enforced elsewhere:
* - SEQUENCE other than as first op results in
* NFS4ERR_SEQUENCE_POS. (Enforced in nfsd4_sequence().)
- * - BIND_CONN_TO_SESSION must be the only op in its compound
- * (Will be enforced in nfsd4_bind_conn_to_session().)
+ * - BIND_CONN_TO_SESSION must be the only op in its compound.
+ * (Enforced in nfsd4_bind_conn_to_session().)
* - DESTROY_SESSION must be the final operation in a compound, if
* sessionid's in SEQUENCE and DESTROY_SESSION are the same.
* (Enforced in nfsd4_destroy_session().)
@@ -1126,10 +1156,6 @@ encode_op:
nfsd4_increment_op_stats(op->opnum);
}
- if (!rqstp->rq_usedeferral && status == nfserr_dropit) {
- dprintk("%s Dropit - send NFS4ERR_DELAY\n", __func__);
- status = nfserr_jukebox;
- }
resp->cstate.status = status;
fh_put(&resp->cstate.current_fh);
@@ -1300,6 +1326,11 @@ static struct nfsd4_operation nfsd4_ops[] = {
.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
.op_name = "OP_EXCHANGE_ID",
},
+ [OP_BIND_CONN_TO_SESSION] = {
+ .op_func = (nfsd4op_func)nfsd4_bind_conn_to_session,
+ .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
+ .op_name = "OP_BIND_CONN_TO_SESSION",
+ },
[OP_CREATE_SESSION] = {
.op_func = (nfsd4op_func)nfsd4_create_session,
.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
@@ -1320,6 +1351,10 @@ static struct nfsd4_operation nfsd4_ops[] = {
.op_flags = ALLOWED_WITHOUT_FH,
.op_name = "OP_RECLAIM_COMPLETE",
},
+ [OP_SECINFO_NO_NAME] = {
+ .op_func = (nfsd4op_func)nfsd4_secinfo_no_name,
+ .op_name = "OP_SECINFO_NO_NAME",
+ },
};
static const char *nfsd4_op_name(unsigned opnum)
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 7e26caab2a26..ffb59ef6f82f 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -302,7 +302,6 @@ purge_old(struct dentry *parent, struct dentry *child)
{
int status;
- /* note: we currently use this path only for minorversion 0 */
if (nfs4_has_reclaimed_state(child->d_name.name, false))
return 0;
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index fbd18c3074bb..d98d0213285d 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -230,7 +230,8 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
dp->dl_client = clp;
get_nfs4_file(fp);
dp->dl_file = fp;
- nfs4_file_get_access(fp, O_RDONLY);
+ dp->dl_vfs_file = find_readable_file(fp);
+ get_file(dp->dl_vfs_file);
dp->dl_flock = NULL;
dp->dl_type = type;
dp->dl_stateid.si_boot = boot_time;
@@ -252,6 +253,7 @@ nfs4_put_delegation(struct nfs4_delegation *dp)
if (atomic_dec_and_test(&dp->dl_count)) {
dprintk("NFSD: freeing dp %p\n",dp);
put_nfs4_file(dp->dl_file);
+ fput(dp->dl_vfs_file);
kmem_cache_free(deleg_slab, dp);
num_delegations--;
}
@@ -265,12 +267,10 @@ nfs4_put_delegation(struct nfs4_delegation *dp)
static void
nfs4_close_delegation(struct nfs4_delegation *dp)
{
- struct file *filp = find_readable_file(dp->dl_file);
-
dprintk("NFSD: close_delegation dp %p\n",dp);
+ /* XXX: do we even need this check?: */
if (dp->dl_flock)
- vfs_setlease(filp, F_UNLCK, &dp->dl_flock);
- nfs4_file_put_access(dp->dl_file, O_RDONLY);
+ vfs_setlease(dp->dl_vfs_file, F_UNLCK, &dp->dl_flock);
}
/* Called under the state lock. */
@@ -642,6 +642,7 @@ static void nfsd4_conn_lost(struct svc_xpt_user *u)
free_conn(c);
}
spin_unlock(&clp->cl_lock);
+ nfsd4_probe_callback(clp);
}
static struct nfsd4_conn *alloc_conn(struct svc_rqst *rqstp, u32 flags)
@@ -679,15 +680,12 @@ static int nfsd4_register_conn(struct nfsd4_conn *conn)
return register_xpt_user(conn->cn_xprt, &conn->cn_xpt_user);
}
-static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses)
+static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses, u32 dir)
{
struct nfsd4_conn *conn;
- u32 flags = NFS4_CDFC4_FORE;
int ret;
- if (ses->se_flags & SESSION4_BACK_CHAN)
- flags |= NFS4_CDFC4_BACK;
- conn = alloc_conn(rqstp, flags);
+ conn = alloc_conn(rqstp, dir);
if (!conn)
return nfserr_jukebox;
nfsd4_hash_conn(conn, ses);
@@ -698,6 +696,17 @@ static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses)
return nfs_ok;
}
+static __be32 nfsd4_new_conn_from_crses(struct svc_rqst *rqstp, struct nfsd4_session *ses)
+{
+ u32 dir = NFS4_CDFC4_FORE;
+
+ if (ses->se_flags & SESSION4_BACK_CHAN)
+ dir |= NFS4_CDFC4_BACK;
+
+ return nfsd4_new_conn(rqstp, ses, dir);
+}
+
+/* must be called under client_lock */
static void nfsd4_del_conns(struct nfsd4_session *s)
{
struct nfs4_client *clp = s->se_client;
@@ -749,6 +758,8 @@ static struct nfsd4_session *alloc_init_session(struct svc_rqst *rqstp, struct n
*/
slotsize = nfsd4_sanitize_slot_size(fchan->maxresp_cached);
numslots = nfsd4_get_drc_mem(slotsize, fchan->maxreqs);
+ if (numslots < 1)
+ return NULL;
new = alloc_session(slotsize, numslots);
if (!new) {
@@ -769,25 +780,30 @@ static struct nfsd4_session *alloc_init_session(struct svc_rqst *rqstp, struct n
idx = hash_sessionid(&new->se_sessionid);
spin_lock(&client_lock);
list_add(&new->se_hash, &sessionid_hashtbl[idx]);
+ spin_lock(&clp->cl_lock);
list_add(&new->se_perclnt, &clp->cl_sessions);
+ spin_unlock(&clp->cl_lock);
spin_unlock(&client_lock);
- status = nfsd4_new_conn(rqstp, new);
+ status = nfsd4_new_conn_from_crses(rqstp, new);
/* whoops: benny points out, status is ignored! (err, or bogus) */
if (status) {
free_session(&new->se_ref);
return NULL;
}
- if (!clp->cl_cb_session && (cses->flags & SESSION4_BACK_CHAN)) {
+ if (cses->flags & SESSION4_BACK_CHAN) {
struct sockaddr *sa = svc_addr(rqstp);
-
- clp->cl_cb_session = new;
- clp->cl_cb_conn.cb_xprt = rqstp->rq_xprt;
- svc_xprt_get(rqstp->rq_xprt);
+ /*
+ * This is a little silly; with sessions there's no real
+ * use for the callback address. Use the peer address
+ * as a reasonable default for now, but consider fixing
+ * the rpc client not to require an address in the
+ * future:
+ */
rpc_copy_addr((struct sockaddr *)&clp->cl_cb_conn.cb_addr, sa);
clp->cl_cb_conn.cb_addrlen = svc_addr_len(sa);
- nfsd4_probe_callback(clp);
}
+ nfsd4_probe_callback(clp);
return new;
}
@@ -817,7 +833,9 @@ static void
unhash_session(struct nfsd4_session *ses)
{
list_del(&ses->se_hash);
+ spin_lock(&ses->se_client->cl_lock);
list_del(&ses->se_perclnt);
+ spin_unlock(&ses->se_client->cl_lock);
}
/* must be called under the client_lock */
@@ -923,8 +941,10 @@ unhash_client_locked(struct nfs4_client *clp)
mark_client_expired(clp);
list_del(&clp->cl_lru);
+ spin_lock(&clp->cl_lock);
list_for_each_entry(ses, &clp->cl_sessions, se_perclnt)
list_del_init(&ses->se_hash);
+ spin_unlock(&clp->cl_lock);
}
static void
@@ -1051,12 +1071,13 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
memcpy(clp->cl_recdir, recdir, HEXDIR_LEN);
atomic_set(&clp->cl_refcount, 0);
- atomic_set(&clp->cl_cb_set, 0);
+ clp->cl_cb_state = NFSD4_CB_UNKNOWN;
INIT_LIST_HEAD(&clp->cl_idhash);
INIT_LIST_HEAD(&clp->cl_strhash);
INIT_LIST_HEAD(&clp->cl_openowners);
INIT_LIST_HEAD(&clp->cl_delegations);
INIT_LIST_HEAD(&clp->cl_lru);
+ INIT_LIST_HEAD(&clp->cl_callbacks);
spin_lock_init(&clp->cl_lock);
INIT_WORK(&clp->cl_cb_null.cb_work, nfsd4_do_callback_rpc);
clp->cl_time = get_seconds();
@@ -1132,54 +1153,55 @@ find_unconfirmed_client(clientid_t *clid)
return NULL;
}
-/*
- * Return 1 iff clp's clientid establishment method matches the use_exchange_id
- * parameter. Matching is based on the fact the at least one of the
- * EXCHGID4_FLAG_USE_{NON_PNFS,PNFS_MDS,PNFS_DS} flags must be set for v4.1
- *
- * FIXME: we need to unify the clientid namespaces for nfsv4.x
- * and correctly deal with client upgrade/downgrade in EXCHANGE_ID
- * and SET_CLIENTID{,_CONFIRM}
- */
-static inline int
-match_clientid_establishment(struct nfs4_client *clp, bool use_exchange_id)
+static bool clp_used_exchangeid(struct nfs4_client *clp)
{
- bool has_exchange_flags = (clp->cl_exchange_flags != 0);
- return use_exchange_id == has_exchange_flags;
-}
+ return clp->cl_exchange_flags != 0;
+}
static struct nfs4_client *
-find_confirmed_client_by_str(const char *dname, unsigned int hashval,
- bool use_exchange_id)
+find_confirmed_client_by_str(const char *dname, unsigned int hashval)
{
struct nfs4_client *clp;
list_for_each_entry(clp, &conf_str_hashtbl[hashval], cl_strhash) {
- if (same_name(clp->cl_recdir, dname) &&
- match_clientid_establishment(clp, use_exchange_id))
+ if (same_name(clp->cl_recdir, dname))
return clp;
}
return NULL;
}
static struct nfs4_client *
-find_unconfirmed_client_by_str(const char *dname, unsigned int hashval,
- bool use_exchange_id)
+find_unconfirmed_client_by_str(const char *dname, unsigned int hashval)
{
struct nfs4_client *clp;
list_for_each_entry(clp, &unconf_str_hashtbl[hashval], cl_strhash) {
- if (same_name(clp->cl_recdir, dname) &&
- match_clientid_establishment(clp, use_exchange_id))
+ if (same_name(clp->cl_recdir, dname))
return clp;
}
return NULL;
}
+static void rpc_svcaddr2sockaddr(struct sockaddr *sa, unsigned short family, union svc_addr_u *svcaddr)
+{
+ switch (family) {
+ case AF_INET:
+ ((struct sockaddr_in *)sa)->sin_family = AF_INET;
+ ((struct sockaddr_in *)sa)->sin_addr = svcaddr->addr;
+ return;
+ case AF_INET6:
+ ((struct sockaddr_in6 *)sa)->sin6_family = AF_INET6;
+ ((struct sockaddr_in6 *)sa)->sin6_addr = svcaddr->addr6;
+ return;
+ }
+}
+
static void
-gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, u32 scopeid)
+gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, struct svc_rqst *rqstp)
{
struct nfs4_cb_conn *conn = &clp->cl_cb_conn;
+ struct sockaddr *sa = svc_addr(rqstp);
+ u32 scopeid = rpc_get_scope_id(sa);
unsigned short expected_family;
/* Currently, we only support tcp and tcp6 for the callback channel */
@@ -1205,6 +1227,7 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, u32 scopeid)
conn->cb_prog = se->se_callback_prog;
conn->cb_ident = se->se_callback_ident;
+ rpc_svcaddr2sockaddr((struct sockaddr *)&conn->cb_saddr, expected_family, &rqstp->rq_daddr);
return;
out_err:
conn->cb_addr.ss_family = AF_UNSPEC;
@@ -1344,7 +1367,7 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
case SP4_NONE:
break;
case SP4_SSV:
- return nfserr_encr_alg_unsupp;
+ return nfserr_serverfault;
default:
BUG(); /* checked by xdr code */
case SP4_MACH_CRED:
@@ -1361,8 +1384,12 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
nfs4_lock_state();
status = nfs_ok;
- conf = find_confirmed_client_by_str(dname, strhashval, true);
+ conf = find_confirmed_client_by_str(dname, strhashval);
if (conf) {
+ if (!clp_used_exchangeid(conf)) {
+ status = nfserr_clid_inuse; /* XXX: ? */
+ goto out;
+ }
if (!same_verf(&verf, &conf->cl_verifier)) {
/* 18.35.4 case 8 */
if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) {
@@ -1403,7 +1430,7 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
goto out;
}
- unconf = find_unconfirmed_client_by_str(dname, strhashval, true);
+ unconf = find_unconfirmed_client_by_str(dname, strhashval);
if (unconf) {
/*
* Possible retry or client restart. Per 18.35.4 case 4,
@@ -1560,6 +1587,8 @@ nfsd4_create_session(struct svc_rqst *rqstp,
status = nfs_ok;
memcpy(cr_ses->sessionid.data, new->se_sessionid.data,
NFS4_MAX_SESSIONID_LEN);
+ memcpy(&cr_ses->fore_channel, &new->se_fchannel,
+ sizeof(struct nfsd4_channel_attrs));
cs_slot->sl_seqid++;
cr_ses->seqid = cs_slot->sl_seqid;
@@ -1581,6 +1610,45 @@ static bool nfsd4_last_compound_op(struct svc_rqst *rqstp)
return argp->opcnt == resp->opcnt;
}
+static __be32 nfsd4_map_bcts_dir(u32 *dir)
+{
+ switch (*dir) {
+ case NFS4_CDFC4_FORE:
+ case NFS4_CDFC4_BACK:
+ return nfs_ok;
+ case NFS4_CDFC4_FORE_OR_BOTH:
+ case NFS4_CDFC4_BACK_OR_BOTH:
+ *dir = NFS4_CDFC4_BOTH;
+ return nfs_ok;
+	}
+ return nfserr_inval;
+}
+
+__be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *cstate,
+ struct nfsd4_bind_conn_to_session *bcts)
+{
+ __be32 status;
+
+ if (!nfsd4_last_compound_op(rqstp))
+ return nfserr_not_only_op;
+ spin_lock(&client_lock);
+ cstate->session = find_in_sessionid_hashtbl(&bcts->sessionid);
+ /* Sorta weird: we only need the refcnt'ing because new_conn acquires
+	 * client_lock itself: */
+ if (cstate->session) {
+ nfsd4_get_session(cstate->session);
+ atomic_inc(&cstate->session->se_client->cl_refcount);
+ }
+ spin_unlock(&client_lock);
+ if (!cstate->session)
+ return nfserr_badsession;
+
+	status = nfsd4_map_bcts_dir(&bcts->dir);
+	if (!status)
+		nfsd4_new_conn(rqstp, cstate->session, bcts->dir);
+	return status;
+}
+
static bool nfsd4_compound_in_session(struct nfsd4_session *session, struct nfs4_sessionid *sid)
{
if (!session)
@@ -1619,8 +1687,7 @@ nfsd4_destroy_session(struct svc_rqst *r,
spin_unlock(&client_lock);
nfs4_lock_state();
- /* wait for callbacks */
- nfsd4_shutdown_callback(ses->se_client);
+ nfsd4_probe_callback_sync(ses->se_client);
nfs4_unlock_state();
nfsd4_del_conns(ses);
@@ -1733,8 +1800,12 @@ nfsd4_sequence(struct svc_rqst *rqstp,
out:
/* Hold a session reference until done processing the compound. */
if (cstate->session) {
+ struct nfs4_client *clp = session->se_client;
+
nfsd4_get_session(cstate->session);
- atomic_inc(&session->se_client->cl_refcount);
+ atomic_inc(&clp->cl_refcount);
+ if (clp->cl_cb_state == NFSD4_CB_DOWN)
+ seq->status_flags |= SEQ4_STATUS_CB_PATH_DOWN;
}
kfree(conn);
spin_unlock(&client_lock);
@@ -1775,7 +1846,6 @@ __be32
nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd4_setclientid *setclid)
{
- struct sockaddr *sa = svc_addr(rqstp);
struct xdr_netobj clname = {
.len = setclid->se_namelen,
.data = setclid->se_name,
@@ -1801,10 +1871,12 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
strhashval = clientstr_hashval(dname);
nfs4_lock_state();
- conf = find_confirmed_client_by_str(dname, strhashval, false);
+ conf = find_confirmed_client_by_str(dname, strhashval);
if (conf) {
/* RFC 3530 14.2.33 CASE 0: */
status = nfserr_clid_inuse;
+ if (clp_used_exchangeid(conf))
+ goto out;
if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) {
char addr_str[INET6_ADDRSTRLEN];
rpc_ntop((struct sockaddr *) &conf->cl_addr, addr_str,
@@ -1819,7 +1891,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
* has a description of SETCLIENTID request processing consisting
* of 5 bullet points, labeled as CASE0 - CASE4 below.
*/
- unconf = find_unconfirmed_client_by_str(dname, strhashval, false);
+ unconf = find_unconfirmed_client_by_str(dname, strhashval);
status = nfserr_resource;
if (!conf) {
/*
@@ -1876,7 +1948,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
* for consistent minorversion use throughout:
*/
new->cl_minorversion = 0;
- gen_callback(new, setclid, rpc_get_scope_id(sa));
+ gen_callback(new, setclid, rqstp);
add_to_unconfirmed(new, strhashval);
setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot;
setclid->se_clientid.cl_id = new->cl_clientid.cl_id;
@@ -1935,7 +2007,6 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
if (!same_creds(&conf->cl_cred, &unconf->cl_cred))
status = nfserr_clid_inuse;
else {
- atomic_set(&conf->cl_cb_set, 0);
nfsd4_change_callback(conf, &unconf->cl_cb_conn);
nfsd4_probe_callback(conf);
expire_client(unconf);
@@ -1964,7 +2035,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
unsigned int hash =
clientstr_hashval(unconf->cl_recdir);
conf = find_confirmed_client_by_str(unconf->cl_recdir,
- hash, false);
+ hash);
if (conf) {
nfsd4_remove_clid_dir(conf);
expire_client(conf);
@@ -2300,41 +2371,6 @@ void nfsd_break_deleg_cb(struct file_lock *fl)
nfsd4_cb_recall(dp);
}
-/*
- * The file_lock is being reapd.
- *
- * Called by locks_free_lock() with lock_flocks() held.
- */
-static
-void nfsd_release_deleg_cb(struct file_lock *fl)
-{
- struct nfs4_delegation *dp = (struct nfs4_delegation *)fl->fl_owner;
-
- dprintk("NFSD nfsd_release_deleg_cb: fl %p dp %p dl_count %d\n", fl,dp, atomic_read(&dp->dl_count));
-
- if (!(fl->fl_flags & FL_LEASE) || !dp)
- return;
- dp->dl_flock = NULL;
-}
-
-/*
- * Called from setlease() with lock_flocks() held
- */
-static
-int nfsd_same_client_deleg_cb(struct file_lock *onlist, struct file_lock *try)
-{
- struct nfs4_delegation *onlistd =
- (struct nfs4_delegation *)onlist->fl_owner;
- struct nfs4_delegation *tryd =
- (struct nfs4_delegation *)try->fl_owner;
-
- if (onlist->fl_lmops != try->fl_lmops)
- return 0;
-
- return onlistd->dl_client == tryd->dl_client;
-}
-
-
static
int nfsd_change_deleg_cb(struct file_lock **onlist, int arg)
{
@@ -2346,8 +2382,6 @@ int nfsd_change_deleg_cb(struct file_lock **onlist, int arg)
static const struct lock_manager_operations nfsd_lease_mng_ops = {
.fl_break = nfsd_break_deleg_cb,
- .fl_release_private = nfsd_release_deleg_cb,
- .fl_mylease = nfsd_same_client_deleg_cb,
.fl_change = nfsd_change_deleg_cb,
};
@@ -2514,8 +2548,6 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file
if (!fp->fi_fds[oflag]) {
status = nfsd_open(rqstp, cur_fh, S_IFREG, access,
&fp->fi_fds[oflag]);
- if (status == nfserr_dropit)
- status = nfserr_jukebox;
if (status)
return status;
}
@@ -2596,6 +2628,19 @@ nfs4_set_claim_prev(struct nfsd4_open *open)
open->op_stateowner->so_client->cl_firststate = 1;
}
+/* Should we give out recallable state?: */
+static bool nfsd4_cb_channel_good(struct nfs4_client *clp)
+{
+ if (clp->cl_cb_state == NFSD4_CB_UP)
+ return true;
+ /*
+ * In the sessions case, since we don't have to establish a
+ * separate connection for callbacks, we assume it's OK
+ * until we hear otherwise:
+ */
+ return clp->cl_minorversion && clp->cl_cb_state == NFSD4_CB_UNKNOWN;
+}
+
/*
* Attempt to hand out a delegation.
*/
@@ -2604,10 +2649,11 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
{
struct nfs4_delegation *dp;
struct nfs4_stateowner *sop = stp->st_stateowner;
- int cb_up = atomic_read(&sop->so_client->cl_cb_set);
+ int cb_up;
struct file_lock *fl;
int status, flag = 0;
+ cb_up = nfsd4_cb_channel_good(sop->so_client);
flag = NFS4_OPEN_DELEGATE_NONE;
open->op_recall = 0;
switch (open->op_claim_type) {
@@ -2655,7 +2701,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
dp->dl_flock = fl;
/* vfs_setlease checks to see if delegation should be handed out.
- * the lock_manager callbacks fl_mylease and fl_change are used
+ * the lock_manager callback fl_change is used
*/
if ((status = vfs_setlease(fl->fl_file, fl->fl_type, &fl))) {
dprintk("NFSD: setlease failed [%d], no delegation\n", status);
@@ -2794,7 +2840,7 @@ nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
renew_client(clp);
status = nfserr_cb_path_down;
if (!list_empty(&clp->cl_delegations)
- && !atomic_read(&clp->cl_cb_set))
+ && clp->cl_cb_state != NFSD4_CB_UP)
goto out;
status = nfs_ok;
out:
@@ -3081,9 +3127,10 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
if (status)
goto out;
renew_client(dp->dl_client);
- if (filpp)
+ if (filpp) {
*filpp = find_readable_file(dp->dl_file);
- BUG_ON(!*filpp);
+ BUG_ON(!*filpp);
+ }
} else { /* open or lock stateid */
stp = find_stateid(stateid, flags);
if (!stp)
@@ -4107,7 +4154,7 @@ nfs4_has_reclaimed_state(const char *name, bool use_exchange_id)
unsigned int strhashval = clientstr_hashval(name);
struct nfs4_client *clp;
- clp = find_confirmed_client_by_str(name, strhashval, use_exchange_id);
+ clp = find_confirmed_client_by_str(name, strhashval);
return clp ? 1 : 0;
}
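
The nfs4state.c changes above retire the atomic cl_cb_set flag in favor of a three-state callback-channel tracker (NFSD4_CB_UP / NFSD4_CB_UNKNOWN / NFSD4_CB_DOWN): delegations are only offered when the channel is known good, and a DOWN channel raises SEQ4_STATUS_CB_PATH_DOWN in SEQUENCE replies. A minimal standalone sketch of that decision logic, with hypothetical names mirroring the patch:

```c
/* Standalone restatement of the callback-channel logic added above;
 * names mirror the patch but this is illustrative, not kernel code. */
#include <stdbool.h>
#include <stdio.h>

enum cb_state { CB_UP, CB_UNKNOWN, CB_DOWN };

struct client {
	enum cb_state cb_state;
	unsigned int minorversion;	/* 0 => NFSv4.0, 1 => NFSv4.1 */
};

/* Should we give out recallable state (delegations)? */
static bool cb_channel_good(const struct client *clp)
{
	if (clp->cb_state == CB_UP)
		return true;
	/* With sessions (v4.1) the backchannel shares the client's
	 * connection, so an untested channel is assumed good: */
	return clp->minorversion && clp->cb_state == CB_UNKNOWN;
}

int main(void)
{
	struct client v40 = { CB_UNKNOWN, 0 };
	struct client v41 = { CB_UNKNOWN, 1 };
	struct client down = { CB_DOWN, 1 };

	printf("v4.0 unknown: %d\n", cb_channel_good(&v40));	/* 0 */
	printf("v4.1 unknown: %d\n", cb_channel_good(&v41));	/* 1 */
	/* CB_DOWN is also what sets SEQ4_STATUS_CB_PATH_DOWN above */
	printf("v4.1 down:    %d\n", cb_channel_good(&down));	/* 0 */
	return 0;
}
```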
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index f35a94a04026..956629b9cdc9 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -44,13 +44,14 @@
#include <linux/namei.h>
#include <linux/statfs.h>
#include <linux/utsname.h>
-#include <linux/nfsd_idmap.h>
-#include <linux/nfs4_acl.h>
#include <linux/sunrpc/svcauth_gss.h>
+#include "idmap.h"
+#include "acl.h"
#include "xdr4.h"
#include "vfs.h"
+
#define NFSDDBG_FACILITY NFSDDBG_XDR
/*
@@ -288,17 +289,17 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
len += XDR_QUADLEN(dummy32) << 2;
READMEM(buf, dummy32);
ace->whotype = nfs4_acl_get_whotype(buf, dummy32);
- host_err = 0;
+ status = nfs_ok;
if (ace->whotype != NFS4_ACL_WHO_NAMED)
ace->who = 0;
else if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP)
- host_err = nfsd_map_name_to_gid(argp->rqstp,
+ status = nfsd_map_name_to_gid(argp->rqstp,
buf, dummy32, &ace->who);
else
- host_err = nfsd_map_name_to_uid(argp->rqstp,
+ status = nfsd_map_name_to_uid(argp->rqstp,
buf, dummy32, &ace->who);
- if (host_err)
- goto out_nfserr;
+ if (status)
+ return status;
}
} else
*acl = NULL;
@@ -420,6 +421,21 @@ nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access
DECODE_TAIL;
}
+static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp, struct nfsd4_bind_conn_to_session *bcts)
+{
+ DECODE_HEAD;
+ u32 dummy;
+
+ READ_BUF(NFS4_MAX_SESSIONID_LEN + 8);
+ COPYMEM(bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN);
+ READ32(bcts->dir);
+ /* XXX: Perhaps Tom Tucker could help us figure out how we
+ * should be using ctsa_use_conn_in_rdma_mode: */
+ READ32(dummy);
+
+ DECODE_TAIL;
+}
+
static __be32
nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close)
{
@@ -847,6 +863,17 @@ nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp,
}
static __be32
+nfsd4_decode_secinfo_no_name(struct nfsd4_compoundargs *argp,
+ struct nfsd4_secinfo_no_name *sin)
+{
+ DECODE_HEAD;
+
+ READ_BUF(4);
+ READ32(sin->sin_style);
+ DECODE_TAIL;
+}
+
+static __be32
nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *setattr)
{
__be32 status;
@@ -1005,7 +1032,7 @@ static __be32
nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp,
struct nfsd4_exchange_id *exid)
{
- int dummy;
+ int dummy, tmp;
DECODE_HEAD;
READ_BUF(NFS4_VERIFIER_SIZE);
@@ -1053,15 +1080,23 @@ nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp,
/* ssp_hash_algs<> */
READ_BUF(4);
- READ32(dummy);
- READ_BUF(dummy);
- p += XDR_QUADLEN(dummy);
+ READ32(tmp);
+ while (tmp--) {
+ READ_BUF(4);
+ READ32(dummy);
+ READ_BUF(dummy);
+ p += XDR_QUADLEN(dummy);
+ }
/* ssp_encr_algs<> */
READ_BUF(4);
- READ32(dummy);
- READ_BUF(dummy);
- p += XDR_QUADLEN(dummy);
+ READ32(tmp);
+ while (tmp--) {
+ READ_BUF(4);
+ READ32(dummy);
+ READ_BUF(dummy);
+ p += XDR_QUADLEN(dummy);
+ }
/* ssp_window and ssp_num_gss_handles */
READ_BUF(8);
@@ -1339,7 +1374,7 @@ static nfsd4_dec nfsd41_dec_ops[] = {
/* new operations for NFSv4.1 */
[OP_BACKCHANNEL_CTL] = (nfsd4_dec)nfsd4_decode_notsupp,
- [OP_BIND_CONN_TO_SESSION]= (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_BIND_CONN_TO_SESSION]= (nfsd4_dec)nfsd4_decode_bind_conn_to_session,
[OP_EXCHANGE_ID] = (nfsd4_dec)nfsd4_decode_exchange_id,
[OP_CREATE_SESSION] = (nfsd4_dec)nfsd4_decode_create_session,
[OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session,
@@ -1350,7 +1385,7 @@ static nfsd4_dec nfsd41_dec_ops[] = {
[OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp,
[OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp,
[OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp,
- [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_secinfo_no_name,
[OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence,
[OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp,
[OP_TEST_STATEID] = (nfsd4_dec)nfsd4_decode_notsupp,
@@ -2309,8 +2344,6 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
case nfserr_resource:
nfserr = nfserr_toosmall;
goto fail;
- case nfserr_dropit:
- goto fail;
case nfserr_noent:
goto skip_entry;
default:
@@ -2365,6 +2398,21 @@ nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
return nfserr;
}
+static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_bind_conn_to_session *bcts)
+{
+ __be32 *p;
+
+ if (!nfserr) {
+ RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 8);
+ WRITEMEM(bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN);
+ WRITE32(bcts->dir);
+ /* XXX: ? */
+ WRITE32(0);
+ ADJUST_ARGS();
+ }
+ return nfserr;
+}
+
static __be32
nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_close *close)
{
@@ -2826,11 +2874,10 @@ nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
}
static __be32
-nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
- struct nfsd4_secinfo *secinfo)
+nfsd4_do_encode_secinfo(struct nfsd4_compoundres *resp,
+ __be32 nfserr, struct svc_export *exp)
{
int i = 0;
- struct svc_export *exp = secinfo->si_exp;
u32 nflavs;
struct exp_flavor_info *flavs;
struct exp_flavor_info def_flavs[2];
@@ -2892,6 +2939,20 @@ out:
return nfserr;
}
+static __be32
+nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
+ struct nfsd4_secinfo *secinfo)
+{
+ return nfsd4_do_encode_secinfo(resp, nfserr, secinfo->si_exp);
+}
+
+static __be32
+nfsd4_encode_secinfo_no_name(struct nfsd4_compoundres *resp, __be32 nfserr,
+ struct nfsd4_secinfo_no_name *secinfo)
+{
+ return nfsd4_do_encode_secinfo(resp, nfserr, secinfo->sin_exp);
+}
+
/*
* The SETATTR encode routine is special -- it always encodes a bitmap,
* regardless of the error status.
@@ -3076,13 +3137,9 @@ nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr,
WRITE32(seq->seqid);
WRITE32(seq->slotid);
WRITE32(seq->maxslots);
- /*
- * FIXME: for now:
- * target_maxslots = maxslots
- * status_flags = 0
- */
+ /* For now: target_maxslots = maxslots */
WRITE32(seq->maxslots);
- WRITE32(0);
+ WRITE32(seq->status_flags);
ADJUST_ARGS();
resp->cstate.datap = p; /* DRC cache data pointer */
@@ -3143,7 +3200,7 @@ static nfsd4_enc nfsd4_enc_ops[] = {
/* NFSv4.1 operations */
[OP_BACKCHANNEL_CTL] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_BIND_CONN_TO_SESSION] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_BIND_CONN_TO_SESSION] = (nfsd4_enc)nfsd4_encode_bind_conn_to_session,
[OP_EXCHANGE_ID] = (nfsd4_enc)nfsd4_encode_exchange_id,
[OP_CREATE_SESSION] = (nfsd4_enc)nfsd4_encode_create_session,
[OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_destroy_session,
@@ -3154,7 +3211,7 @@ static nfsd4_enc nfsd4_enc_ops[] = {
[OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop,
[OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop,
[OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_secinfo_no_name,
[OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence,
[OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop,
[OP_TEST_STATEID] = (nfsd4_enc)nfsd4_encode_noop,
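
The exchange_id fix above matters because ssp_hash_algs<> and ssp_encr_algs<> are XDR arrays of opaques: a count, then count (length, data) pairs, each padded to a 4-byte boundary; the old code skipped only a single pair regardless of the count. A userspace sketch of the corrected per-item walk (hypothetical helper, bounds-checked where the kernel relies on READ_BUF):

```c
/* Illustrative userspace walk over an XDR opaque<> array, mirroring the
 * corrected loop in nfsd4_decode_exchange_id(). Not kernel code. */
#include <arpa/inet.h>	/* ntohl */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define XDR_QUADLEN(n)	(((n) + 3) >> 2)	/* length in 4-byte words */

/* Skip "count, then count x (len, data) opaques"; returns bytes consumed
 * or -1 on truncation. */
static long skip_opaque_array(const uint8_t *buf, size_t avail)
{
	const uint8_t *p = buf;
	uint32_t count, len;

	if (avail < 4)
		return -1;
	memcpy(&count, p, 4); count = ntohl(count); p += 4; avail -= 4;
	while (count--) {
		if (avail < 4)
			return -1;
		memcpy(&len, p, 4); len = ntohl(len); p += 4; avail -= 4;
		size_t pad = (size_t)XDR_QUADLEN(len) * 4;
		if (avail < pad)
			return -1;
		p += pad;		/* data, padded to 4 bytes */
		avail -= pad;
	}
	return p - buf;
}

int main(void)
{
	/* count=2: a 3-byte opaque (padded to 4) and a 5-byte one (to 8) */
	uint8_t wire[] = { 0,0,0,2,  0,0,0,3, 'a','b','c',0,
			   0,0,0,5, 'h','e','l','l','o',0,0,0 };
	printf("consumed %ld of %zu bytes\n",
	       skip_opaque_array(wire, sizeof(wire)), sizeof(wire));
	return 0;
}
```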
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 4514ebbee4d6..33b3e2b06779 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -8,12 +8,12 @@
#include <linux/namei.h>
#include <linux/ctype.h>
-#include <linux/nfsd_idmap.h>
#include <linux/sunrpc/svcsock.h>
#include <linux/nfsd/syscall.h>
#include <linux/lockd/lockd.h>
#include <linux/sunrpc/clnt.h>
+#include "idmap.h"
#include "nfsd.h"
#include "cache.h"
@@ -127,6 +127,7 @@ static ssize_t nfsctl_transaction_write(struct file *file, const char __user *bu
static ssize_t nfsctl_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos)
{
+#ifdef CONFIG_NFSD_DEPRECATED
static int warned;
if (file->f_dentry->d_name.name[0] == '.' && !warned) {
printk(KERN_INFO
@@ -135,6 +136,7 @@ static ssize_t nfsctl_transaction_read(struct file *file, char __user *buf, size
current->comm, file->f_dentry->d_name.name);
warned = 1;
}
+#endif
if (! file->private_data) {
/* An attempt to read a transaction file without writing
* causes a 0-byte write so that the file can return
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 6b641cf2c19a..7ecfa2420307 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -158,6 +158,7 @@ void nfsd_lockd_shutdown(void);
#define nfserr_attrnotsupp cpu_to_be32(NFSERR_ATTRNOTSUPP)
#define nfserr_bad_xdr cpu_to_be32(NFSERR_BAD_XDR)
#define nfserr_openmode cpu_to_be32(NFSERR_OPENMODE)
+#define nfserr_badowner cpu_to_be32(NFSERR_BADOWNER)
#define nfserr_locks_held cpu_to_be32(NFSERR_LOCKS_HELD)
#define nfserr_op_illegal cpu_to_be32(NFSERR_OP_ILLEGAL)
#define nfserr_grace cpu_to_be32(NFSERR_GRACE)
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 08e17264784b..e15dc45fc5ec 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -735,9 +735,9 @@ nfserrno (int errno)
{ nfserr_stale, -ESTALE },
{ nfserr_jukebox, -ETIMEDOUT },
{ nfserr_jukebox, -ERESTARTSYS },
- { nfserr_dropit, -EAGAIN },
- { nfserr_dropit, -ENOMEM },
- { nfserr_badname, -ESRCH },
+ { nfserr_jukebox, -EAGAIN },
+ { nfserr_jukebox, -EWOULDBLOCK },
+ { nfserr_jukebox, -ENOMEM },
{ nfserr_io, -ETXTBSY },
{ nfserr_notsupp, -EOPNOTSUPP },
{ nfserr_toosmall, -ETOOSMALL },
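
The nfserrno() table change stops turning -EAGAIN and -ENOMEM into the internal "drop this request" status and maps them to NFSERR_JUKEBOX ("delay, retry later") instead; the separate -EWOULDBLOCK entry is redundant on Linux, where it equals -EAGAIN, but keeps the table explicit. A toy lookup mirroring just the jukebox rows (constants illustrative, not the kernel's byte-swapped __be32 values):

```c
/* Toy version of the revised errno -> NFS status mapping; only the
 * jukebox entries mirror the patch. */
#include <errno.h>
#include <stdio.h>

#define NFSERR_IO	5
#define NFSERR_JUKEBOX	10008	/* "try again later" */

static const struct { int nfserr; int errnum; } tbl[] = {
	{ NFSERR_JUKEBOX, -ETIMEDOUT },
	{ NFSERR_JUKEBOX, -EAGAIN },
	{ NFSERR_JUKEBOX, -EWOULDBLOCK },	/* == -EAGAIN on Linux */
	{ NFSERR_JUKEBOX, -ENOMEM },
};

static int nfserrno(int errnum)
{
	for (size_t i = 0; i < sizeof(tbl) / sizeof(tbl[0]); i++)
		if (tbl[i].errnum == errnum)
			return tbl[i].nfserr;
	return NFSERR_IO;	/* fallback, as in the kernel's table */
}

int main(void)
{
	printf("-ENOMEM -> %d\n", nfserrno(-ENOMEM));	/* 10008 */
	return 0;
}
```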
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 2bae1d86f5f2..18743c4d8bca 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -608,7 +608,7 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
/* Now call the procedure handler, and encode NFS status. */
nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp);
nfserr = map_new_errors(rqstp->rq_vers, nfserr);
- if (nfserr == nfserr_dropit) {
+ if (nfserr == nfserr_dropit || rqstp->rq_dropme) {
dprintk("nfsd: Dropping request; may be revisited later\n");
nfsd_cache_update(rqstp, RC_NOCACHE, NULL);
return 0;
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 39adc27b0685..3074656ba7bf 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -68,10 +68,12 @@ typedef struct {
struct nfsd4_callback {
void *cb_op;
struct nfs4_client *cb_clp;
+ struct list_head cb_per_client;
u32 cb_minorversion;
struct rpc_message cb_msg;
const struct rpc_call_ops *cb_ops;
struct work_struct cb_work;
+ bool cb_done;
};
struct nfs4_delegation {
@@ -81,6 +83,7 @@ struct nfs4_delegation {
atomic_t dl_count; /* ref count */
struct nfs4_client *dl_client;
struct nfs4_file *dl_file;
+ struct file *dl_vfs_file;
struct file_lock *dl_flock;
u32 dl_type;
time_t dl_time;
@@ -95,6 +98,7 @@ struct nfs4_delegation {
struct nfs4_cb_conn {
/* SETCLIENTID info */
struct sockaddr_storage cb_addr;
+ struct sockaddr_storage cb_saddr;
size_t cb_addrlen;
u32 cb_prog; /* used only in 4.0 case;
per-session otherwise */
@@ -146,6 +150,11 @@ struct nfsd4_create_session {
u32 gid;
};
+struct nfsd4_bind_conn_to_session {
+ struct nfs4_sessionid sessionid;
+ u32 dir;
+};
+
/* The single slot clientid cache structure */
struct nfsd4_clid_slot {
u32 sl_seqid;
@@ -235,9 +244,13 @@ struct nfs4_client {
unsigned long cl_cb_flags;
struct rpc_clnt *cl_cb_client;
u32 cl_cb_ident;
- atomic_t cl_cb_set;
+#define NFSD4_CB_UP 0
+#define NFSD4_CB_UNKNOWN 1
+#define NFSD4_CB_DOWN 2
+ int cl_cb_state;
struct nfsd4_callback cl_cb_null;
struct nfsd4_session *cl_cb_session;
+ struct list_head cl_callbacks; /* list of in-progress callbacks */
/* for all client information that callback code might need: */
spinlock_t cl_lock;
@@ -454,6 +467,7 @@ extern __be32 nfs4_check_open_reclaim(clientid_t *clid);
extern void nfs4_free_stateowner(struct kref *kref);
extern int set_callback_cred(void);
extern void nfsd4_probe_callback(struct nfs4_client *clp);
+extern void nfsd4_probe_callback_sync(struct nfs4_client *clp);
extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *);
extern void nfsd4_do_callback_rpc(struct work_struct *);
extern void nfsd4_cb_recall(struct nfs4_delegation *dp);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 3a359023c9f7..641117f2188d 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1,4 +1,3 @@
-#define MSNFS /* HACK HACK */
/*
* File operations used by nfsd. Some of these have been ripped from
* other parts of the kernel because they weren't exported, others
@@ -35,8 +34,8 @@
#endif /* CONFIG_NFSD_V3 */
#ifdef CONFIG_NFSD_V4
-#include <linux/nfs4_acl.h>
-#include <linux/nfsd_idmap.h>
+#include "acl.h"
+#include "idmap.h"
#endif /* CONFIG_NFSD_V4 */
#include "nfsd.h"
@@ -88,8 +87,9 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
.dentry = dget(dentry)};
int err = 0;
- while (d_mountpoint(path.dentry) && follow_down(&path))
- ;
+ err = follow_down(&path, false);
+ if (err < 0)
+ goto out;
exp2 = rqst_exp_get_by_name(rqstp, &path);
if (IS_ERR(exp2)) {
@@ -273,6 +273,13 @@ out:
return err;
}
+static int nfsd_break_lease(struct inode *inode)
+{
+ if (!S_ISREG(inode->i_mode))
+ return 0;
+ return break_lease(inode, O_WRONLY | O_NONBLOCK);
+}
+
/*
* Commit metadata changes to stable storage.
*/
@@ -375,16 +382,6 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
goto out;
}
- /*
- * If we are changing the size of the file, then
- * we need to break all leases.
- */
- host_err = break_lease(inode, O_WRONLY | O_NONBLOCK);
- if (host_err == -EWOULDBLOCK)
- host_err = -ETIMEDOUT;
- if (host_err) /* ENOMEM or EWOULDBLOCK */
- goto out_nfserr;
-
host_err = get_write_access(inode);
if (host_err)
goto out_nfserr;
@@ -425,7 +422,11 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
err = nfserr_notsync;
if (!check_guard || guardtime == inode->i_ctime.tv_sec) {
+ host_err = nfsd_break_lease(inode);
+ if (host_err)
+ goto out_nfserr;
fh_lock(fhp);
+
host_err = notify_change(dentry, iap);
err = nfserrno(host_err);
fh_unlock(fhp);
@@ -752,8 +753,6 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
*/
if (!(access & NFSD_MAY_NOT_BREAK_LEASE))
host_err = break_lease(inode, O_NONBLOCK | ((access & NFSD_MAY_WRITE) ? O_WRONLY : 0));
- if (host_err == -EWOULDBLOCK)
- host_err = -ETIMEDOUT;
if (host_err) /* NOMEM or WOULDBLOCK */
goto out_nfserr;
@@ -845,11 +844,6 @@ nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
struct page **pp = rqstp->rq_respages + rqstp->rq_resused;
struct page *page = buf->page;
size_t size;
- int ret;
-
- ret = buf->ops->confirm(pipe, buf);
- if (unlikely(ret))
- return ret;
size = sd->len;
@@ -879,15 +873,6 @@ static int nfsd_direct_splice_actor(struct pipe_inode_info *pipe,
return __splice_from_pipe(pipe, sd, nfsd_splice_actor);
}
-static inline int svc_msnfs(struct svc_fh *ffhp)
-{
-#ifdef MSNFS
- return (ffhp->fh_export->ex_flags & NFSEXP_MSNFS);
-#else
- return 0;
-#endif
-}
-
static __be32
nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
loff_t offset, struct kvec *vec, int vlen, unsigned long *count)
@@ -900,9 +885,6 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
err = nfserr_perm;
inode = file->f_path.dentry->d_inode;
- if (svc_msnfs(fhp) && !lock_may_read(inode, offset, *count))
- goto out;
-
if (file->f_op->splice_read && rqstp->rq_splice_ok) {
struct splice_desc sd = {
.len = 0,
@@ -927,7 +909,6 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
fsnotify_access(file);
} else
err = nfserrno(host_err);
-out:
return err;
}
@@ -992,14 +973,6 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
int stable = *stablep;
int use_wgather;
-#ifdef MSNFS
- err = nfserr_perm;
-
- if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
- (!lock_may_write(file->f_path.dentry->d_inode, offset, *cnt)))
- goto out;
-#endif
-
dentry = file->f_path.dentry;
inode = dentry->d_inode;
exp = fhp->fh_export;
@@ -1050,7 +1023,6 @@ out_nfserr:
err = 0;
else
err = nfserrno(host_err);
-out:
return err;
}
@@ -1670,6 +1642,12 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
err = nfserrno(host_err);
goto out_dput;
}
+ err = nfserr_noent;
+ if (!dold->d_inode)
+ goto out_drop_write;
+ host_err = nfsd_break_lease(dold->d_inode);
+ if (host_err)
+ goto out_drop_write;
host_err = vfs_link(dold, dirp, dnew);
if (!host_err) {
err = nfserrno(commit_metadata(ffhp));
@@ -1681,6 +1659,7 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
else
err = nfserrno(host_err);
}
+out_drop_write:
mnt_drop_write(tfhp->fh_export->ex_path.mnt);
out_dput:
dput(dnew);
@@ -1755,12 +1734,6 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
if (ndentry == trap)
goto out_dput_new;
- if (svc_msnfs(ffhp) &&
- ((odentry->d_count > 1) || (ndentry->d_count > 1))) {
- host_err = -EPERM;
- goto out_dput_new;
- }
-
host_err = -EXDEV;
if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt)
goto out_dput_new;
@@ -1768,15 +1741,17 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
if (host_err)
goto out_dput_new;
+ host_err = nfsd_break_lease(odentry->d_inode);
+ if (host_err)
+ goto out_drop_write;
host_err = vfs_rename(fdir, odentry, tdir, ndentry);
if (!host_err) {
host_err = commit_metadata(tfhp);
if (!host_err)
host_err = commit_metadata(ffhp);
}
-
+out_drop_write:
mnt_drop_write(ffhp->fh_export->ex_path.mnt);
-
out_dput_new:
dput(ndentry);
out_dput_old:
@@ -1839,18 +1814,14 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
if (host_err)
goto out_nfserr;
- if (type != S_IFDIR) { /* It's UNLINK */
-#ifdef MSNFS
- if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
- (rdentry->d_count > 1)) {
- host_err = -EPERM;
- } else
-#endif
+ host_err = nfsd_break_lease(rdentry->d_inode);
+ if (host_err)
+ goto out_put;
+ if (type != S_IFDIR)
host_err = vfs_unlink(dirp, rdentry);
- } else { /* It's RMDIR */
+ else
host_err = vfs_rmdir(dirp, rdentry);
- }
-
+out_put:
dput(rdentry);
if (!host_err)
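
The new nfsd_break_lease() helper centralizes the non-blocking lease break that setattr, link, rename and unlink now perform before modifying an inode. The same mechanics are visible from userspace through fcntl() leases; a small demo, assuming a local filesystem and a writable /tmp (the path is hypothetical):

```c
/* Userspace view of the lease mechanics nfsd_break_lease() drives:
 * while one process holds an F_RDLCK lease, a conflicting non-blocking
 * open for write fails with EWOULDBLOCK -- the same "don't sleep, ask
 * the holder to give it up" behaviour as break_lease(..., O_NONBLOCK). */
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/tmp/lease-demo";	/* hypothetical test file */
	int fd = open(path, O_RDONLY | O_CREAT, 0600);

	signal(SIGIO, SIG_IGN);	/* lease breaks notify the holder via SIGIO */
	if (fd < 0 || fcntl(fd, F_SETLEASE, F_RDLCK) < 0) {
		perror("lease setup");
		return 1;
	}
	if (fork() == 0) {
		/* Another process opening for write must either wait for
		 * the holder or, with O_NONBLOCK, fail immediately: */
		if (open(path, O_WRONLY | O_NONBLOCK) < 0)
			printf("conflicting open: %s\n", strerror(errno));
		_exit(0);
	}
	wait(NULL);
	fcntl(fd, F_SETLEASE, F_UNLCK);		/* release the lease */
	close(fd);
	return 0;
}
```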
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 60fce3dc5cb5..366401e1a536 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -311,6 +311,11 @@ struct nfsd4_secinfo {
struct svc_export *si_exp; /* response */
};
+struct nfsd4_secinfo_no_name {
+ u32 sin_style; /* request */
+ struct svc_export *sin_exp; /* response */
+};
+
struct nfsd4_setattr {
stateid_t sa_stateid; /* request */
u32 sa_bmval[3]; /* request */
@@ -373,8 +378,8 @@ struct nfsd4_sequence {
u32 cachethis; /* request */
#if 0
u32 target_maxslots; /* response */
- u32 status_flags; /* response */
#endif /* not yet */
+ u32 status_flags; /* response */
};
struct nfsd4_destroy_session {
@@ -422,6 +427,7 @@ struct nfsd4_op {
/* NFSv4.1 */
struct nfsd4_exchange_id exchange_id;
+ struct nfsd4_bind_conn_to_session bind_conn_to_session;
struct nfsd4_create_session create_session;
struct nfsd4_destroy_session destroy_session;
struct nfsd4_sequence sequence;
@@ -518,6 +524,7 @@ extern __be32 nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
struct nfsd4_sequence *seq);
extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp,
struct nfsd4_compound_state *, struct nfsd4_exchange_id *);
+extern __be32 nfsd4_bind_conn_to_session(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_bind_conn_to_session *);
extern __be32 nfsd4_create_session(struct svc_rqst *,
struct nfsd4_compound_state *,
struct nfsd4_create_session *);
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 70dfdd532b83..0994f6a76c07 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -1163,14 +1163,14 @@ nilfs_mount(struct file_system_type *fs_type, int flags,
{
struct nilfs_super_data sd;
struct super_block *s;
- fmode_t mode = FMODE_READ;
+ fmode_t mode = FMODE_READ | FMODE_EXCL;
struct dentry *root_dentry;
int err, s_new = false;
if (!(flags & MS_RDONLY))
mode |= FMODE_WRITE;
- sd.bdev = open_bdev_exclusive(dev_name, mode, fs_type);
+ sd.bdev = blkdev_get_by_path(dev_name, mode, fs_type);
if (IS_ERR(sd.bdev))
return ERR_CAST(sd.bdev);
@@ -1249,7 +1249,7 @@ nilfs_mount(struct file_system_type *fs_type, int flags,
}
if (!s_new)
- close_bdev_exclusive(sd.bdev, mode);
+ blkdev_put(sd.bdev, mode);
return root_dentry;
@@ -1258,7 +1258,7 @@ nilfs_mount(struct file_system_type *fs_type, int flags,
failed:
if (!s_new)
- close_bdev_exclusive(sd.bdev, mode);
+ blkdev_put(sd.bdev, mode);
return ERR_PTR(err);
}
diff --git a/fs/notify/fanotify/Kconfig b/fs/notify/fanotify/Kconfig
index 3ac36b7bf6b9..7dceff005a67 100644
--- a/fs/notify/fanotify/Kconfig
+++ b/fs/notify/fanotify/Kconfig
@@ -6,7 +6,7 @@ config FANOTIFY
---help---
Say Y here to enable fanotify support. fanotify is a file access
notification system which differs from inotify in that it sends
- and open file descriptor to the userspace listener along with
+ an open file descriptor to the userspace listener along with
the event.
If unsure, say Y.
diff --git a/fs/ntfs/Makefile b/fs/ntfs/Makefile
index 58b6be992544..4ff028fcfd6e 100644
--- a/fs/ntfs/Makefile
+++ b/fs/ntfs/Makefile
@@ -6,7 +6,7 @@ ntfs-objs := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \
index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \
unistr.o upcase.o
-EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.29\"
+EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.30\"
ifeq ($(CONFIG_NTFS_DEBUG),y)
EXTRA_CFLAGS += -DDEBUG
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 113ebd9f25a4..f4b1057abdd2 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -1,7 +1,7 @@
/*
* file.c - NTFS kernel file operations. Part of the Linux-NTFS project.
*
- * Copyright (c) 2001-2007 Anton Altaparmakov
+ * Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc.
*
* This program/include file is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as published
@@ -1380,15 +1380,14 @@ static inline void ntfs_set_next_iovec(const struct iovec **iovp,
* pages (out to offset + bytes), to emulate ntfs_copy_from_user()'s
* single-segment behaviour.
*
- * We call the same helper (__ntfs_copy_from_user_iovec_inatomic()) both
- * when atomic and when not atomic. This is ok because
- * __ntfs_copy_from_user_iovec_inatomic() calls __copy_from_user_inatomic()
- * and it is ok to call this when non-atomic.
- * Infact, the only difference between __copy_from_user_inatomic() and
+ * We call the same helper (__ntfs_copy_from_user_iovec_inatomic()) both when
+ * atomic and when not atomic. This is ok because it calls
+ * __copy_from_user_inatomic() and it is ok to call this when non-atomic. In
+ * fact, the only difference between __copy_from_user_inatomic() and
* __copy_from_user() is that the latter calls might_sleep() and the former
- * should not zero the tail of the buffer on error. And on many
- * architectures __copy_from_user_inatomic() is just defined to
- * __copy_from_user() so it makes no difference at all on those architectures.
+ * should not zero the tail of the buffer on error. And on many architectures
+ * __copy_from_user_inatomic() is just defined to __copy_from_user() so it
+ * makes no difference at all on those architectures.
*/
static inline size_t ntfs_copy_from_user_iovec(struct page **pages,
unsigned nr_pages, unsigned ofs, const struct iovec **iov,
@@ -1409,28 +1408,28 @@ static inline size_t ntfs_copy_from_user_iovec(struct page **pages,
if (unlikely(copied != len)) {
/* Do it the slow way. */
addr = kmap(*pages);
- copied = __ntfs_copy_from_user_iovec_inatomic(addr + ofs,
- *iov, *iov_ofs, len);
- /*
- * Zero the rest of the target like __copy_from_user().
- */
- memset(addr + ofs + copied, 0, len - copied);
- kunmap(*pages);
+ copied = __ntfs_copy_from_user_iovec_inatomic(addr +
+ ofs, *iov, *iov_ofs, len);
if (unlikely(copied != len))
goto err_out;
+ kunmap(*pages);
}
total += len;
+ ntfs_set_next_iovec(iov, iov_ofs, len);
bytes -= len;
if (!bytes)
break;
- ntfs_set_next_iovec(iov, iov_ofs, len);
ofs = 0;
} while (++pages < last_page);
out:
return total;
err_out:
- total += copied;
+ BUG_ON(copied > len);
/* Zero the rest of the target like __copy_from_user(). */
+ memset(addr + ofs + copied, 0, len - copied);
+ kunmap(*pages);
+ total += copied;
+ ntfs_set_next_iovec(iov, iov_ofs, copied);
while (++pages < last_page) {
bytes -= len;
if (!bytes)
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index a30ecacc01f2..29099a07b9fe 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -1,7 +1,7 @@
/*
* super.c - NTFS kernel super block handling. Part of the Linux-NTFS project.
*
- * Copyright (c) 2001-2007 Anton Altaparmakov
+ * Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc.
* Copyright (c) 2001,2002 Richard Russon
*
* This program/include file is free software; you can redistribute it and/or
@@ -3193,8 +3193,8 @@ static void __exit exit_ntfs_fs(void)
ntfs_sysctl(0);
}
-MODULE_AUTHOR("Anton Altaparmakov <aia21@cantab.net>");
-MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2007 Anton Altaparmakov");
+MODULE_AUTHOR("Anton Altaparmakov <anton@tuxera.com>");
+MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc.");
MODULE_VERSION(NTFS_VERSION);
MODULE_LICENSE("GPL");
#ifdef DEBUG
diff --git a/fs/ocfs2/Kconfig b/fs/ocfs2/Kconfig
index ab152c00cd3a..77a8de5f7119 100644
--- a/fs/ocfs2/Kconfig
+++ b/fs/ocfs2/Kconfig
@@ -1,7 +1,6 @@
config OCFS2_FS
tristate "OCFS2 file system support"
- depends on NET && SYSFS
- select CONFIGFS_FS
+ depends on NET && SYSFS && CONFIGFS_FS
select JBD2
select CRC32
select QUOTA
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index a6cc05302e9f..b108e863d8f6 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1729,7 +1729,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
goto out;
reg->hr_bdev = I_BDEV(filp->f_mapping->host);
- ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ);
+ ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ, NULL);
if (ret) {
reg->hr_bdev = NULL;
goto out;
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 6adafa576065..5dbc3062b4fd 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -137,9 +137,7 @@ check_gen:
}
result = d_obtain_alias(inode);
- if (!IS_ERR(result))
- d_set_d_op(result, &ocfs2_dentry_ops);
- else
+ if (IS_ERR(result))
mlog_errno(PTR_ERR(result));
bail:
@@ -175,8 +173,6 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
}
parent = d_obtain_alias(ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0));
- if (!IS_ERR(parent))
- d_set_d_op(parent, &ocfs2_dentry_ops);
bail_unlock:
ocfs2_inode_unlock(dir, 0);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index bdadbae09094..a6651956482e 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1989,28 +1989,32 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd,
return __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0);
}
-static long ocfs2_fallocate(struct inode *inode, int mode, loff_t offset,
+static long ocfs2_fallocate(struct file *file, int mode, loff_t offset,
loff_t len)
{
+ struct inode *inode = file->f_path.dentry->d_inode;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct ocfs2_space_resv sr;
int change_size = 1;
+ int cmd = OCFS2_IOC_RESVSP64;
+ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+ return -EOPNOTSUPP;
if (!ocfs2_writes_unwritten_extents(osb))
return -EOPNOTSUPP;
- if (S_ISDIR(inode->i_mode))
- return -ENODEV;
-
if (mode & FALLOC_FL_KEEP_SIZE)
change_size = 0;
+ if (mode & FALLOC_FL_PUNCH_HOLE)
+ cmd = OCFS2_IOC_UNRESVSP64;
+
sr.l_whence = 0;
sr.l_start = (s64)offset;
sr.l_len = (s64)len;
- return __ocfs2_change_file_space(NULL, inode, offset,
- OCFS2_IOC_RESVSP64, &sr, change_size);
+ return __ocfs2_change_file_space(NULL, inode, offset, cmd, &sr,
+ change_size);
}
int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos,
@@ -2606,7 +2610,6 @@ const struct inode_operations ocfs2_file_iops = {
.getxattr = generic_getxattr,
.listxattr = ocfs2_listxattr,
.removexattr = generic_removexattr,
- .fallocate = ocfs2_fallocate,
.fiemap = ocfs2_fiemap,
};
@@ -2638,6 +2641,7 @@ const struct file_operations ocfs2_fops = {
.flock = ocfs2_flock,
.splice_read = ocfs2_file_splice_read,
.splice_write = ocfs2_file_splice_write,
+ .fallocate = ocfs2_fallocate,
};
const struct file_operations ocfs2_dops = {
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index f935fd6600dd..4068c6c4c6f6 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -434,7 +434,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
* #1 and #2 can be simply solved by never taking the lock
* here for system files (which are the only type we read
* during mount). It's a heavier approach, but our main
- * concern is user-accesible files anyway.
+ * concern is user-accessible files anyway.
*
* #3 works itself out because we'll eventually take the
* cluster lock before trusting anything anyway.
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 30c523144452..849fb4a2e814 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -147,7 +147,6 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
spin_unlock(&oi->ip_lock);
bail_add:
- d_set_d_op(dentry, &ocfs2_dentry_ops);
ret = d_splice_alias(inode, dentry);
if (inode) {
@@ -415,7 +414,6 @@ static int ocfs2_mknod(struct inode *dir,
mlog_errno(status);
goto leave;
}
- d_set_d_op(dentry, &ocfs2_dentry_ops);
status = ocfs2_add_entry(handle, dentry, inode,
OCFS2_I(inode)->ip_blkno, parent_fe_bh,
@@ -743,7 +741,6 @@ static int ocfs2_link(struct dentry *old_dentry,
}
ihold(inode);
- d_set_d_op(dentry, &ocfs2_dentry_ops);
d_instantiate(dentry, inode);
out_commit:
@@ -1797,7 +1794,6 @@ static int ocfs2_symlink(struct inode *dir,
mlog_errno(status);
goto bail;
}
- d_set_d_op(dentry, &ocfs2_dentry_ops);
status = ocfs2_add_entry(handle, dentry, inode,
le64_to_cpu(fe->i_blkno), parent_fe_bh,
@@ -2462,7 +2458,6 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
goto out_commit;
}
- d_set_d_op(dentry, &ocfs2_dentry_ops);
d_instantiate(dentry, inode);
status = 0;
out_commit:
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 5fed60de7630..71998d4d61d5 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -1916,7 +1916,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
if (res->sr_bg_blkno) {
/* Attempt to short-circuit the usual search mechanism
* by jumping straight to the most recently used
- * allocation group. This helps us mantain some
+ * allocation group. This helps us maintain some
* contiguousness across allocations. */
status = ocfs2_search_one_group(ac, handle, bits_wanted,
min_bits, res, &bits_left);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 31c3ffd2f8d0..38f986d2447e 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -2096,6 +2096,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
sb->s_fs_info = osb;
sb->s_op = &ocfs2_sops;
+ sb->s_d_op = &ocfs2_dentry_ops;
sb->s_export_op = &ocfs2_export_ops;
sb->s_qcop = &ocfs2_quotactl_ops;
sb->dq_op = &ocfs2_quota_operations;
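
The d_set_d_op() removals throughout the ocfs2 diff rely on the per-superblock default installed above via sb->s_d_op, which d_alloc() applies to every new dentry; the pipefs hunk further down gets the same effect through mount_pseudo()'s new dentry-operations argument. A kernel-style sketch of the pattern (example_* names are hypothetical; not buildable out of tree):

```c
/* Kernel-style sketch: set the default dentry_operations once on the
 * superblock instead of calling d_set_d_op() at every dentry creation
 * site. Assumes the usual <linux/dcache.h>/<linux/fs.h> definitions. */
static const struct dentry_operations example_dentry_ops = {
	/* .d_revalidate, .d_hash, ... as the filesystem requires */
};

static int example_fill_super(struct super_block *sb, void *data, int silent)
{
	sb->s_d_op = &example_dentry_ops;	/* d_alloc() stamps this on
						 * every new dentry of sb */
	return 0;
}
```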
diff --git a/fs/open.c b/fs/open.c
index 4197b9ed023d..e52389e1f05b 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -223,7 +223,12 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
return -EINVAL;
/* Return error if mode is not supported */
- if (mode && !(mode & FALLOC_FL_KEEP_SIZE))
+ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+ return -EOPNOTSUPP;
+
+ /* Punch hole must have keep size set */
+ if ((mode & FALLOC_FL_PUNCH_HOLE) &&
+ !(mode & FALLOC_FL_KEEP_SIZE))
return -EOPNOTSUPP;
if (!(file->f_mode & FMODE_WRITE))
@@ -250,10 +255,10 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0))
return -EFBIG;
- if (!inode->i_op->fallocate)
+ if (!file->f_op->fallocate)
return -EOPNOTSUPP;
- return inode->i_op->fallocate(inode, mode, offset, len);
+ return file->f_op->fallocate(file, mode, offset, len);
}
SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len)
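
do_fallocate() now validates the whole mode mask, requires FALLOC_FL_KEEP_SIZE alongside FALLOC_FL_PUNCH_HOLE, and dispatches through file_operations rather than inode_operations. The rule as seen from userspace, assuming a filesystem with hole-punch support (ocfs2 above wires it to OCFS2_IOC_UNRESVSP64):

```c
/* Demonstrates the mode rule enforced above: PUNCH_HOLE without
 * KEEP_SIZE is rejected with EOPNOTSUPP; with it, the range is
 * deallocated while i_size stays unchanged. Needs fs support. */
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <linux/falloc.h>	/* FALLOC_FL_PUNCH_HOLE, FALLOC_FL_KEEP_SIZE */
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	int fd = open("demo.dat", O_RDWR | O_CREAT, 0644);

	if (fd < 0 || ftruncate(fd, 1 << 20) < 0)
		return 1;

	/* Invalid per the new check: punch hole without keep-size */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE, 0, 4096) < 0)
		printf("without KEEP_SIZE: %s\n", strerror(errno));

	/* Valid: deallocate 4 KiB at offset 0, file size stays 1 MiB */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      0, 4096) == 0)
		printf("hole punched, size unchanged\n");
	close(fd);
	return 0;
}
```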
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 0a8b0ad0c7e2..9c21119512b9 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -237,6 +237,13 @@ ssize_t part_size_show(struct device *dev,
return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects);
}
+ssize_t part_ro_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct hd_struct *p = dev_to_part(dev);
+ return sprintf(buf, "%d\n", p->policy ? 1 : 0);
+}
+
ssize_t part_alignment_offset_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
@@ -312,6 +319,7 @@ ssize_t part_fail_store(struct device *dev,
static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL);
static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL);
static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
+static DEVICE_ATTR(ro, S_IRUGO, part_ro_show, NULL);
static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL);
static DEVICE_ATTR(discard_alignment, S_IRUGO, part_discard_alignment_show,
NULL);
@@ -326,6 +334,7 @@ static struct attribute *part_attrs[] = {
&dev_attr_partition.attr,
&dev_attr_start.attr,
&dev_attr_size.attr,
+ &dev_attr_ro.attr,
&dev_attr_alignment_offset.attr,
&dev_attr_discard_alignment.attr,
&dev_attr_stat.attr,
@@ -372,6 +381,11 @@ static void delete_partition_rcu_cb(struct rcu_head *head)
put_device(part_to_dev(part));
}
+void __delete_partition(struct hd_struct *part)
+{
+ call_rcu(&part->rcu_head, delete_partition_rcu_cb);
+}
+
void delete_partition(struct gendisk *disk, int partno)
{
struct disk_part_tbl *ptbl = disk->part_tbl;
@@ -390,7 +404,7 @@ void delete_partition(struct gendisk *disk, int partno)
kobject_put(part->holder_dir);
device_del(part_to_dev(part));
- call_rcu(&part->rcu_head, delete_partition_rcu_cb);
+ hd_struct_put(part);
}
static ssize_t whole_disk_show(struct device *dev,
@@ -489,6 +503,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
if (!dev_get_uevent_suppress(ddev))
kobject_uevent(&pdev->kobj, KOBJ_ADD);
+ hd_ref_init(p);
return p;
out_free_info:
@@ -507,65 +522,6 @@ out_put:
return ERR_PTR(err);
}
-/* Not exported, helper to add_disk(). */
-void register_disk(struct gendisk *disk)
-{
- struct device *ddev = disk_to_dev(disk);
- struct block_device *bdev;
- struct disk_part_iter piter;
- struct hd_struct *part;
- int err;
-
- ddev->parent = disk->driverfs_dev;
-
- dev_set_name(ddev, disk->disk_name);
-
- /* delay uevents, until we scanned partition table */
- dev_set_uevent_suppress(ddev, 1);
-
- if (device_add(ddev))
- return;
- if (!sysfs_deprecated) {
- err = sysfs_create_link(block_depr, &ddev->kobj,
- kobject_name(&ddev->kobj));
- if (err) {
- device_del(ddev);
- return;
- }
- }
- disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj);
- disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);
-
- /* No minors to use for partitions */
- if (!disk_partitionable(disk))
- goto exit;
-
- /* No such device (e.g., media were just removed) */
- if (!get_capacity(disk))
- goto exit;
-
- bdev = bdget_disk(disk, 0);
- if (!bdev)
- goto exit;
-
- bdev->bd_invalidated = 1;
- err = blkdev_get(bdev, FMODE_READ);
- if (err < 0)
- goto exit;
- blkdev_put(bdev, FMODE_READ);
-
-exit:
- /* announce disk after possible partitions are created */
- dev_set_uevent_suppress(ddev, 0);
- kobject_uevent(&ddev->kobj, KOBJ_ADD);
-
- /* announce possible partitions */
- disk_part_iter_init(&piter, disk, 0);
- while ((part = disk_part_iter_next(&piter)))
- kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD);
- disk_part_iter_exit(&piter);
-}
-
static bool disk_unlock_native_capacity(struct gendisk *disk)
{
const struct block_device_operations *bdops = disk->fops;
@@ -728,33 +684,3 @@ fail:
}
EXPORT_SYMBOL(read_dev_sector);
-
-void del_gendisk(struct gendisk *disk)
-{
- struct disk_part_iter piter;
- struct hd_struct *part;
-
- /* invalidate stuff */
- disk_part_iter_init(&piter, disk,
- DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE);
- while ((part = disk_part_iter_next(&piter))) {
- invalidate_partition(disk, part->partno);
- delete_partition(disk, part->partno);
- }
- disk_part_iter_exit(&piter);
-
- invalidate_partition(disk, 0);
- blk_free_devt(disk_to_dev(disk)->devt);
- set_capacity(disk, 0);
- disk->flags &= ~GENHD_FL_UP;
- unlink_gendisk(disk);
- part_stat_set_all(&disk->part0, 0);
- disk->part0.stamp = 0;
-
- kobject_put(disk->part0.holder_dir);
- kobject_put(disk->slave_dir);
- disk->driverfs_dev = NULL;
- if (!sysfs_deprecated)
- sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
- device_del(disk_to_dev(disk));
-}
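
partitions/check.c gains a per-partition read-only attribute (part_ro_show) alongside reference-counted hd_struct teardown, while register_disk() and del_gendisk() move out of this file. The new attribute is plain text under sysfs; a trivial reader, with an example device path:

```c
/* Reads the new per-partition read-only flag added above; the device
 * path is an example, substitute a partition present on your system. */
#include <stdio.h>

int main(void)
{
	const char *attr = "/sys/class/block/sda1/ro";	/* example path */
	FILE *f = fopen(attr, "r");
	int ro;

	if (!f) {
		perror(attr);
		return 1;
	}
	if (fscanf(f, "%d", &ro) == 1)
		printf("%s -> %s\n", attr, ro ? "read-only" : "read-write");
	fclose(f);
	return 0;
}
```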
diff --git a/fs/pipe.c b/fs/pipe.c
index 68f1f8e4e23b..da42f7db50de 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -441,7 +441,7 @@ redo:
break;
}
if (do_wakeup) {
- wake_up_interruptible_sync(&pipe->wait);
+ wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM);
kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
}
pipe_wait(pipe);
@@ -450,7 +450,7 @@ redo:
/* Signal writers asynchronously that there is more room. */
if (do_wakeup) {
- wake_up_interruptible_sync(&pipe->wait);
+ wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM);
kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
}
if (ret > 0)
@@ -612,7 +612,7 @@ redo2:
break;
}
if (do_wakeup) {
- wake_up_interruptible_sync(&pipe->wait);
+ wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
do_wakeup = 0;
}
@@ -623,7 +623,7 @@ redo2:
out:
mutex_unlock(&inode->i_mutex);
if (do_wakeup) {
- wake_up_interruptible_sync(&pipe->wait);
+ wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
}
if (ret > 0)
@@ -715,7 +715,7 @@ pipe_release(struct inode *inode, int decr, int decw)
if (!pipe->readers && !pipe->writers) {
free_pipe_info(inode);
} else {
- wake_up_interruptible_sync(&pipe->wait);
+ wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM | POLLERR | POLLHUP);
kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
}
@@ -1004,7 +1004,6 @@ struct file *create_write_pipe(int flags)
goto err_inode;
path.mnt = mntget(pipe_mnt);
- d_set_d_op(path.dentry, &pipefs_dentry_operations);
d_instantiate(path.dentry, inode);
err = -ENFILE;
@@ -1266,7 +1265,8 @@ static const struct super_operations pipefs_ops = {
static struct dentry *pipefs_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
- return mount_pseudo(fs_type, "pipe:", &pipefs_ops, PIPEFS_MAGIC);
+ return mount_pseudo(fs_type, "pipe:", &pipefs_ops,
+ &pipefs_dentry_operations, PIPEFS_MAGIC);
}
static struct file_system_type pipe_fs_type = {
@@ -1292,7 +1292,7 @@ static int __init init_pipe_fs(void)
static void __exit exit_pipe_fs(void)
{
unregister_filesystem(&pipe_fs_type);
- mntput_long(pipe_mnt);
+ mntput(pipe_mnt);
}
fs_initcall(init_pipe_fs);
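
The pipe.c hunks replace plain sync wakeups with keyed ones so that, for instance, a reader draining the pipe wakes only waiters polling for POLLOUT rather than every poller on the wait queue. The event masks being keyed on are the ordinary poll(2) ones, as a quick demo shows:

```c
/* Shows the poll events the keyed wakeups above target: after a write,
 * the read end reports POLLIN and (while not full) the write end POLLOUT. */
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int p[2];

	if (pipe(p) < 0)
		return 1;
	write(p[1], "x", 1);

	struct pollfd fds[2] = {
		{ .fd = p[0], .events = POLLIN },
		{ .fd = p[1], .events = POLLOUT },
	};
	poll(fds, 2, 0);
	printf("read end POLLIN:   %d\n", !!(fds[0].revents & POLLIN));
	printf("write end POLLOUT: %d\n", !!(fds[1].revents & POLLOUT));
	return 0;
}
```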
diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index 6a0068841d96..15af6222f8a4 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -1,5 +1,5 @@
config PROC_FS
- bool "/proc file system support" if EMBEDDED
+ bool "/proc file system support" if EXPERT
default y
help
This is a virtual file system providing information about the status
@@ -40,7 +40,7 @@ config PROC_VMCORE
Exports the dump image of crashed kernel in ELF format.
config PROC_SYSCTL
- bool "Sysctl support (/proc/sys)" if EMBEDDED
+ bool "Sysctl support (/proc/sys)" if EXPERT
depends on PROC_FS
select SYSCTL
default y
@@ -61,7 +61,7 @@ config PROC_SYSCTL
config PROC_PAGE_MONITOR
default y
depends on PROC_FS && MMU
- bool "Enable /proc page monitoring" if EMBEDDED
+ bool "Enable /proc page monitoring" if EXPERT
help
Various /proc files exist to monitor process memory utilization:
/proc/pid/smaps, /proc/pid/clear_refs, /proc/pid/pagemap,
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index 288a49e098bf..df434c5f28fb 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -10,12 +10,12 @@ proc-$(CONFIG_MMU) := mmu.o task_mmu.o
proc-y += inode.o root.o base.o generic.o array.o \
proc_tty.o
proc-y += cmdline.o
+proc-y += consoles.o
proc-y += cpuinfo.o
proc-y += devices.o
proc-y += interrupts.o
proc-y += loadavg.o
proc-y += meminfo.o
-proc-y += proc_console.o
proc-y += stat.o
proc-y += uptime.o
proc-y += version.o
diff --git a/fs/proc/array.c b/fs/proc/array.c
index fff6572676ae..df2b703b9d0f 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -95,7 +95,7 @@ static inline void task_name(struct seq_file *m, struct task_struct *p)
get_task_comm(tcomm, p);
- seq_printf(m, "Name:\t");
+ seq_puts(m, "Name:\t");
end = m->buf + m->size;
buf = m->buf + m->count;
name = tcomm;
@@ -122,7 +122,7 @@ static inline void task_name(struct seq_file *m, struct task_struct *p)
buf++;
}
m->count = buf - m->buf;
- seq_printf(m, "\n");
+ seq_putc(m, '\n');
}
/*
@@ -208,7 +208,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
seq_printf(m, "%d ", GROUP_AT(group_info, g));
put_cred(cred);
- seq_printf(m, "\n");
+ seq_putc(m, '\n');
}
static void render_sigset_t(struct seq_file *m, const char *header,
@@ -216,7 +216,7 @@ static void render_sigset_t(struct seq_file *m, const char *header,
{
int i;
- seq_printf(m, "%s", header);
+ seq_puts(m, header);
i = _NSIG;
do {
@@ -230,7 +230,7 @@ static void render_sigset_t(struct seq_file *m, const char *header,
seq_printf(m, "%x", x);
} while (i >= 4);
- seq_printf(m, "\n");
+ seq_putc(m, '\n');
}
static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *ign,
@@ -291,12 +291,12 @@ static void render_cap_t(struct seq_file *m, const char *header,
{
unsigned __capi;
- seq_printf(m, "%s", header);
+ seq_puts(m, header);
CAP_FOR_EACH_U32(__capi) {
seq_printf(m, "%08x",
a->cap[(_KERNEL_CAPABILITY_U32S-1) - __capi]);
}
- seq_printf(m, "\n");
+ seq_putc(m, '\n');
}
static inline void task_cap(struct seq_file *m, struct task_struct *p)
@@ -329,12 +329,12 @@ static inline void task_context_switch_counts(struct seq_file *m,
static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
{
- seq_printf(m, "Cpus_allowed:\t");
+ seq_puts(m, "Cpus_allowed:\t");
seq_cpumask(m, &task->cpus_allowed);
- seq_printf(m, "\n");
- seq_printf(m, "Cpus_allowed_list:\t");
+ seq_putc(m, '\n');
+ seq_puts(m, "Cpus_allowed_list:\t");
seq_cpumask_list(m, &task->cpus_allowed);
- seq_printf(m, "\n");
+ seq_putc(m, '\n');
}
int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
@@ -535,15 +535,15 @@ int proc_tgid_stat(struct seq_file *m, struct pid_namespace *ns,
int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
{
- int size = 0, resident = 0, shared = 0, text = 0, lib = 0, data = 0;
+ unsigned long size = 0, resident = 0, shared = 0, text = 0, data = 0;
struct mm_struct *mm = get_task_mm(task);
if (mm) {
size = task_statm(mm, &shared, &text, &data, &resident);
mmput(mm);
}
- seq_printf(m, "%d %d %d %d %d %d %d\n",
- size, resident, shared, text, lib, data, 0);
+ seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n",
+ size, resident, shared, text, data);
return 0;
}
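
proc_pid_statm() now emits unsigned long values and hardwires the never-implemented lib and dt columns (fields 5 and 7) to zero. Parsing accordingly, with values in pages:

```c
/* Parses /proc/self/statm with the wider unsigned long fields used
 * above; values are in pages (scale by sysconf(_SC_PAGESIZE)). */
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	unsigned long size, resident, shared, text, lib, data, dt;
	FILE *f = fopen("/proc/self/statm", "r");

	if (!f)
		return 1;
	if (fscanf(f, "%lu %lu %lu %lu %lu %lu %lu", &size, &resident,
		   &shared, &text, &lib, &data, &dt) != 7) {
		fclose(f);
		return 1;
	}
	fclose(f);	/* lib and dt are the hardwired zeros */
	printf("vsize %lu kB, rss %lu kB\n",
	       size * (sysconf(_SC_PAGESIZE) / 1024),
	       resident * (sysconf(_SC_PAGESIZE) / 1024));
	return 0;
}
```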
diff --git a/fs/proc/base.c b/fs/proc/base.c
index b20962c71a52..9d096e82b201 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -373,26 +373,20 @@ static int lstats_show_proc(struct seq_file *m, void *v)
return -ESRCH;
seq_puts(m, "Latency Top version : v0.1\n");
for (i = 0; i < 32; i++) {
- if (task->latency_record[i].backtrace[0]) {
+ struct latency_record *lr = &task->latency_record[i];
+ if (lr->backtrace[0]) {
int q;
- seq_printf(m, "%i %li %li ",
- task->latency_record[i].count,
- task->latency_record[i].time,
- task->latency_record[i].max);
+ seq_printf(m, "%i %li %li",
+ lr->count, lr->time, lr->max);
for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
- char sym[KSYM_SYMBOL_LEN];
- char *c;
- if (!task->latency_record[i].backtrace[q])
+ unsigned long bt = lr->backtrace[q];
+ if (!bt)
break;
- if (task->latency_record[i].backtrace[q] == ULONG_MAX)
+ if (bt == ULONG_MAX)
break;
- sprint_symbol(sym, task->latency_record[i].backtrace[q]);
- c = strchr(sym, '+');
- if (c)
- *c = 0;
- seq_printf(m, "%s ", sym);
+ seq_printf(m, " %ps", (void *)bt);
}
- seq_printf(m, "\n");
+ seq_putc(m, '\n');
}
}
@@ -751,14 +745,7 @@ static int proc_single_show(struct seq_file *m, void *v)
static int proc_single_open(struct inode *inode, struct file *filp)
{
- int ret;
- ret = single_open(filp, proc_single_show, NULL);
- if (!ret) {
- struct seq_file *m = filp->private_data;
-
- m->private = inode;
- }
- return ret;
+ return single_open(filp, proc_single_show, inode);
}
static const struct file_operations proc_single_file_operations = {
@@ -1164,7 +1151,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
goto err_task_lock;
}
- if (oom_score_adj < task->signal->oom_score_adj &&
+ if (oom_score_adj < task->signal->oom_score_adj_min &&
!capable(CAP_SYS_RESOURCE)) {
err = -EACCES;
goto err_sighand;
@@ -1177,6 +1164,8 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
atomic_dec(&task->mm->oom_disable_count);
}
task->signal->oom_score_adj = oom_score_adj;
+ if (has_capability_noaudit(current, CAP_SYS_RESOURCE))
+ task->signal->oom_score_adj_min = oom_score_adj;
/*
* Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is
* always attainable.
@@ -1386,15 +1375,7 @@ sched_write(struct file *file, const char __user *buf,
static int sched_open(struct inode *inode, struct file *filp)
{
- int ret;
-
- ret = single_open(filp, sched_show, NULL);
- if (!ret) {
- struct seq_file *m = filp->private_data;
-
- m->private = inode;
- }
- return ret;
+ return single_open(filp, sched_show, inode);
}
static const struct file_operations proc_pid_sched_operations = {
@@ -1530,15 +1511,7 @@ static int comm_show(struct seq_file *m, void *v)
static int comm_open(struct inode *inode, struct file *filp)
{
- int ret;
-
- ret = single_open(filp, comm_show, NULL);
- if (!ret) {
- struct seq_file *m = filp->private_data;
-
- m->private = inode;
- }
- return ret;
+ return single_open(filp, comm_show, inode);
}
static const struct file_operations proc_pid_set_comm_operations = {
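
The three fs/proc/base.c open routines collapse because single_open() already stores its third argument in seq_file->private, so opening first and poking m->private afterwards was redundant. The resulting idiom, as a kernel-style sketch (example_* names hypothetical; assumes <linux/seq_file.h>):

```c
/* Kernel-style sketch of the single_open() idiom adopted above. */
static int example_show(struct seq_file *m, void *v)
{
	struct inode *inode = m->private;	/* set by single_open() */

	seq_printf(m, "ino %lu\n", inode->i_ino);
	return 0;
}

static int example_open(struct inode *inode, struct file *filp)
{
	/* The third argument lands in
	 * ((struct seq_file *)filp->private_data)->private. */
	return single_open(filp, example_show, inode);
}
```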
diff --git a/fs/proc/proc_console.c b/fs/proc/consoles.c
index 8a707609f528..eafc22ab1fdd 100644
--- a/fs/proc/proc_console.c
+++ b/fs/proc/consoles.c
@@ -106,9 +106,9 @@ static const struct file_operations proc_consoles_operations = {
.release = seq_release,
};
-static int register_proc_consoles(void)
+static int __init proc_consoles_init(void)
{
proc_create("consoles", 0, NULL, &proc_consoles_operations);
return 0;
}
-module_init(register_proc_consoles);
+module_init(proc_consoles_init);
diff --git a/fs/proc/devices.c b/fs/proc/devices.c
index 59ee7da959c9..b14347167c35 100644
--- a/fs/proc/devices.c
+++ b/fs/proc/devices.c
@@ -9,14 +9,14 @@ static int devinfo_show(struct seq_file *f, void *v)
if (i < CHRDEV_MAJOR_HASH_SIZE) {
if (i == 0)
- seq_printf(f, "Character devices:\n");
+ seq_puts(f, "Character devices:\n");
chrdev_show(f, i);
}
#ifdef CONFIG_BLOCK
else {
i -= CHRDEV_MAJOR_HASH_SIZE;
if (i == 0)
- seq_printf(f, "\nBlock devices:\n");
+ seq_puts(f, "\nBlock devices:\n");
blkdev_show(f, i);
}
#endif
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index f766be29d2c7..01e07f2a188f 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -425,13 +425,10 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
if (de->namelen != dentry->d_name.len)
continue;
if (!memcmp(dentry->d_name.name, de->name, de->namelen)) {
- unsigned int ino;
-
- ino = de->low_ino;
pde_get(de);
spin_unlock(&proc_subdir_lock);
error = -EINVAL;
- inode = proc_get_inode(dir->i_sb, ino, de);
+ inode = proc_get_inode(dir->i_sb, de);
goto out_unlock;
}
}
@@ -768,12 +765,7 @@ EXPORT_SYMBOL(proc_create_data);
static void free_proc_entry(struct proc_dir_entry *de)
{
- unsigned int ino = de->low_ino;
-
- if (ino < PROC_DYNAMIC_FIRST)
- return;
-
- release_inode_number(ino);
+ release_inode_number(de->low_ino);
if (S_ISLNK(de->mode))
kfree(de->data);
@@ -834,12 +826,9 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
wait_for_completion(de->pde_unload_completion);
- goto continue_removing;
+ spin_lock(&de->pde_unload_lock);
}
- spin_unlock(&de->pde_unload_lock);
-continue_removing:
- spin_lock(&de->pde_unload_lock);
while (!list_empty(&de->pde_openers)) {
struct pde_opener *pdeo;
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 6bcb926b101b..176ce4cda68a 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -416,12 +416,11 @@ static const struct file_operations proc_reg_file_ops_no_compat = {
};
#endif
-struct inode *proc_get_inode(struct super_block *sb, unsigned int ino,
- struct proc_dir_entry *de)
+struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
{
struct inode * inode;
- inode = iget_locked(sb, ino);
+ inode = iget_locked(sb, de->low_ino);
if (!inode)
return NULL;
if (inode->i_state & I_NEW) {
@@ -471,7 +470,7 @@ int proc_fill_super(struct super_block *s)
s->s_time_gran = 1;
pde_get(&proc_root);
- root_inode = proc_get_inode(s, PROC_ROOT_INO, &proc_root);
+ root_inode = proc_get_inode(s, &proc_root);
if (!root_inode)
goto out_no_root;
root_inode->i_uid = 0;
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 1f24a3eddd12..9ad561ded409 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -96,7 +96,8 @@ extern spinlock_t proc_subdir_lock;
struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *);
int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir);
unsigned long task_vsize(struct mm_struct *);
-int task_statm(struct mm_struct *, int *, int *, int *, int *);
+unsigned long task_statm(struct mm_struct *,
+ unsigned long *, unsigned long *, unsigned long *, unsigned long *);
void task_mem(struct seq_file *, struct mm_struct *);
static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde)
@@ -108,7 +109,7 @@ void pde_put(struct proc_dir_entry *pde);
extern struct vfsmount *proc_mnt;
int proc_fill_super(struct super_block *);
-struct inode *proc_get_inode(struct super_block *, unsigned int, struct proc_dir_entry *);
+struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
/*
* These are generic /proc routines that use the internal
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 6f37c391468d..d245cb23dd72 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -558,7 +558,7 @@ static int open_kcore(struct inode *inode, struct file *filp)
static const struct file_operations proc_kcore_operations = {
.read = read_kcore,
.open = open_kcore,
- .llseek = generic_file_llseek,
+ .llseek = default_llseek,
};
#ifdef CONFIG_MEMORY_HOTPLUG
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index a65239cfd97e..ed257d141568 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -101,6 +101,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
#ifdef CONFIG_MEMORY_FAILURE
"HardwareCorrupted: %5lu kB\n"
#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ "AnonHugePages: %8lu kB\n"
+#endif
,
K(i.totalram),
K(i.freeram),
@@ -128,7 +131,12 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
K(i.freeswap),
K(global_page_state(NR_FILE_DIRTY)),
K(global_page_state(NR_WRITEBACK)),
- K(global_page_state(NR_ANON_PAGES)),
+ K(global_page_state(NR_ANON_PAGES)
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ + global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) *
+ HPAGE_PMD_NR
+#endif
+ ),
K(global_page_state(NR_FILE_MAPPED)),
K(global_page_state(NR_SHMEM)),
K(global_page_state(NR_SLAB_RECLAIMABLE) +
@@ -151,6 +159,10 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
#ifdef CONFIG_MEMORY_FAILURE
,atomic_long_read(&mce_bad_pages) << (PAGE_SHIFT - 10)
#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ ,K(global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) *
+ HPAGE_PMD_NR)
+#endif
);
hugetlb_report_meminfo(m);
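
With CONFIG_TRANSPARENT_HUGEPAGE, meminfo now folds transparent huge pages into AnonPages (scaled by HPAGE_PMD_NR) and exports them separately as AnonHugePages. A minimal reader; the line is simply absent on kernels built without THP:

```c
/* Extracts the new AnonHugePages line from /proc/meminfo. */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/meminfo", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		if (!strncmp(line, "AnonHugePages:", 14))
			fputs(line, stdout);
	fclose(f);
	return 0;
}
```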
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 3b8b45660331..6d8e6a9e93ab 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -40,7 +40,7 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf,
ppage = pfn_to_page(pfn);
else
ppage = NULL;
- if (!ppage)
+ if (!ppage || PageSlab(ppage))
pcount = 0;
else
pcount = page_mapcount(ppage);
@@ -116,15 +116,17 @@ u64 stable_page_flags(struct page *page)
if (PageHuge(page))
u |= 1 << KPF_HUGE;
- u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked);
-
/*
- * Caveats on high order pages:
- * PG_buddy will only be set on the head page; SLUB/SLQB do the same
- * for PG_slab; SLOB won't set PG_slab at all on compound pages.
+ * Caveats on high order pages: page->_count will only be set
+ * -1 on the head page; SLUB/SLQB do the same for PG_slab;
+ * SLOB won't set PG_slab at all on compound pages.
*/
+ if (PageBuddy(page))
+ u |= 1 << KPF_BUDDY;
+
+ u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked);
+
u |= kpf_copy_bit(k, KPF_SLAB, PG_slab);
- u |= kpf_copy_bit(k, KPF_BUDDY, PG_buddy);
u |= kpf_copy_bit(k, KPF_ERROR, PG_error);
u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty);
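
stable_page_flags() now derives KPF_BUDDY from PageBuddy() (a head-page _count of -1) instead of the removed PG_buddy page flag, and kpagecount reports zero for slab pages. /proc/kpageflags is a root-readable array of one u64 bitmask per PFN, with KPF_BUDDY at bit 10 per Documentation/vm/pagemap.txt; a counting sketch:

```c
/* Counts free (buddy) pages via /proc/kpageflags. Requires root.
 * Bit numbers follow Documentation/vm/pagemap.txt; KPF_BUDDY is 10. */
#include <stdint.h>
#include <stdio.h>

#define KPF_BUDDY 10

int main(void)
{
	FILE *f = fopen("/proc/kpageflags", "rb");
	uint64_t flags, buddy = 0, total = 0;

	if (!f)
		return 1;
	while (fread(&flags, sizeof(flags), 1, f) == 1) {
		total++;
		if (flags & (1ULL << KPF_BUDDY))
			buddy++;
	}
	fclose(f);
	printf("%llu of %llu pages in buddy allocator\n",
	       (unsigned long long)buddy, (unsigned long long)total);
	return 0;
}
```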
diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c
index 83adcc869437..cb761f010300 100644
--- a/fs/proc/proc_tty.c
+++ b/fs/proc/proc_tty.c
@@ -36,27 +36,27 @@ static void show_tty_range(struct seq_file *m, struct tty_driver *p,
}
switch (p->type) {
case TTY_DRIVER_TYPE_SYSTEM:
- seq_printf(m, "system");
+ seq_puts(m, "system");
if (p->subtype == SYSTEM_TYPE_TTY)
- seq_printf(m, ":/dev/tty");
+ seq_puts(m, ":/dev/tty");
else if (p->subtype == SYSTEM_TYPE_SYSCONS)
- seq_printf(m, ":console");
+ seq_puts(m, ":console");
else if (p->subtype == SYSTEM_TYPE_CONSOLE)
- seq_printf(m, ":vtmaster");
+ seq_puts(m, ":vtmaster");
break;
case TTY_DRIVER_TYPE_CONSOLE:
- seq_printf(m, "console");
+ seq_puts(m, "console");
break;
case TTY_DRIVER_TYPE_SERIAL:
- seq_printf(m, "serial");
+ seq_puts(m, "serial");
break;
case TTY_DRIVER_TYPE_PTY:
if (p->subtype == PTY_TYPE_MASTER)
- seq_printf(m, "pty:master");
+ seq_puts(m, "pty:master");
else if (p->subtype == PTY_TYPE_SLAVE)
- seq_printf(m, "pty:slave");
+ seq_puts(m, "pty:slave");
else
- seq_printf(m, "pty");
+ seq_puts(m, "pty");
break;
default:
seq_printf(m, "type:%d.%d", p->type, p->subtype);
@@ -74,19 +74,19 @@ static int show_tty_driver(struct seq_file *m, void *v)
/* pseudo-drivers first */
seq_printf(m, "%-20s /dev/%-8s ", "/dev/tty", "tty");
seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 0);
- seq_printf(m, "system:/dev/tty\n");
+ seq_puts(m, "system:/dev/tty\n");
seq_printf(m, "%-20s /dev/%-8s ", "/dev/console", "console");
seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 1);
- seq_printf(m, "system:console\n");
+ seq_puts(m, "system:console\n");
#ifdef CONFIG_UNIX98_PTYS
seq_printf(m, "%-20s /dev/%-8s ", "/dev/ptmx", "ptmx");
seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 2);
- seq_printf(m, "system\n");
+ seq_puts(m, "system\n");
#endif
#ifdef CONFIG_VT
seq_printf(m, "%-20s /dev/%-8s ", "/dev/vc/0", "vc/0");
seq_printf(m, "%3d %7d ", TTY_MAJOR, 0);
- seq_printf(m, "system:vtmaster\n");
+ seq_puts(m, "system:vtmaster\n");
#endif
}
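
The conversions above follow one rule: seq_printf() has to scan its format string even when it contains no conversions, so constant strings go through seq_puts() and single characters through seq_putc(). The same trade-off exists in stdio, which makes for a runnable analogy (illustrative only):

    #include <stdio.h>

    int main(void)
    {
        /* fprintf() parses the format string even with no conversions;
         * fputs()/fputc() copy the bytes directly. */
        fprintf(stdout, "console\n");           /* before-style */
        fputs("console\n", stdout);             /* after-style, same output */
        fputc('\n', stdout);
        fprintf(stdout, "type:%d.%d\n", 4, 1);  /* conversions still need printf */
        return 0;
    }
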
diff --git a/fs/proc/softirqs.c b/fs/proc/softirqs.c
index 37994737c983..62604be9f58d 100644
--- a/fs/proc/softirqs.c
+++ b/fs/proc/softirqs.c
@@ -10,16 +10,16 @@ static int show_softirqs(struct seq_file *p, void *v)
{
int i, j;
- seq_printf(p, " ");
+ seq_puts(p, " ");
for_each_possible_cpu(i)
seq_printf(p, "CPU%-8d", i);
- seq_printf(p, "\n");
+ seq_putc(p, '\n');
for (i = 0; i < NR_SOFTIRQS; i++) {
seq_printf(p, "%12s:", softirq_to_name[i]);
for_each_possible_cpu(j)
seq_printf(p, " %10u", kstat_softirqs_cpu(i, j));
- seq_printf(p, "\n");
+ seq_putc(p, '\n');
}
return 0;
}
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index e15a19c93bae..1cffa2b8a2fc 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -126,7 +126,7 @@ static int show_stat(struct seq_file *p, void *v)
for (i = 0; i < NR_SOFTIRQS; i++)
seq_printf(p, " %u", per_softirq_sums[i]);
- seq_printf(p, "\n");
+ seq_putc(p, '\n');
return 0;
}
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index c126c83b9a45..60b914860f81 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -66,8 +66,9 @@ unsigned long task_vsize(struct mm_struct *mm)
return PAGE_SIZE * mm->total_vm;
}
-int task_statm(struct mm_struct *mm, int *shared, int *text,
- int *data, int *resident)
+unsigned long task_statm(struct mm_struct *mm,
+ unsigned long *shared, unsigned long *text,
+ unsigned long *data, unsigned long *resident)
{
*shared = get_mm_counter(mm, MM_FILEPAGES);
*text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
@@ -417,7 +418,8 @@ static int show_smap(struct seq_file *m, void *v)
"Anonymous: %8lu kB\n"
"Swap: %8lu kB\n"
"KernelPageSize: %8lu kB\n"
- "MMUPageSize: %8lu kB\n",
+ "MMUPageSize: %8lu kB\n"
+ "Locked: %8lu kB\n",
(vma->vm_end - vma->vm_start) >> 10,
mss.resident >> 10,
(unsigned long)(mss.pss >> (10 + PSS_SHIFT)),
@@ -429,7 +431,9 @@ static int show_smap(struct seq_file *m, void *v)
mss.anonymous >> 10,
mss.swap >> 10,
vma_kernel_pagesize(vma) >> 10,
- vma_mmu_pagesize(vma) >> 10);
+ vma_mmu_pagesize(vma) >> 10,
+ (vma->vm_flags & VM_LOCKED) ?
+ (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0);
if (m->count < m->size) /* vma is copied successfully */
m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index cb6306e63843..b535d3e5d5f1 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -92,13 +92,14 @@ unsigned long task_vsize(struct mm_struct *mm)
return vsize;
}
-int task_statm(struct mm_struct *mm, int *shared, int *text,
- int *data, int *resident)
+unsigned long task_statm(struct mm_struct *mm,
+ unsigned long *shared, unsigned long *text,
+ unsigned long *data, unsigned long *resident)
{
struct vm_area_struct *vma;
struct vm_region *region;
struct rb_node *p;
- int size = kobjsize(mm);
+ unsigned long size = kobjsize(mm);
down_read(&mm->mmap_sem);
for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
diff --git a/fs/read_write.c b/fs/read_write.c
index 5d431bacbea9..5520f8ad5504 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -30,18 +30,9 @@ const struct file_operations generic_ro_fops = {
EXPORT_SYMBOL(generic_ro_fops);
-static int
-__negative_fpos_check(struct file *file, loff_t pos, size_t count)
+static inline int unsigned_offsets(struct file *file)
{
- /*
- * pos or pos+count is negative here, check overflow.
- * too big "count" will be caught in rw_verify_area().
- */
- if ((pos < 0) && (pos + count < pos))
- return -EOVERFLOW;
- if (file->f_mode & FMODE_UNSIGNED_OFFSET)
- return 0;
- return -EINVAL;
+ return file->f_mode & FMODE_UNSIGNED_OFFSET;
}
/**
@@ -75,7 +66,7 @@ generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin)
break;
}
- if (offset < 0 && __negative_fpos_check(file, offset, 0))
+ if (offset < 0 && !unsigned_offsets(file))
return -EINVAL;
if (offset > inode->i_sb->s_maxbytes)
return -EINVAL;
@@ -152,7 +143,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin)
offset += file->f_pos;
}
retval = -EINVAL;
- if (offset >= 0 || !__negative_fpos_check(file, offset, 0)) {
+ if (offset >= 0 || unsigned_offsets(file)) {
if (offset != file->f_pos) {
file->f_pos = offset;
file->f_version = 0;
@@ -252,9 +243,13 @@ int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count
if (unlikely((ssize_t) count < 0))
return retval;
pos = *ppos;
- if (unlikely((pos < 0) || (loff_t) (pos + count) < 0)) {
- retval = __negative_fpos_check(file, pos, count);
- if (retval)
+ if (unlikely(pos < 0)) {
+ if (!unsigned_offsets(file))
+ return retval;
+ if (count >= -pos) /* both values are in 0..LLONG_MAX */
+ return -EOVERFLOW;
+ } else if (unlikely((loff_t) (pos + count) < 0)) {
+ if (!unsigned_offsets(file))
return retval;
}
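
The rewritten check is worth unpacking: for a file opened with FMODE_UNSIGNED_OFFSET, a negative pos is really a large unsigned offset, and pos + count wraps past 2^64 exactly when count >= -pos. A standalone sketch of that predicate, with types simplified from the kernel's loff_t/size_t:

    #include <stdio.h>

    /* Both sides of the compare are non-negative once -pos is computed
     * in unsigned arithmetic, so the comparison itself cannot overflow. */
    static int would_overflow(long long pos, unsigned long long count)
    {
        if (pos >= 0)
            return 0;
        return count >= -(unsigned long long)pos;
    }

    int main(void)
    {
        printf("%d\n", would_overflow(-8, 8));  /* 1: offset wraps past 2^64 */
        printf("%d\n", would_overflow(-8, 7));  /* 0: stays below the wrap */
        return 0;
    }
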
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index d31bce1a9f90..3eea859e6990 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2551,8 +2551,6 @@ static int release_journal_dev(struct super_block *super,
result = 0;
if (journal->j_dev_bd != NULL) {
- if (journal->j_dev_bd->bd_dev != super->s_dev)
- bd_release(journal->j_dev_bd);
result = blkdev_put(journal->j_dev_bd, journal->j_dev_mode);
journal->j_dev_bd = NULL;
}
@@ -2570,7 +2568,7 @@ static int journal_init_dev(struct super_block *super,
{
int result;
dev_t jdev;
- fmode_t blkdev_mode = FMODE_READ | FMODE_WRITE;
+ fmode_t blkdev_mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL;
char b[BDEVNAME_SIZE];
result = 0;
@@ -2584,7 +2582,10 @@ static int journal_init_dev(struct super_block *super,
/* there is no "jdev" option and journal is on separate device */
if ((!jdev_name || !jdev_name[0])) {
- journal->j_dev_bd = open_by_devnum(jdev, blkdev_mode);
+ if (jdev == super->s_dev)
+ blkdev_mode &= ~FMODE_EXCL;
+ journal->j_dev_bd = blkdev_get_by_dev(jdev, blkdev_mode,
+ journal);
journal->j_dev_mode = blkdev_mode;
if (IS_ERR(journal->j_dev_bd)) {
result = PTR_ERR(journal->j_dev_bd);
@@ -2593,22 +2594,14 @@ static int journal_init_dev(struct super_block *super,
"cannot init journal device '%s': %i",
__bdevname(jdev, b), result);
return result;
- } else if (jdev != super->s_dev) {
- result = bd_claim(journal->j_dev_bd, journal);
- if (result) {
- blkdev_put(journal->j_dev_bd, blkdev_mode);
- return result;
- }
-
+ } else if (jdev != super->s_dev)
set_blocksize(journal->j_dev_bd, super->s_blocksize);
- }
return 0;
}
journal->j_dev_mode = blkdev_mode;
- journal->j_dev_bd = open_bdev_exclusive(jdev_name,
- blkdev_mode, journal);
+ journal->j_dev_bd = blkdev_get_by_path(jdev_name, blkdev_mode, journal);
if (IS_ERR(journal->j_dev_bd)) {
result = PTR_ERR(journal->j_dev_bd);
journal->j_dev_bd = NULL;
diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c
index adbc6f538515..45de98b59466 100644
--- a/fs/reiserfs/prints.c
+++ b/fs/reiserfs/prints.c
@@ -586,13 +586,13 @@ void print_block(struct buffer_head *bh, ...) //int print_mode, int first, int l
va_list args;
int mode, first, last;
- va_start(args, bh);
-
if (!bh) {
printk("print_block: buffer is NULL\n");
return;
}
+ va_start(args, bh);
+
mode = va_arg(args, int);
first = va_arg(args, int);
last = va_arg(args, int);
diff --git a/fs/select.c b/fs/select.c
index b7b10aa30861..e56560d2b08a 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -306,6 +306,8 @@ static int poll_select_copy_remaining(struct timespec *end_time, void __user *p,
rts.tv_sec = rts.tv_nsec = 0;
if (timeval) {
+ if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec))
+ memset(&rtv, 0, sizeof(rtv));
rtv.tv_sec = rts.tv_sec;
rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC;
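
The memset guards against an information leak rather than a logic bug: on ABIs where the struct carries padding, assigning the two fields leaves the padding bytes holding stale kernel stack data, which the later copy to user space would expose. A userspace model of the hazard, with an illustrative struct layout:

    #include <stdio.h>
    #include <string.h>

    struct timeval_like {
        long long tv_sec;   /* 8 bytes */
        int       tv_usec;  /* 4 bytes + 4 bytes of tail padding on LP64 */
    };

    int main(void)
    {
        struct timeval_like tv;

        /* Field assignments leave padding uninitialised; clearing the
         * whole object first, as the hunk does whenever sizeof exceeds
         * the sum of the members, keeps stale bytes from leaking out. */
        if (sizeof(tv) > sizeof(tv.tv_sec) + sizeof(tv.tv_usec))
            memset(&tv, 0, sizeof(tv));
        tv.tv_sec = 1;
        tv.tv_usec = 500000;
        printf("struct is %zu bytes, members fill %zu\n", sizeof(tv),
               sizeof(tv.tv_sec) + sizeof(tv.tv_usec));
        return 0;
    }
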
diff --git a/fs/splice.c b/fs/splice.c
index ce2f02579e35..50a5d978da16 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -682,19 +682,14 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe,
{
struct file *file = sd->u.file;
loff_t pos = sd->pos;
- int ret, more;
-
- ret = buf->ops->confirm(pipe, buf);
- if (!ret) {
- more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;
- if (file->f_op && file->f_op->sendpage)
- ret = file->f_op->sendpage(file, buf->page, buf->offset,
- sd->len, &pos, more);
- else
- ret = -EINVAL;
- }
+ int more;
- return ret;
+ if (!likely(file->f_op && file->f_op->sendpage))
+ return -EINVAL;
+
+ more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;
+ return file->f_op->sendpage(file, buf->page, buf->offset,
+ sd->len, &pos, more);
}
/*
@@ -727,13 +722,6 @@ int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
void *fsdata;
int ret;
- /*
- * make sure the data in this buffer is uptodate
- */
- ret = buf->ops->confirm(pipe, buf);
- if (unlikely(ret))
- return ret;
-
offset = sd->pos & ~PAGE_CACHE_MASK;
this_len = sd->len;
@@ -805,12 +793,17 @@ int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
if (sd->len > sd->total_len)
sd->len = sd->total_len;
- ret = actor(pipe, buf, sd);
- if (ret <= 0) {
+ ret = buf->ops->confirm(pipe, buf);
+ if (unlikely(ret)) {
if (ret == -ENODATA)
ret = 0;
return ret;
}
+
+ ret = actor(pipe, buf, sd);
+ if (ret <= 0)
+ return ret;
+
buf->offset += ret;
buf->len -= ret;
@@ -1044,10 +1037,6 @@ static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
int ret;
void *data;
- ret = buf->ops->confirm(pipe, buf);
- if (ret)
- return ret;
-
data = buf->ops->map(pipe, buf, 0);
ret = kernel_write(sd->u.file, data + buf->offset, sd->len, sd->pos);
buf->ops->unmap(pipe, buf, data);
@@ -1495,10 +1484,6 @@ static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
char *src;
int ret;
- ret = buf->ops->confirm(pipe, buf);
- if (unlikely(ret))
- return ret;
-
/*
* See if we can use the atomic maps, by prefaulting in the
* pages and doing an atomic copy
diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
index e5f63da64d04..aa68a8a31518 100644
--- a/fs/squashfs/Kconfig
+++ b/fs/squashfs/Kconfig
@@ -29,7 +29,6 @@ config SQUASHFS
config SQUASHFS_XATTR
bool "Squashfs XATTR support"
depends on SQUASHFS
- default n
help
Saying Y here includes support for extended attributes (xattrs).
Xattrs are name:value pairs associated with inodes by
@@ -40,7 +39,6 @@ config SQUASHFS_XATTR
config SQUASHFS_LZO
bool "Include support for LZO compressed file systems"
depends on SQUASHFS
- default n
select LZO_DECOMPRESS
help
Saying Y here includes support for reading Squashfs file systems
@@ -53,10 +51,24 @@ config SQUASHFS_LZO
If unsure, say N.
+config SQUASHFS_XZ
+ bool "Include support for XZ compressed file systems"
+ depends on SQUASHFS
+ select XZ_DEC
+ help
+ Saying Y here includes support for reading Squashfs file systems
+ compressed with XZ compression. XZ gives better compression than
+ the default zlib compression, at the expense of greater CPU and
+ memory overhead.
+
+ XZ is not the standard compression used in Squashfs and so most
+ file systems will be readable without selecting this option.
+
+ If unsure, say N.
+
config SQUASHFS_EMBEDDED
bool "Additional option for memory-constrained systems"
depends on SQUASHFS
- default n
help
Saying Y here allows you to specify cache size.
diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile
index 7672bac8d328..cecf2bea07af 100644
--- a/fs/squashfs/Makefile
+++ b/fs/squashfs/Makefile
@@ -7,3 +7,4 @@ squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o
squashfs-y += namei.o super.o symlink.o zlib_wrapper.o decompressor.o
squashfs-$(CONFIG_SQUASHFS_XATTR) += xattr.o xattr_id.o
squashfs-$(CONFIG_SQUASHFS_LZO) += lzo_wrapper.o
+squashfs-$(CONFIG_SQUASHFS_XZ) += xz_wrapper.o
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 653c030eb840..2fb2882f0fa7 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -34,7 +34,6 @@
#include "squashfs_fs.h"
#include "squashfs_fs_sb.h"
-#include "squashfs_fs_i.h"
#include "squashfs.h"
#include "decompressor.h"
diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c
index 57314bee9059..26b15ae34d6f 100644
--- a/fs/squashfs/cache.c
+++ b/fs/squashfs/cache.c
@@ -55,7 +55,6 @@
#include "squashfs_fs.h"
#include "squashfs_fs_sb.h"
-#include "squashfs_fs_i.h"
#include "squashfs.h"
/*
diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c
index 24af9ce9722f..a5940e54c4dd 100644
--- a/fs/squashfs/decompressor.c
+++ b/fs/squashfs/decompressor.c
@@ -27,7 +27,6 @@
#include "squashfs_fs.h"
#include "squashfs_fs_sb.h"
-#include "squashfs_fs_i.h"
#include "decompressor.h"
#include "squashfs.h"
@@ -41,23 +40,26 @@ static const struct squashfs_decompressor squashfs_lzma_unsupported_comp_ops = {
};
#ifndef CONFIG_SQUASHFS_LZO
-static const struct squashfs_decompressor squashfs_lzo_unsupported_comp_ops = {
+static const struct squashfs_decompressor squashfs_lzo_comp_ops = {
NULL, NULL, NULL, LZO_COMPRESSION, "lzo", 0
};
#endif
+#ifndef CONFIG_SQUASHFS_XZ
+static const struct squashfs_decompressor squashfs_xz_comp_ops = {
+ NULL, NULL, NULL, XZ_COMPRESSION, "xz", 0
+};
+#endif
+
static const struct squashfs_decompressor squashfs_unknown_comp_ops = {
NULL, NULL, NULL, 0, "unknown", 0
};
static const struct squashfs_decompressor *decompressor[] = {
&squashfs_zlib_comp_ops,
- &squashfs_lzma_unsupported_comp_ops,
-#ifdef CONFIG_SQUASHFS_LZO
&squashfs_lzo_comp_ops,
-#else
- &squashfs_lzo_unsupported_comp_ops,
-#endif
+ &squashfs_xz_comp_ops,
+ &squashfs_lzma_unsupported_comp_ops,
&squashfs_unknown_comp_ops
};
diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h
index 7425f80783f6..3b305a70f7aa 100644
--- a/fs/squashfs/decompressor.h
+++ b/fs/squashfs/decompressor.h
@@ -52,4 +52,13 @@ static inline int squashfs_decompress(struct squashfs_sb_info *msblk,
return msblk->decompressor->decompress(msblk, buffer, bh, b, offset,
length, srclength, pages);
}
+
+#ifdef CONFIG_SQUASHFS_XZ
+extern const struct squashfs_decompressor squashfs_xz_comp_ops;
+#endif
+
+#ifdef CONFIG_SQUASHFS_LZO
+extern const struct squashfs_decompressor squashfs_lzo_comp_ops;
+#endif
+
#endif
diff --git a/fs/squashfs/fragment.c b/fs/squashfs/fragment.c
index 7c90bbd6879d..7eef571443c6 100644
--- a/fs/squashfs/fragment.c
+++ b/fs/squashfs/fragment.c
@@ -39,7 +39,6 @@
#include "squashfs_fs.h"
#include "squashfs_fs_sb.h"
-#include "squashfs_fs_i.h"
#include "squashfs.h"
/*
diff --git a/fs/squashfs/id.c b/fs/squashfs/id.c
index b7f64bcd2b70..d8f32452638e 100644
--- a/fs/squashfs/id.c
+++ b/fs/squashfs/id.c
@@ -37,7 +37,6 @@
#include "squashfs_fs.h"
#include "squashfs_fs_sb.h"
-#include "squashfs_fs_i.h"
#include "squashfs.h"
/*
diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c
index 5d87789bf1c1..7da759e34c52 100644
--- a/fs/squashfs/lzo_wrapper.c
+++ b/fs/squashfs/lzo_wrapper.c
@@ -29,7 +29,6 @@
#include "squashfs_fs.h"
#include "squashfs_fs_sb.h"
-#include "squashfs_fs_i.h"
#include "squashfs.h"
#include "decompressor.h"
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index 5d45569d5f72..ba729d808876 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -27,11 +27,6 @@
#define WARNING(s, args...) pr_warning("SQUASHFS: "s, ## args)
-static inline struct squashfs_inode_info *squashfs_i(struct inode *inode)
-{
- return list_entry(inode, struct squashfs_inode_info, vfs_inode);
-}
-
/* block.c */
extern int squashfs_read_data(struct super_block *, void **, u64, int, u64 *,
int, int);
@@ -104,6 +99,3 @@ extern const struct xattr_handler *squashfs_xattr_handlers[];
/* zlib_wrapper.c */
extern const struct squashfs_decompressor squashfs_zlib_comp_ops;
-
-/* lzo_wrapper.c */
-extern const struct squashfs_decompressor squashfs_lzo_comp_ops;
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index c5137fc9ab11..39533feffd6d 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -238,6 +238,7 @@ struct meta_index {
#define ZLIB_COMPRESSION 1
#define LZMA_COMPRESSION 2
#define LZO_COMPRESSION 3
+#define XZ_COMPRESSION 4
struct squashfs_super_block {
__le32 s_magic;
diff --git a/fs/squashfs/squashfs_fs_i.h b/fs/squashfs/squashfs_fs_i.h
index d3e3a37f28a1..359baefc01fc 100644
--- a/fs/squashfs/squashfs_fs_i.h
+++ b/fs/squashfs/squashfs_fs_i.h
@@ -45,4 +45,10 @@ struct squashfs_inode_info {
};
struct inode vfs_inode;
};
+
+
+static inline struct squashfs_inode_info *squashfs_i(struct inode *inode)
+{
+ return list_entry(inode, struct squashfs_inode_info, vfs_inode);
+}
#endif
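
squashfs_i() is the standard embedded-inode pattern: the VFS hands the filesystem a struct inode * that lives inside the larger private structure, and list_entry() (an alias for container_of()) subtracts the member offset to get back to it; the move puts the helper next to the structure it depends on. A self-contained model of the pointer arithmetic, with illustrative names:

    #include <stdio.h>
    #include <stddef.h>

    /* container_of(): step back from a member pointer to the structure
     * that embeds it. */
    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    struct inode { int i_ino; };

    struct sq_info {
        int start;              /* fs-private state */
        struct inode vfs_inode; /* embedded VFS inode */
    };

    int main(void)
    {
        struct sq_info info = { 42, { 7 } };
        struct inode *vfs = &info.vfs_inode;   /* what the VFS hands back */

        struct sq_info *back = container_of(vfs, struct sq_info, vfs_inode);
        printf("start=%d ino=%d\n", back->start, back->vfs_inode.i_ino);
        return 0;
    }
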
diff --git a/fs/squashfs/xattr_id.c b/fs/squashfs/xattr_id.c
index d33be5dd6c32..05385dbe1465 100644
--- a/fs/squashfs/xattr_id.c
+++ b/fs/squashfs/xattr_id.c
@@ -32,7 +32,6 @@
#include "squashfs_fs.h"
#include "squashfs_fs_sb.h"
-#include "squashfs_fs_i.h"
#include "squashfs.h"
#include "xattr.h"
diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c
new file mode 100644
index 000000000000..856756ca5ee4
--- /dev/null
+++ b/fs/squashfs/xz_wrapper.c
@@ -0,0 +1,153 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * xz_wrapper.c
+ */
+
+
+#include <linux/mutex.h>
+#include <linux/buffer_head.h>
+#include <linux/slab.h>
+#include <linux/xz.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+#include "decompressor.h"
+
+struct squashfs_xz {
+ struct xz_dec *state;
+ struct xz_buf buf;
+};
+
+static void *squashfs_xz_init(struct squashfs_sb_info *msblk)
+{
+ int block_size = max_t(int, msblk->block_size, SQUASHFS_METADATA_SIZE);
+
+ struct squashfs_xz *stream = kmalloc(sizeof(*stream), GFP_KERNEL);
+ if (stream == NULL)
+ goto failed;
+
+ stream->state = xz_dec_init(XZ_PREALLOC, block_size);
+ if (stream->state == NULL)
+ goto failed;
+
+ return stream;
+
+failed:
+ ERROR("Failed to allocate xz workspace\n");
+ kfree(stream);
+ return NULL;
+}
+
+
+static void squashfs_xz_free(void *strm)
+{
+ struct squashfs_xz *stream = strm;
+
+ if (stream) {
+ xz_dec_end(stream->state);
+ kfree(stream);
+ }
+}
+
+
+static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void **buffer,
+ struct buffer_head **bh, int b, int offset, int length, int srclength,
+ int pages)
+{
+ enum xz_ret xz_err;
+ int avail, total = 0, k = 0, page = 0;
+ struct squashfs_xz *stream = msblk->stream;
+
+ mutex_lock(&msblk->read_data_mutex);
+
+ xz_dec_reset(stream->state);
+ stream->buf.in_pos = 0;
+ stream->buf.in_size = 0;
+ stream->buf.out_pos = 0;
+ stream->buf.out_size = PAGE_CACHE_SIZE;
+ stream->buf.out = buffer[page++];
+
+ do {
+ if (stream->buf.in_pos == stream->buf.in_size && k < b) {
+ avail = min(length, msblk->devblksize - offset);
+ length -= avail;
+ wait_on_buffer(bh[k]);
+ if (!buffer_uptodate(bh[k]))
+ goto release_mutex;
+
+ if (avail == 0) {
+ offset = 0;
+ put_bh(bh[k++]);
+ continue;
+ }
+
+ stream->buf.in = bh[k]->b_data + offset;
+ stream->buf.in_size = avail;
+ stream->buf.in_pos = 0;
+ offset = 0;
+ }
+
+ if (stream->buf.out_pos == stream->buf.out_size
+ && page < pages) {
+ stream->buf.out = buffer[page++];
+ stream->buf.out_pos = 0;
+ total += PAGE_CACHE_SIZE;
+ }
+
+ xz_err = xz_dec_run(stream->state, &stream->buf);
+
+ if (stream->buf.in_pos == stream->buf.in_size && k < b)
+ put_bh(bh[k++]);
+ } while (xz_err == XZ_OK);
+
+ if (xz_err != XZ_STREAM_END) {
+ ERROR("xz_dec_run error, data probably corrupt\n");
+ goto release_mutex;
+ }
+
+ if (k < b) {
+ ERROR("xz_uncompress error, input remaining\n");
+ goto release_mutex;
+ }
+
+ total += stream->buf.out_pos;
+ mutex_unlock(&msblk->read_data_mutex);
+ return total;
+
+release_mutex:
+ mutex_unlock(&msblk->read_data_mutex);
+
+ for (; k < b; k++)
+ put_bh(bh[k]);
+
+ return -EIO;
+}
+
+const struct squashfs_decompressor squashfs_xz_comp_ops = {
+ .init = squashfs_xz_init,
+ .free = squashfs_xz_free,
+ .decompress = squashfs_xz_uncompress,
+ .id = XZ_COMPRESSION,
+ .name = "xz",
+ .supported = 1
+};
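
The decompress loop above is shaped entirely by struct xz_buf: xz_dec_run() consumes in[in_pos..in_size) and produces into out[out_pos..out_size), and the caller refills whichever side ran dry, a new buffer_head on the input side, the next page on the output side. A toy stand-in with an identity "decoder" shows the same cursor discipline (everything here is illustrative, not the kernel API):

    #include <stdio.h>
    #include <string.h>

    struct buf {
        const unsigned char *in;  size_t in_pos,  in_size;
        unsigned char       *out; size_t out_pos, out_size;
    };

    /* Identity "decoder": copies until input or output is exhausted.
     * Returns nonzero while input remains, like XZ_OK from xz_dec_run(). */
    static int run(struct buf *b)
    {
        while (b->in_pos < b->in_size && b->out_pos < b->out_size)
            b->out[b->out_pos++] = b->in[b->in_pos++];
        return b->in_pos < b->in_size;
    }

    int main(void)
    {
        const unsigned char src[] = "squashfs";   /* 8 chars + NUL */
        unsigned char page[4], result[16];
        size_t total = 0;
        struct buf b = { src, 0, sizeof(src), page, 0, sizeof(page) };

        while (run(&b)) {
            if (b.out_pos == b.out_size) {   /* output "page" full: advance */
                memcpy(result + total, page, b.out_pos);
                total += b.out_pos;
                b.out_pos = 0;
            }
        }
        memcpy(result + total, page, b.out_pos);  /* final partial page */
        total += b.out_pos;
        printf("%zu bytes decoded: %s\n", total, (const char *)result);
        return 0;
    }
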
diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c
index 7a603874e483..818a5e063faf 100644
--- a/fs/squashfs/zlib_wrapper.c
+++ b/fs/squashfs/zlib_wrapper.c
@@ -29,7 +29,6 @@
#include "squashfs_fs.h"
#include "squashfs_fs_sb.h"
-#include "squashfs_fs_i.h"
#include "squashfs.h"
#include "decompressor.h"
@@ -66,8 +65,8 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer,
struct buffer_head **bh, int b, int offset, int length, int srclength,
int pages)
{
- int zlib_err = 0, zlib_init = 0;
- int avail, bytes, k = 0, page = 0;
+ int zlib_err, zlib_init = 0;
+ int k = 0, page = 0;
z_stream *stream = msblk->stream;
mutex_lock(&msblk->read_data_mutex);
@@ -75,11 +74,10 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer,
stream->avail_out = 0;
stream->avail_in = 0;
- bytes = length;
do {
if (stream->avail_in == 0 && k < b) {
- avail = min(bytes, msblk->devblksize - offset);
- bytes -= avail;
+ int avail = min(length, msblk->devblksize - offset);
+ length -= avail;
wait_on_buffer(bh[k]);
if (!buffer_uptodate(bh[k]))
goto release_mutex;
@@ -128,6 +126,11 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer,
goto release_mutex;
}
+ if (k < b) {
+ ERROR("zlib_uncompress error, data remaining\n");
+ goto release_mutex;
+ }
+
length = stream->total_out;
mutex_unlock(&msblk->read_data_mutex);
return length;
diff --git a/fs/stat.c b/fs/stat.c
index 12e90e213900..d5c61cf2b703 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -75,11 +75,13 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
int error = -EINVAL;
int lookup_flags = 0;
- if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0)
+ if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT)) != 0)
goto out;
if (!(flag & AT_SYMLINK_NOFOLLOW))
lookup_flags |= LOOKUP_FOLLOW;
+ if (flag & AT_NO_AUTOMOUNT)
+ lookup_flags |= LOOKUP_NO_AUTOMOUNT;
error = user_path_at(dfd, filename, lookup_flags, &path);
if (error)
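
From userspace the new flag passes straight through fstatat(): with AT_NO_AUTOMOUNT, stat'ing an automount point describes the mountpoint itself instead of forcing the automount. A minimal caller; /net/host is a hypothetical autofs mount, and the flag requires a libc that exposes it:

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <sys/stat.h>
    #include <stdio.h>

    int main(void)
    {
        struct stat st;

        if (fstatat(AT_FDCWD, "/net/host", &st, AT_NO_AUTOMOUNT) == 0)
            printf("ino=%llu (mountpoint itself, no automount)\n",
                   (unsigned long long)st.st_ino);
        else
            perror("fstatat");
        return 0;
    }
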
diff --git a/fs/super.c b/fs/super.c
index 823e061faa87..74e149efed81 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -767,13 +767,13 @@ struct dentry *mount_bdev(struct file_system_type *fs_type,
{
struct block_device *bdev;
struct super_block *s;
- fmode_t mode = FMODE_READ;
+ fmode_t mode = FMODE_READ | FMODE_EXCL;
int error = 0;
if (!(flags & MS_RDONLY))
mode |= FMODE_WRITE;
- bdev = open_bdev_exclusive(dev_name, mode, fs_type);
+ bdev = blkdev_get_by_path(dev_name, mode, fs_type);
if (IS_ERR(bdev))
return ERR_CAST(bdev);
@@ -802,13 +802,13 @@ struct dentry *mount_bdev(struct file_system_type *fs_type,
/*
* s_umount nests inside bd_mutex during
- * __invalidate_device(). close_bdev_exclusive()
- * acquires bd_mutex and can't be called under
- * s_umount. Drop s_umount temporarily. This is safe
- * as we're holding an active reference.
+ * __invalidate_device(). blkdev_put() acquires
+ * bd_mutex and can't be called under s_umount. Drop
+ * s_umount temporarily. This is safe as we're
+ * holding an active reference.
*/
up_write(&s->s_umount);
- close_bdev_exclusive(bdev, mode);
+ blkdev_put(bdev, mode);
down_write(&s->s_umount);
} else {
char b[BDEVNAME_SIZE];
@@ -832,7 +832,7 @@ struct dentry *mount_bdev(struct file_system_type *fs_type,
error_s:
error = PTR_ERR(s);
error_bdev:
- close_bdev_exclusive(bdev, mode);
+ blkdev_put(bdev, mode);
error:
return ERR_PTR(error);
}
@@ -863,7 +863,8 @@ void kill_block_super(struct super_block *sb)
bdev->bd_super = NULL;
generic_shutdown_super(sb);
sync_blockdev(bdev);
- close_bdev_exclusive(bdev, mode);
+ WARN_ON_ONCE(!(mode & FMODE_EXCL));
+ blkdev_put(bdev, mode | FMODE_EXCL);
}
EXPORT_SYMBOL(kill_block_super);
@@ -1140,7 +1141,7 @@ static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype)
return mnt;
err:
- mntput_long(mnt);
+ mntput(mnt);
return ERR_PTR(err);
}
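
These hunks are part of the tree-wide switch from open_bdev_exclusive()/close_bdev_exclusive() to blkdev_get_by_path()/blkdev_put(), where exclusivity is requested by putting FMODE_EXCL in the mode and naming a holder. A kernel-context sketch of the new idiom; not standalone code, and the function itself is hypothetical:

    #include <linux/fs.h>
    #include <linux/blkdev.h>
    #include <linux/err.h>

    static int example_claim_release(const char *path, void *holder)
    {
        fmode_t mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL;
        struct block_device *bdev;

        /* FMODE_EXCL + holder replaces the old open + bd_claim() pair */
        bdev = blkdev_get_by_path(path, mode, holder);
        if (IS_ERR(bdev))
            return PTR_ERR(bdev);

        /* ... use the device ... */

        blkdev_put(bdev, mode);   /* mode must include FMODE_EXCL again */
        return 0;
    }
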
diff --git a/fs/sysfs/Kconfig b/fs/sysfs/Kconfig
index f4b67588b9d6..8c41feacbac5 100644
--- a/fs/sysfs/Kconfig
+++ b/fs/sysfs/Kconfig
@@ -1,5 +1,5 @@
config SYSFS
- bool "sysfs file system support" if EMBEDDED
+ bool "sysfs file system support" if EXPERT
default y
help
The sysfs filesystem is a virtual filesystem that the kernel uses to
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index b5e68da2db32..b427b1208c26 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -48,7 +48,6 @@ static struct dentry *sysv_lookup(struct inode * dir, struct dentry * dentry, st
struct inode * inode = NULL;
ino_t ino;
- d_set_d_op(dentry, dir->i_sb->s_root->d_op);
if (dentry->d_name.len > SYSV_NAMELEN)
return ERR_PTR(-ENAMETOOLONG);
ino = sysv_inode_by_name(dentry);
diff --git a/fs/sysv/super.c b/fs/sysv/super.c
index 76712aefc4ab..f60c196913ea 100644
--- a/fs/sysv/super.c
+++ b/fs/sysv/super.c
@@ -332,6 +332,10 @@ static int complete_read_super(struct super_block *sb, int silent, int size)
sb->s_magic = SYSV_MAGIC_BASE + sbi->s_type;
/* set up enough so that it can read an inode */
sb->s_op = &sysv_sops;
+ if (sbi->s_forced_ro)
+ sb->s_flags |= MS_RDONLY;
+ if (sbi->s_truncate)
+ sb->s_d_op = &sysv_dentry_operations;
root_inode = sysv_iget(sb, SYSV_ROOT_INO);
if (IS_ERR(root_inode)) {
printk("SysV FS: get root inode failed\n");
@@ -343,10 +347,6 @@ static int complete_read_super(struct super_block *sb, int silent, int size)
printk("SysV FS: get root dentry failed\n");
return 0;
}
- if (sbi->s_forced_ro)
- sb->s_flags |= MS_RDONLY;
- if (sbi->s_truncate)
- d_set_d_op(sb->s_root, &sysv_dentry_operations);
return 1;
}
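
The reshuffle rides on the superblock's s_d_op field: once it is set before the root inode is looked up, new dentries inherit those dentry_operations automatically, which is why the per-lookup d_set_d_op() call in sysv_lookup() could be dropped. In sketch form (kernel context, not standalone; the function name is illustrative):

    /* Set s_d_op before any dentry is created and every dentry on this
     * superblock picks up the operations in d_alloc(). */
    static int example_fill_super(struct super_block *sb, int truncate)
    {
        sb->s_op = &sysv_sops;
        if (truncate)
            sb->s_d_op = &sysv_dentry_operations;
        /* ... read the root inode, allocate the root dentry ... */
        return 0;
    }
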
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 0dce969d6cad..faca44997099 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -98,6 +98,7 @@ xfs-y += $(addprefix $(XFS_LINUX)/, \
kmem.o \
xfs_aops.o \
xfs_buf.o \
+ xfs_discard.o \
xfs_export.o \
xfs_file.o \
xfs_fs_subr.o \
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 92f1f2acc6ab..ac1c7e8378dd 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -896,7 +896,6 @@ xfs_buf_rele(
trace_xfs_buf_rele(bp, _RET_IP_);
if (!pag) {
- ASSERT(!bp->b_relse);
ASSERT(list_empty(&bp->b_lru));
ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
if (atomic_dec_and_test(&bp->b_hold))
@@ -908,11 +907,7 @@ xfs_buf_rele(
ASSERT(atomic_read(&bp->b_hold) > 0);
if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
- if (bp->b_relse) {
- atomic_inc(&bp->b_hold);
- spin_unlock(&pag->pag_buf_lock);
- bp->b_relse(bp);
- } else if (!(bp->b_flags & XBF_STALE) &&
+ if (!(bp->b_flags & XBF_STALE) &&
atomic_read(&bp->b_lru_ref)) {
xfs_buf_lru_add(bp);
spin_unlock(&pag->pag_buf_lock);
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index a76c2428faff..cbe65950e524 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -152,8 +152,6 @@ typedef struct xfs_buftarg {
struct xfs_buf;
typedef void (*xfs_buf_iodone_t)(struct xfs_buf *);
-typedef void (*xfs_buf_relse_t)(struct xfs_buf *);
-typedef int (*xfs_buf_bdstrat_t)(struct xfs_buf *);
#define XB_PAGES 2
@@ -183,7 +181,6 @@ typedef struct xfs_buf {
void *b_addr; /* virtual address of buffer */
struct work_struct b_iodone_work;
xfs_buf_iodone_t b_iodone; /* I/O completion function */
- xfs_buf_relse_t b_relse; /* releasing function */
struct completion b_iowait; /* queue for I/O waiters */
void *b_fspriv;
void *b_fspriv2;
@@ -323,7 +320,6 @@ void xfs_buf_stale(struct xfs_buf *bp);
#define XFS_BUF_FSPRIVATE2(bp, type) ((type)(bp)->b_fspriv2)
#define XFS_BUF_SET_FSPRIVATE2(bp, val) ((bp)->b_fspriv2 = (void*)(val))
#define XFS_BUF_SET_START(bp) do { } while (0)
-#define XFS_BUF_SET_BRELSE_FUNC(bp, func) ((bp)->b_relse = (func))
#define XFS_BUF_PTR(bp) (xfs_caddr_t)((bp)->b_addr)
#define XFS_BUF_SET_PTR(bp, val, cnt) xfs_buf_associate_memory(bp, val, cnt)
@@ -360,8 +356,7 @@ xfs_buf_set_ref(
static inline void xfs_buf_relse(xfs_buf_t *bp)
{
- if (!bp->b_relse)
- xfs_buf_unlock(bp);
+ xfs_buf_unlock(bp);
xfs_buf_rele(bp);
}
diff --git a/fs/xfs/linux-2.6/xfs_discard.c b/fs/xfs/linux-2.6/xfs_discard.c
new file mode 100644
index 000000000000..05201ae719e5
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_discard.c
@@ -0,0 +1,191 @@
+/*
+ * Copyright (C) 2010 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_sb.h"
+#include "xfs_inum.h"
+#include "xfs_log.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_quota.h"
+#include "xfs_trans.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_btree.h"
+#include "xfs_inode.h"
+#include "xfs_alloc.h"
+#include "xfs_error.h"
+#include "xfs_discard.h"
+#include "xfs_trace.h"
+
+STATIC int
+xfs_trim_extents(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ xfs_fsblock_t start,
+ xfs_fsblock_t len,
+ xfs_fsblock_t minlen,
+ __uint64_t *blocks_trimmed)
+{
+ struct block_device *bdev = mp->m_ddev_targp->bt_bdev;
+ struct xfs_btree_cur *cur;
+ struct xfs_buf *agbp;
+ struct xfs_perag *pag;
+ int error;
+ int i;
+
+ pag = xfs_perag_get(mp, agno);
+
+ error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
+ if (error || !agbp)
+ goto out_put_perag;
+
+ cur = xfs_allocbt_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_CNT);
+
+ /*
+ * Force out the log. This means any transactions that might have freed
+ * space before we took the AGF buffer lock are now on disk, and the
+ * volatile disk cache is flushed.
+ */
+ xfs_log_force(mp, XFS_LOG_SYNC);
+
+ /*
+ * Look up the longest btree in the AGF and start with it.
+ */
+ error = xfs_alloc_lookup_le(cur, 0,
+ XFS_BUF_TO_AGF(agbp)->agf_longest, &i);
+ if (error)
+ goto out_del_cursor;
+
+ /*
+ * Loop until we are done with all extents that are large
+ * enough to be worth discarding.
+ */
+ while (i) {
+ xfs_agblock_t fbno;
+ xfs_extlen_t flen;
+
+ error = xfs_alloc_get_rec(cur, &fbno, &flen, &i);
+ if (error)
+ goto out_del_cursor;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, out_del_cursor);
+ ASSERT(flen <= XFS_BUF_TO_AGF(agbp)->agf_longest);
+
+ /*
+ * Too small? Give up.
+ */
+ if (flen < minlen) {
+ trace_xfs_discard_toosmall(mp, agno, fbno, flen);
+ goto out_del_cursor;
+ }
+
+ /*
+ * If the extent is entirely outside of the range we are
+ * supposed to discard skip it. Do not bother to trim
+ * down partially overlapping ranges for now.
+ */
+ if (XFS_AGB_TO_FSB(mp, agno, fbno) + flen < start ||
+ XFS_AGB_TO_FSB(mp, agno, fbno) >= start + len) {
+ trace_xfs_discard_exclude(mp, agno, fbno, flen);
+ goto next_extent;
+ }
+
+ /*
+ * If any blocks in the range are still busy, skip the
+ * discard and try again the next time.
+ */
+ if (xfs_alloc_busy_search(mp, agno, fbno, flen)) {
+ trace_xfs_discard_busy(mp, agno, fbno, flen);
+ goto next_extent;
+ }
+
+ trace_xfs_discard_extent(mp, agno, fbno, flen);
+ error = -blkdev_issue_discard(bdev,
+ XFS_AGB_TO_DADDR(mp, agno, fbno),
+ XFS_FSB_TO_BB(mp, flen),
+ GFP_NOFS, 0);
+ if (error)
+ goto out_del_cursor;
+ *blocks_trimmed += flen;
+
+next_extent:
+ error = xfs_btree_decrement(cur, 0, &i);
+ if (error)
+ goto out_del_cursor;
+ }
+
+out_del_cursor:
+ xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+ xfs_buf_relse(agbp);
+out_put_perag:
+ xfs_perag_put(pag);
+ return error;
+}
+
+int
+xfs_ioc_trim(
+ struct xfs_mount *mp,
+ struct fstrim_range __user *urange)
+{
+ struct request_queue *q = mp->m_ddev_targp->bt_bdev->bd_disk->queue;
+ unsigned int granularity = q->limits.discard_granularity;
+ struct fstrim_range range;
+ xfs_fsblock_t start, len, minlen;
+ xfs_agnumber_t start_agno, end_agno, agno;
+ __uint64_t blocks_trimmed = 0;
+ int error, last_error = 0;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -XFS_ERROR(EPERM);
+ if (copy_from_user(&range, urange, sizeof(range)))
+ return -XFS_ERROR(EFAULT);
+
+ /*
+ * Truncating down the len isn't actually quite correct, but using
+ * XFS_B_TO_FSB would mean we trivially get overflows for values
+ * of ULLONG_MAX or slightly lower. And ULLONG_MAX is the default
+ * used by the fstrim application. In the end it really doesn't
+ * matter as trimming blocks is an advisory interface.
+ */
+ start = XFS_B_TO_FSBT(mp, range.start);
+ len = XFS_B_TO_FSBT(mp, range.len);
+ minlen = XFS_B_TO_FSB(mp, max_t(u64, granularity, range.minlen));
+
+ start_agno = XFS_FSB_TO_AGNO(mp, start);
+ if (start_agno >= mp->m_sb.sb_agcount)
+ return -XFS_ERROR(EINVAL);
+
+ end_agno = XFS_FSB_TO_AGNO(mp, start + len);
+ if (end_agno >= mp->m_sb.sb_agcount)
+ end_agno = mp->m_sb.sb_agcount - 1;
+
+ for (agno = start_agno; agno <= end_agno; agno++) {
+ error = -xfs_trim_extents(mp, agno, start, len, minlen,
+ &blocks_trimmed);
+ if (error)
+ last_error = error;
+ }
+
+ if (last_error)
+ return last_error;
+
+ range.len = XFS_FSB_TO_B(mp, blocks_trimmed);
+ if (copy_to_user(urange, &range, sizeof(range)))
+ return -XFS_ERROR(EFAULT);
+ return 0;
+}
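
xfs_ioc_trim() implements the generic FITRIM ioctl, so it is reachable with the same userspace call the fstrim tool makes. A minimal caller; the mount point path is an assumption, and the kernel must export FITRIM in linux/fs.h:

    #include <stdio.h>
    #include <string.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/fs.h>   /* FITRIM, struct fstrim_range */

    int main(void)
    {
        struct fstrim_range range;
        int fd = open("/mnt", O_RDONLY);   /* hypothetical XFS mount point */

        if (fd < 0) { perror("open"); return 1; }

        memset(&range, 0, sizeof(range));
        range.len = (__u64)-1;   /* ULLONG_MAX, the fstrim default noted above */
        range.minlen = 0;

        if (ioctl(fd, FITRIM, &range) < 0)
            perror("FITRIM");
        else
            printf("trimmed %llu bytes\n", (unsigned long long)range.len);
        close(fd);
        return 0;
    }
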
diff --git a/fs/xfs/linux-2.6/xfs_discard.h b/fs/xfs/linux-2.6/xfs_discard.h
new file mode 100644
index 000000000000..e82b6dd3e127
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_discard.h
@@ -0,0 +1,8 @@
+#ifndef XFS_DISCARD_H
+#define XFS_DISCARD_H 1
+
+struct fstrim_range;
+
+extern int xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *);
+
+#endif /* XFS_DISCARD_H */
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index ba8ad422a165..a55c1b46b219 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -37,10 +37,45 @@
#include "xfs_trace.h"
#include <linux/dcache.h>
+#include <linux/falloc.h>
static const struct vm_operations_struct xfs_file_vm_ops;
/*
+ * Locking primitives for read and write IO paths to ensure we consistently use
+ * and order the inode->i_mutex, ip->i_lock and ip->i_iolock.
+ */
+static inline void
+xfs_rw_ilock(
+ struct xfs_inode *ip,
+ int type)
+{
+ if (type & XFS_IOLOCK_EXCL)
+ mutex_lock(&VFS_I(ip)->i_mutex);
+ xfs_ilock(ip, type);
+}
+
+static inline void
+xfs_rw_iunlock(
+ struct xfs_inode *ip,
+ int type)
+{
+ xfs_iunlock(ip, type);
+ if (type & XFS_IOLOCK_EXCL)
+ mutex_unlock(&VFS_I(ip)->i_mutex);
+}
+
+static inline void
+xfs_rw_ilock_demote(
+ struct xfs_inode *ip,
+ int type)
+{
+ xfs_ilock_demote(ip, type);
+ if (type & XFS_IOLOCK_EXCL)
+ mutex_unlock(&VFS_I(ip)->i_mutex);
+}
+
+/*
* xfs_iozero
*
* xfs_iozero clears the specified range of buffer supplied,
@@ -262,22 +297,21 @@ xfs_file_aio_read(
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
- if (unlikely(ioflags & IO_ISDIRECT))
- mutex_lock(&inode->i_mutex);
- xfs_ilock(ip, XFS_IOLOCK_SHARED);
-
if (unlikely(ioflags & IO_ISDIRECT)) {
+ xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
+
if (inode->i_mapping->nrpages) {
ret = -xfs_flushinval_pages(ip,
(iocb->ki_pos & PAGE_CACHE_MASK),
-1, FI_REMAPF_LOCKED);
+ if (ret) {
+ xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
+ return ret;
+ }
}
- mutex_unlock(&inode->i_mutex);
- if (ret) {
- xfs_iunlock(ip, XFS_IOLOCK_SHARED);
- return ret;
- }
- }
+ xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
+ } else
+ xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags);
@@ -285,7 +319,7 @@ xfs_file_aio_read(
if (ret > 0)
XFS_STATS_ADD(xs_read_bytes, ret);
- xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+ xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
return ret;
}
@@ -309,7 +343,7 @@ xfs_file_splice_read(
if (XFS_FORCED_SHUTDOWN(ip->i_mount))
return -EIO;
- xfs_ilock(ip, XFS_IOLOCK_SHARED);
+ xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
@@ -317,10 +351,61 @@ xfs_file_splice_read(
if (ret > 0)
XFS_STATS_ADD(xs_read_bytes, ret);
- xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+ xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
return ret;
}
+STATIC void
+xfs_aio_write_isize_update(
+ struct inode *inode,
+ loff_t *ppos,
+ ssize_t bytes_written)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ xfs_fsize_t isize = i_size_read(inode);
+
+ if (bytes_written > 0)
+ XFS_STATS_ADD(xs_write_bytes, bytes_written);
+
+ if (unlikely(bytes_written < 0 && bytes_written != -EFAULT &&
+ *ppos > isize))
+ *ppos = isize;
+
+ if (*ppos > ip->i_size) {
+ xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
+ if (*ppos > ip->i_size)
+ ip->i_size = *ppos;
+ xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
+ }
+}
+
+/*
+ * If this was a direct or synchronous I/O that failed (such as ENOSPC) then
+ * part of the I/O may have been written to disk before the error occured. In
+ * this case the on-disk file size may have been adjusted beyond the in-memory
+ * file size and now needs to be truncated back.
+ */
+STATIC void
+xfs_aio_write_newsize_update(
+ struct xfs_inode *ip)
+{
+ if (ip->i_new_size) {
+ xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
+ ip->i_new_size = 0;
+ if (ip->i_d.di_size > ip->i_size)
+ ip->i_d.di_size = ip->i_size;
+ xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
+ }
+}
+
+/*
+ * xfs_file_splice_write() does not use xfs_rw_ilock() because
+ * generic_file_splice_write() takes the i_mutex itself. This, in theory,
+ * could cause lock inversions between the aio_write path and the splice path
+ * if someone is doing concurrent splice(2) based writes and write(2) based
+ * writes to the same inode. The only real way to fix this is to re-implement
+ * the generic code here with correct locking orders.
+ */
STATIC ssize_t
xfs_file_splice_write(
struct pipe_inode_info *pipe,
@@ -331,7 +416,7 @@ xfs_file_splice_write(
{
struct inode *inode = outfilp->f_mapping->host;
struct xfs_inode *ip = XFS_I(inode);
- xfs_fsize_t isize, new_size;
+ xfs_fsize_t new_size;
int ioflags = 0;
ssize_t ret;
@@ -355,27 +440,9 @@ xfs_file_splice_write(
trace_xfs_file_splice_write(ip, count, *ppos, ioflags);
ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
- if (ret > 0)
- XFS_STATS_ADD(xs_write_bytes, ret);
-
- isize = i_size_read(inode);
- if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize))
- *ppos = isize;
-
- if (*ppos > ip->i_size) {
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- if (*ppos > ip->i_size)
- ip->i_size = *ppos;
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- }
- if (ip->i_new_size) {
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- ip->i_new_size = 0;
- if (ip->i_d.di_size > ip->i_size)
- ip->i_d.di_size = ip->i_size;
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- }
+ xfs_aio_write_isize_update(inode, ppos, ret);
+ xfs_aio_write_newsize_update(ip);
xfs_iunlock(ip, XFS_IOLOCK_EXCL);
return ret;
}
@@ -562,247 +629,314 @@ out_lock:
return error;
}
+/*
+ * Common pre-write limit and setup checks.
+ *
+ * Returns with iolock held according to @iolock.
+ */
STATIC ssize_t
-xfs_file_aio_write(
- struct kiocb *iocb,
- const struct iovec *iovp,
- unsigned long nr_segs,
- loff_t pos)
+xfs_file_aio_write_checks(
+ struct file *file,
+ loff_t *pos,
+ size_t *count,
+ int *iolock)
{
- struct file *file = iocb->ki_filp;
- struct address_space *mapping = file->f_mapping;
- struct inode *inode = mapping->host;
+ struct inode *inode = file->f_mapping->host;
struct xfs_inode *ip = XFS_I(inode);
- struct xfs_mount *mp = ip->i_mount;
- ssize_t ret = 0, error = 0;
- int ioflags = 0;
- xfs_fsize_t isize, new_size;
- int iolock;
- size_t ocount = 0, count;
- int need_i_mutex;
+ xfs_fsize_t new_size;
+ int error = 0;
- XFS_STATS_INC(xs_write_calls);
+ error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode));
+ if (error) {
+ xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
+ *iolock = 0;
+ return error;
+ }
- BUG_ON(iocb->ki_pos != pos);
+ new_size = *pos + *count;
+ if (new_size > ip->i_size)
+ ip->i_new_size = new_size;
- if (unlikely(file->f_flags & O_DIRECT))
- ioflags |= IO_ISDIRECT;
- if (file->f_mode & FMODE_NOCMTIME)
- ioflags |= IO_INVIS;
+ if (likely(!(file->f_mode & FMODE_NOCMTIME)))
+ file_update_time(file);
+
+ /*
+ * If the offset is beyond the size of the file, we need to zero any
+ * blocks that fall between the existing EOF and the start of this
+ * write.
+ */
+ if (*pos > ip->i_size)
+ error = -xfs_zero_eof(ip, *pos, ip->i_size);
- error = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ);
+ xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
if (error)
return error;
- count = ocount;
- if (count == 0)
- return 0;
-
- xfs_wait_for_freeze(mp, SB_FREEZE_WRITE);
+ /*
+ * If we're writing the file then make sure to clear the setuid and
+ * setgid bits if the process is not being run by root. This keeps
+ * people from modifying setuid and setgid binaries.
+ */
+ return file_remove_suid(file);
- if (XFS_FORCED_SHUTDOWN(mp))
- return -EIO;
+}
-relock:
- if (ioflags & IO_ISDIRECT) {
- iolock = XFS_IOLOCK_SHARED;
- need_i_mutex = 0;
- } else {
- iolock = XFS_IOLOCK_EXCL;
- need_i_mutex = 1;
- mutex_lock(&inode->i_mutex);
+/*
+ * xfs_file_dio_aio_write - handle direct IO writes
+ *
+ * Lock the inode appropriately to prepare for and issue a direct IO write.
+ * By separating it from the buffered write path we remove all the
+ * tricky-to-follow locking changes and looping.
+ *
+ * If there are cached pages or we're extending the file, we need IOLOCK_EXCL
+ * until we're sure the bytes at the new EOF have been zeroed and/or the cached
+ * pages are flushed out.
+ *
+ * In most cases the direct IO writes will be done holding IOLOCK_SHARED
+ * allowing them to be done in parallel with reads and other direct IO writes.
+ * However, if the IO is not aligned to filesystem blocks, the direct IO layer
+ * needs to do sub-block zeroing and that requires serialisation against other
+ * direct IOs to the same block. In this case we need to serialise the
+ * submission of the unaligned IOs so that we don't get racing block zeroing in
+ * the dio layer. To avoid the problem with aio, we also need to wait for
+ * outstanding IOs to complete so that unwritten extent conversion is completed
+ * before we try to map the overlapping block. This is currently implemented by
+ * hitting it with a big hammer (i.e. xfs_ioend_wait()).
+ *
+ * Returns with locks held indicated by @iolock and errors indicated by
+ * negative return values.
+ */
+STATIC ssize_t
+xfs_file_dio_aio_write(
+ struct kiocb *iocb,
+ const struct iovec *iovp,
+ unsigned long nr_segs,
+ loff_t pos,
+ size_t ocount,
+ int *iolock)
+{
+ struct file *file = iocb->ki_filp;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ ssize_t ret = 0;
+ size_t count = ocount;
+ int unaligned_io = 0;
+ struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ?
+ mp->m_rtdev_targp : mp->m_ddev_targp;
+
+ *iolock = 0;
+ if ((pos & target->bt_smask) || (count & target->bt_smask))
+ return -XFS_ERROR(EINVAL);
+
+ if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask))
+ unaligned_io = 1;
+
+ if (unaligned_io || mapping->nrpages || pos > ip->i_size)
+ *iolock = XFS_IOLOCK_EXCL;
+ else
+ *iolock = XFS_IOLOCK_SHARED;
+ xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
+
+ ret = xfs_file_aio_write_checks(file, &pos, &count, iolock);
+ if (ret)
+ return ret;
+
+ if (mapping->nrpages) {
+ WARN_ON(*iolock != XFS_IOLOCK_EXCL);
+ ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1,
+ FI_REMAPF_LOCKED);
+ if (ret)
+ return ret;
}
- xfs_ilock(ip, XFS_ILOCK_EXCL|iolock);
-
-start:
- error = -generic_write_checks(file, &pos, &count,
- S_ISBLK(inode->i_mode));
- if (error) {
- xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock);
- goto out_unlock_mutex;
+ /*
+ * If we are doing unaligned IO, wait for all other IO to drain,
+ * otherwise demote the lock if we had to flush cached pages
+ */
+ if (unaligned_io)
+ xfs_ioend_wait(ip);
+ else if (*iolock == XFS_IOLOCK_EXCL) {
+ xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
+ *iolock = XFS_IOLOCK_SHARED;
}
- if (ioflags & IO_ISDIRECT) {
- xfs_buftarg_t *target =
- XFS_IS_REALTIME_INODE(ip) ?
- mp->m_rtdev_targp : mp->m_ddev_targp;
+ trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
+ ret = generic_file_direct_write(iocb, iovp,
+ &nr_segs, pos, &iocb->ki_pos, count, ocount);
- if ((pos & target->bt_smask) || (count & target->bt_smask)) {
- xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock);
- return XFS_ERROR(-EINVAL);
- }
+ /* No fallback to buffered IO on errors for XFS. */
+ ASSERT(ret < 0 || ret == count);
+ return ret;
+}
- if (!need_i_mutex && (mapping->nrpages || pos > ip->i_size)) {
- xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock);
- iolock = XFS_IOLOCK_EXCL;
- need_i_mutex = 1;
- mutex_lock(&inode->i_mutex);
- xfs_ilock(ip, XFS_ILOCK_EXCL|iolock);
- goto start;
- }
- }
+STATIC ssize_t
+xfs_file_buffered_aio_write(
+ struct kiocb *iocb,
+ const struct iovec *iovp,
+ unsigned long nr_segs,
+ loff_t pos,
+ size_t ocount,
+ int *iolock)
+{
+ struct file *file = iocb->ki_filp;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ struct xfs_inode *ip = XFS_I(inode);
+ ssize_t ret;
+ int enospc = 0;
+ size_t count = ocount;
- new_size = pos + count;
- if (new_size > ip->i_size)
- ip->i_new_size = new_size;
+ *iolock = XFS_IOLOCK_EXCL;
+ xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
- if (likely(!(ioflags & IO_INVIS)))
- file_update_time(file);
+ ret = xfs_file_aio_write_checks(file, &pos, &count, iolock);
+ if (ret)
+ return ret;
+ /* We can write back this queue in page reclaim */
+ current->backing_dev_info = mapping->backing_dev_info;
+
+write_retry:
+ trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0);
+ ret = generic_file_buffered_write(iocb, iovp, nr_segs,
+ pos, &iocb->ki_pos, count, ret);
/*
- * If the offset is beyond the size of the file, we have a couple
- * of things to do. First, if there is already space allocated
- * we need to either create holes or zero the disk or ...
- *
- * If there is a page where the previous size lands, we need
- * to zero it out up to the new size.
+ * if we just got an ENOSPC, flush the inode now that we aren't holding any
+ * page locks and retry *once*
*/
-
- if (pos > ip->i_size) {
- error = xfs_zero_eof(ip, pos, ip->i_size);
- if (error) {
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- goto out_unlock_internal;
- }
+ if (ret == -ENOSPC && !enospc) {
+ ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
+ if (ret)
+ return ret;
+ enospc = 1;
+ goto write_retry;
}
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ current->backing_dev_info = NULL;
+ return ret;
+}
- /*
- * If we're writing the file then make sure to clear the
- * setuid and setgid bits if the process is not being run
- * by root. This keeps people from modifying setuid and
- * setgid binaries.
- */
- error = -file_remove_suid(file);
- if (unlikely(error))
- goto out_unlock_internal;
+STATIC ssize_t
+xfs_file_aio_write(
+ struct kiocb *iocb,
+ const struct iovec *iovp,
+ unsigned long nr_segs,
+ loff_t pos)
+{
+ struct file *file = iocb->ki_filp;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ struct xfs_inode *ip = XFS_I(inode);
+ ssize_t ret;
+ int iolock;
+ size_t ocount = 0;
- /* We can write back this queue in page reclaim */
- current->backing_dev_info = mapping->backing_dev_info;
+ XFS_STATS_INC(xs_write_calls);
- if ((ioflags & IO_ISDIRECT)) {
- if (mapping->nrpages) {
- WARN_ON(need_i_mutex == 0);
- error = xfs_flushinval_pages(ip,
- (pos & PAGE_CACHE_MASK),
- -1, FI_REMAPF_LOCKED);
- if (error)
- goto out_unlock_internal;
- }
+ BUG_ON(iocb->ki_pos != pos);
- if (need_i_mutex) {
- /* demote the lock now the cached pages are gone */
- xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
- mutex_unlock(&inode->i_mutex);
+ ret = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ);
+ if (ret)
+ return ret;
- iolock = XFS_IOLOCK_SHARED;
- need_i_mutex = 0;
- }
+ if (ocount == 0)
+ return 0;
- trace_xfs_file_direct_write(ip, count, iocb->ki_pos, ioflags);
- ret = generic_file_direct_write(iocb, iovp,
- &nr_segs, pos, &iocb->ki_pos, count, ocount);
+ xfs_wait_for_freeze(ip->i_mount, SB_FREEZE_WRITE);
- /*
- * direct-io write to a hole: fall through to buffered I/O
- * for completing the rest of the request.
- */
- if (ret >= 0 && ret != count) {
- XFS_STATS_ADD(xs_write_bytes, ret);
+ if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+ return -EIO;
- pos += ret;
- count -= ret;
+ if (unlikely(file->f_flags & O_DIRECT))
+ ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos,
+ ocount, &iolock);
+ else
+ ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos,
+ ocount, &iolock);
- ioflags &= ~IO_ISDIRECT;
- xfs_iunlock(ip, iolock);
- goto relock;
- }
- } else {
- int enospc = 0;
- ssize_t ret2 = 0;
+ xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret);
-write_retry:
- trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, ioflags);
- ret2 = generic_file_buffered_write(iocb, iovp, nr_segs,
- pos, &iocb->ki_pos, count, ret);
- /*
- * if we just got an ENOSPC, flush the inode now we
- * aren't holding any page locks and retry *once*
- */
- if (ret2 == -ENOSPC && !enospc) {
- error = xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
- if (error)
- goto out_unlock_internal;
- enospc = 1;
- goto write_retry;
- }
- ret = ret2;
- }
+ if (ret <= 0)
+ goto out_unlock;
- current->backing_dev_info = NULL;
+ /* Handle various SYNC-type writes */
+ if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
+ loff_t end = pos + ret - 1;
+ int error, error2;
- isize = i_size_read(inode);
- if (unlikely(ret < 0 && ret != -EFAULT && iocb->ki_pos > isize))
- iocb->ki_pos = isize;
+ xfs_rw_iunlock(ip, iolock);
+ error = filemap_write_and_wait_range(mapping, pos, end);
+ xfs_rw_ilock(ip, iolock);
- if (iocb->ki_pos > ip->i_size) {
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- if (iocb->ki_pos > ip->i_size)
- ip->i_size = iocb->ki_pos;
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ error2 = -xfs_file_fsync(file,
+ (file->f_flags & __O_SYNC) ? 0 : 1);
+ if (error)
+ ret = error;
+ else if (error2)
+ ret = error2;
}
- error = -ret;
- if (ret <= 0)
- goto out_unlock_internal;
+out_unlock:
+ xfs_aio_write_newsize_update(ip);
+ xfs_rw_iunlock(ip, iolock);
+ return ret;
+}
- XFS_STATS_ADD(xs_write_bytes, ret);
+STATIC long
+xfs_file_fallocate(
+ struct file *file,
+ int mode,
+ loff_t offset,
+ loff_t len)
+{
+ struct inode *inode = file->f_path.dentry->d_inode;
+ long error;
+ loff_t new_size = 0;
+ xfs_flock64_t bf;
+ xfs_inode_t *ip = XFS_I(inode);
+ int cmd = XFS_IOC_RESVSP;
- /* Handle various SYNC-type writes */
- if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
- loff_t end = pos + ret - 1;
- int error2;
+ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+ return -EOPNOTSUPP;
- xfs_iunlock(ip, iolock);
- if (need_i_mutex)
- mutex_unlock(&inode->i_mutex);
+ bf.l_whence = 0;
+ bf.l_start = offset;
+ bf.l_len = len;
- error2 = filemap_write_and_wait_range(mapping, pos, end);
- if (!error)
- error = error2;
- if (need_i_mutex)
- mutex_lock(&inode->i_mutex);
- xfs_ilock(ip, iolock);
+ xfs_ilock(ip, XFS_IOLOCK_EXCL);
- error2 = -xfs_file_fsync(file,
- (file->f_flags & __O_SYNC) ? 0 : 1);
- if (!error)
- error = error2;
+ if (mode & FALLOC_FL_PUNCH_HOLE)
+ cmd = XFS_IOC_UNRESVSP;
+
+ /* check the new inode size is valid before allocating */
+ if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+ offset + len > i_size_read(inode)) {
+ new_size = offset + len;
+ error = inode_newsize_ok(inode, new_size);
+ if (error)
+ goto out_unlock;
}
- out_unlock_internal:
- if (ip->i_new_size) {
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- ip->i_new_size = 0;
- /*
- * If this was a direct or synchronous I/O that failed (such
- * as ENOSPC) then part of the I/O may have been written to
- * disk before the error occured. In this case the on-disk
- * file size may have been adjusted beyond the in-memory file
- * size and now needs to be truncated back.
- */
- if (ip->i_d.di_size > ip->i_size)
- ip->i_d.di_size = ip->i_size;
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ error = -xfs_change_file_space(ip, cmd, &bf, 0, XFS_ATTR_NOLOCK);
+ if (error)
+ goto out_unlock;
+
+ /* Change file size if needed */
+ if (new_size) {
+ struct iattr iattr;
+
+ iattr.ia_valid = ATTR_SIZE;
+ iattr.ia_size = new_size;
+ error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK);
}
- xfs_iunlock(ip, iolock);
- out_unlock_mutex:
- if (need_i_mutex)
- mutex_unlock(&inode->i_mutex);
- return -error;
+
+out_unlock:
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ return error;
}
+
STATIC int
xfs_file_open(
struct inode *inode,
@@ -921,6 +1055,7 @@ const struct file_operations xfs_file_operations = {
.open = xfs_file_open,
.release = xfs_file_release,
.fsync = xfs_file_fsync,
+ .fallocate = xfs_file_fallocate,
};
const struct file_operations xfs_dir_file_operations = {
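
With .fallocate now a file operation, the preallocation path is driven by the plain fallocate(2) syscall, and the new FALLOC_FL_PUNCH_HOLE mode maps to XFS_IOC_UNRESVSP above. A hedged userspace example; the file name is hypothetical, and hole punching needs a kernel and filesystem that accept the flag:

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>
    #include <linux/falloc.h>   /* FALLOC_FL_KEEP_SIZE, FALLOC_FL_PUNCH_HOLE */

    int main(void)
    {
        int fd = open("testfile", O_RDWR | O_CREAT, 0644); /* hypothetical */
        if (fd < 0) { perror("open"); return 1; }

        /* Preallocate 1 MiB without moving i_size ... */
        if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1 << 20) < 0)
            perror("fallocate");

        /* ... then punch a 64 KiB hole; hole punching keeps i_size too */
        if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                      0, 64 << 10) < 0)
            perror("punch hole");

        close(fd);
        return 0;
    }
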
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index ad442d9e392e..b06ede1d0bed 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -39,6 +39,7 @@
#include "xfs_dfrag.h"
#include "xfs_fsops.h"
#include "xfs_vnodeops.h"
+#include "xfs_discard.h"
#include "xfs_quota.h"
#include "xfs_inode_item.h"
#include "xfs_export.h"
@@ -1294,6 +1295,8 @@ xfs_file_ioctl(
trace_xfs_file_ioctl(ip);
switch (cmd) {
+ case FITRIM:
+ return xfs_ioc_trim(mp, arg);
case XFS_IOC_ALLOCSP:
case XFS_IOC_FREESP:
case XFS_IOC_RESVSP:
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 94d5fd6a2973..bd5727852fd6 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -46,7 +46,6 @@
#include <linux/namei.h>
#include <linux/posix_acl.h>
#include <linux/security.h>
-#include <linux/falloc.h>
#include <linux/fiemap.h>
#include <linux/slab.h>
@@ -505,58 +504,6 @@ xfs_vn_setattr(
return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0);
}
-STATIC long
-xfs_vn_fallocate(
- struct inode *inode,
- int mode,
- loff_t offset,
- loff_t len)
-{
- long error;
- loff_t new_size = 0;
- xfs_flock64_t bf;
- xfs_inode_t *ip = XFS_I(inode);
-
- /* preallocation on directories not yet supported */
- error = -ENODEV;
- if (S_ISDIR(inode->i_mode))
- goto out_error;
-
- bf.l_whence = 0;
- bf.l_start = offset;
- bf.l_len = len;
-
- xfs_ilock(ip, XFS_IOLOCK_EXCL);
-
- /* check the new inode size is valid before allocating */
- if (!(mode & FALLOC_FL_KEEP_SIZE) &&
- offset + len > i_size_read(inode)) {
- new_size = offset + len;
- error = inode_newsize_ok(inode, new_size);
- if (error)
- goto out_unlock;
- }
-
- error = -xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf,
- 0, XFS_ATTR_NOLOCK);
- if (error)
- goto out_unlock;
-
- /* Change file size if needed */
- if (new_size) {
- struct iattr iattr;
-
- iattr.ia_valid = ATTR_SIZE;
- iattr.ia_size = new_size;
- error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK);
- }
-
-out_unlock:
- xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-out_error:
- return error;
-}
-
#define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
/*
@@ -650,7 +597,6 @@ static const struct inode_operations xfs_inode_operations = {
.getxattr = generic_getxattr,
.removexattr = generic_removexattr,
.listxattr = xfs_vn_listxattr,
- .fallocate = xfs_vn_fallocate,
.fiemap = xfs_vn_fiemap,
};
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index c51faaa5e291..9731898083ae 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -606,7 +606,8 @@ xfs_blkdev_get(
{
int error = 0;
- *bdevp = open_bdev_exclusive(name, FMODE_READ|FMODE_WRITE, mp);
+ *bdevp = blkdev_get_by_path(name, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
+ mp);
if (IS_ERR(*bdevp)) {
error = PTR_ERR(*bdevp);
printk("XFS: Invalid device [%s], error=%d\n", name, error);
@@ -620,7 +621,7 @@ xfs_blkdev_put(
struct block_device *bdev)
{
if (bdev)
- close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE);
+ blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
}
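
The API change here folds exclusive access into the fmode: blkdev_get_by_path() takes FMODE_EXCL plus a holder cookie (the xfs_mount here), and the same mode flags must be passed back to blkdev_put(). The pairing in isolation, as a sketch (holder is whatever object identifies the claimant):

	struct block_device *bdev;

	bdev = blkdev_get_by_path(name,
				  FMODE_READ | FMODE_WRITE | FMODE_EXCL,
				  holder);	/* matched on later claims */
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);
	/* ... I/O against bdev ... */
	blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);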
/*
@@ -947,7 +948,7 @@ out_reclaim:
* Slab object creation initialisation for the XFS inode.
* This covers only the idempotent fields in the XFS inode;
* all other fields need to be initialised on allocation
- * from the slab. This avoids the need to repeatedly intialise
+ * from the slab. This avoids the need to repeatedly initialise
 * fields in the xfs inode that are left in the initialised state
* when freeing the inode.
*/
@@ -1413,7 +1414,7 @@ xfs_fs_freeze(
xfs_save_resvblks(mp);
xfs_quiesce_attr(mp);
- return -xfs_fs_log_dummy(mp, SYNC_WAIT);
+ return -xfs_fs_log_dummy(mp);
}
STATIC int
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index a02480de9759..e22f0057d21f 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -362,7 +362,7 @@ xfs_quiesce_data(
/* mark the log as covered if needed */
if (xfs_log_need_covered(mp))
- error2 = xfs_fs_log_dummy(mp, SYNC_WAIT);
+ error2 = xfs_fs_log_dummy(mp);
/* flush data-only devices */
if (mp->m_rtdev_targp)
@@ -503,13 +503,14 @@ xfs_sync_worker(
int error;
if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
- xfs_log_force(mp, 0);
- xfs_reclaim_inodes(mp, 0);
/* dgc: errors ignored here */
- error = xfs_qm_sync(mp, SYNC_TRYLOCK);
if (mp->m_super->s_frozen == SB_UNFROZEN &&
xfs_log_need_covered(mp))
- error = xfs_fs_log_dummy(mp, 0);
+ error = xfs_fs_log_dummy(mp);
+ else
+ xfs_log_force(mp, 0);
+ xfs_reclaim_inodes(mp, 0);
+ error = xfs_qm_sync(mp, SYNC_TRYLOCK);
}
mp->m_sync_seq++;
wake_up(&mp->m_wait_single_sync_task);
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c
index 7bb5092d6ae4..ee3cee097e7e 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.c
+++ b/fs/xfs/linux-2.6/xfs_sysctl.c
@@ -18,6 +18,7 @@
#include "xfs.h"
#include <linux/sysctl.h>
#include <linux/proc_fs.h>
+#include "xfs_error.h"
static struct ctl_table_header *xfs_table_header;
@@ -51,6 +52,26 @@ xfs_stats_clear_proc_handler(
return ret;
}
+
+STATIC int
+xfs_panic_mask_proc_handler(
+ ctl_table *ctl,
+ int write,
+ void __user *buffer,
+ size_t *lenp,
+ loff_t *ppos)
+{
+ int ret, *valp = ctl->data;
+
+ ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
+ if (!ret && write) {
+ xfs_panic_mask = *valp;
+#ifdef DEBUG
+ xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES);
+#endif
+ }
+ return ret;
+}
#endif /* CONFIG_PROC_FS */
static ctl_table xfs_table[] = {
@@ -77,7 +98,7 @@ static ctl_table xfs_table[] = {
.data = &xfs_params.panic_mask.val,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
+ .proc_handler = xfs_panic_mask_proc_handler,
.extra1 = &xfs_params.panic_mask.min,
.extra2 = &xfs_params.panic_mask.max
},
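
The handler swap above is an instance of a common sysctl pattern: delegate parsing and range checking to proc_dointvec_minmax(), then apply side effects only after a successful write. The generic shape, with a hypothetical foo_apply() side effect (illustrative, not XFS code):

static int
foo_proc_handler(
	ctl_table	*ctl,
	int		write,
	void __user	*buffer,
	size_t		*lenp,
	loff_t		*ppos)
{
	int		ret, *valp = ctl->data;

	/* proc_dointvec_minmax updates *valp, enforcing extra1/extra2 */
	ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
	if (!ret && write)
		foo_apply(*valp);	/* hypothetical: react to new value */
	return ret;
}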
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index 647af2a2e7aa..2d0bcb479075 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -1759,6 +1759,39 @@ DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_recover);
DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel);
DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip);
+DECLARE_EVENT_CLASS(xfs_discard_class,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agblock_t agbno, xfs_extlen_t len),
+ TP_ARGS(mp, agno, agbno, len),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, agbno)
+ __field(xfs_extlen_t, len)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->agbno = agbno;
+ __entry->len = len;
+ ),
+ TP_printk("dev %d:%d agno %u agbno %u len %u\n",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agbno,
+ __entry->len)
+)
+
+#define DEFINE_DISCARD_EVENT(name) \
+DEFINE_EVENT(xfs_discard_class, name, \
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+ xfs_agblock_t agbno, xfs_extlen_t len), \
+ TP_ARGS(mp, agno, agbno, len))
+DEFINE_DISCARD_EVENT(xfs_discard_extent);
+DEFINE_DISCARD_EVENT(xfs_discard_toosmall);
+DEFINE_DISCARD_EVENT(xfs_discard_exclude);
+DEFINE_DISCARD_EVENT(xfs_discard_busy);
+
#endif /* _TRACE_XFS_H */
#undef TRACE_INCLUDE_PATH
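
Each DEFINE_DISCARD_EVENT() above generates a trace_<name>() stub with the class prototype, so call sites in the discard path take the shape below (the names fbno/flen/minlen and the labels are illustrative, not quoted from xfs_discard.c):

	if (flen < minlen) {
		trace_xfs_discard_toosmall(mp, agno, fbno, flen);
		goto next_extent;
	}
	if (xfs_alloc_busy_search(mp, agno, fbno, flen)) {
		trace_xfs_discard_busy(mp, agno, fbno, flen);
		goto next_extent;
	}
	trace_xfs_discard_extent(mp, agno, fbno, flen);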
diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c
index 975aa10e1a47..0df88897ef84 100644
--- a/fs/xfs/support/debug.c
+++ b/fs/xfs/support/debug.c
@@ -25,86 +25,78 @@
#include "xfs_mount.h"
#include "xfs_error.h"
-static char message[1024]; /* keep it off the stack */
-static DEFINE_SPINLOCK(xfs_err_lock);
-
-/* Translate from CE_FOO to KERN_FOO, err_level(CE_FOO) == KERN_FOO */
-#define XFS_MAX_ERR_LEVEL 7
-#define XFS_ERR_MASK ((1 << 3) - 1)
-static const char * const err_level[XFS_MAX_ERR_LEVEL+1] =
- {KERN_EMERG, KERN_ALERT, KERN_CRIT,
- KERN_ERR, KERN_WARNING, KERN_NOTICE,
- KERN_INFO, KERN_DEBUG};
-
void
-cmn_err(register int level, char *fmt, ...)
+cmn_err(
+ const char *lvl,
+ const char *fmt,
+ ...)
{
- char *fp = fmt;
- int len;
- ulong flags;
- va_list ap;
-
- level &= XFS_ERR_MASK;
- if (level > XFS_MAX_ERR_LEVEL)
- level = XFS_MAX_ERR_LEVEL;
- spin_lock_irqsave(&xfs_err_lock,flags);
- va_start(ap, fmt);
- if (*fmt == '!') fp++;
- len = vsnprintf(message, sizeof(message), fp, ap);
- if (len >= sizeof(message))
- len = sizeof(message) - 1;
- if (message[len-1] == '\n')
- message[len-1] = 0;
- printk("%s%s\n", err_level[level], message);
- va_end(ap);
- spin_unlock_irqrestore(&xfs_err_lock,flags);
- BUG_ON(level == CE_PANIC);
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ printk("%s%pV", lvl, &vaf);
+ va_end(args);
+
+ BUG_ON(strncmp(lvl, KERN_EMERG, strlen(KERN_EMERG)) == 0);
}
void
-xfs_fs_vcmn_err(
- int level,
+xfs_fs_cmn_err(
+ const char *lvl,
struct xfs_mount *mp,
- char *fmt,
- va_list ap)
+ const char *fmt,
+ ...)
{
- unsigned long flags;
- int len = 0;
+ struct va_format vaf;
+ va_list args;
- level &= XFS_ERR_MASK;
- if (level > XFS_MAX_ERR_LEVEL)
- level = XFS_MAX_ERR_LEVEL;
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
- spin_lock_irqsave(&xfs_err_lock,flags);
+ printk("%sFilesystem %s: %pV", lvl, mp->m_fsname, &vaf);
+ va_end(args);
- if (mp) {
- len = sprintf(message, "Filesystem \"%s\": ", mp->m_fsname);
+ BUG_ON(strncmp(lvl, KERN_EMERG, strlen(KERN_EMERG)) == 0);
+}
+
+/* All callers to xfs_cmn_err use CE_ALERT, so don't bother testing lvl */
+void
+xfs_cmn_err(
+ int panic_tag,
+ const char *lvl,
+ struct xfs_mount *mp,
+ const char *fmt,
+ ...)
+{
+ struct va_format vaf;
+ va_list args;
+ int do_panic = 0;
- /*
- * Skip the printk if we can't print anything useful
- * due to an over-long device name.
- */
- if (len >= sizeof(message))
- goto out;
+ if (xfs_panic_mask && (xfs_panic_mask & panic_tag)) {
+ printk(KERN_ALERT "XFS: Transforming an alert into a BUG.");
+ do_panic = 1;
}
- len = vsnprintf(message + len, sizeof(message) - len, fmt, ap);
- if (len >= sizeof(message))
- len = sizeof(message) - 1;
- if (message[len-1] == '\n')
- message[len-1] = 0;
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
- printk("%s%s\n", err_level[level], message);
- out:
- spin_unlock_irqrestore(&xfs_err_lock,flags);
+ printk(KERN_ALERT "Filesystem %s: %pV", mp->m_fsname, &vaf);
+ va_end(args);
- BUG_ON(level == CE_PANIC);
+ BUG_ON(do_panic);
}
void
assfail(char *expr, char *file, int line)
{
- printk("Assertion failed: %s, file: %s, line: %d\n", expr, file, line);
+ printk(KERN_CRIT "Assertion failed: %s, file: %s, line: %d\n", expr,
+ file, line);
BUG();
}
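
The rewritten helpers all rest on the kernel's %pV printk extension: wrap the caller's format string and va_list in a struct va_format and hand its address to printk, which formats it recursively. The idiom in isolation (illustrative wrapper name):

#include <linux/kernel.h>	/* struct va_format, printk */

void my_warn(const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;
	printk(KERN_WARNING "mymod: %pV\n", &vaf);
	va_end(args);
}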
diff --git a/fs/xfs/support/debug.h b/fs/xfs/support/debug.h
index d2d20462fd4f..05699f67d475 100644
--- a/fs/xfs/support/debug.h
+++ b/fs/xfs/support/debug.h
@@ -20,15 +20,22 @@
#include <stdarg.h>
-#define CE_DEBUG 7 /* debug */
-#define CE_CONT 6 /* continuation */
-#define CE_NOTE 5 /* notice */
-#define CE_WARN 4 /* warning */
-#define CE_ALERT 1 /* alert */
-#define CE_PANIC 0 /* panic */
-
-extern void cmn_err(int, char *, ...)
- __attribute__ ((format (printf, 2, 3)));
+struct xfs_mount;
+
+#define CE_DEBUG KERN_DEBUG
+#define CE_CONT KERN_INFO
+#define CE_NOTE KERN_NOTICE
+#define CE_WARN KERN_WARNING
+#define CE_ALERT KERN_ALERT
+#define CE_PANIC KERN_EMERG
+
+void cmn_err(const char *lvl, const char *fmt, ...)
+ __attribute__ ((format (printf, 2, 3)));
+void xfs_fs_cmn_err(const char *lvl, struct xfs_mount *mp,
+ const char *fmt, ...) __attribute__ ((format (printf, 3, 4)));
+void xfs_cmn_err(int panic_tag, const char *lvl, struct xfs_mount *mp,
+ const char *fmt, ...) __attribute__ ((format (printf, 4, 5)));
+
extern void assfail(char *expr, char *f, int l);
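
Since the CE_* constants are now literal KERN_* prefix strings, every existing caller compiles unchanged; for example (illustrative call):

	cmn_err(CE_WARN, "XFS: unknown mount option [%s]", value);

now reaches printk() with a KERN_WARNING prefix rather than going through the old level-to-string table.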
#define ASSERT_ALWAYS(expr) \
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index fa8723f5870a..f3227984a9bf 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -41,10 +41,6 @@
#define XFSA_FIXUP_BNO_OK 1
#define XFSA_FIXUP_CNT_OK 2
-static int
-xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
- xfs_agblock_t bno, xfs_extlen_t len);
-
/*
* Prototypes for per-ag allocation routines
*/
@@ -94,7 +90,7 @@ xfs_alloc_lookup_ge(
* Lookup the first record less than or equal to [bno, len]
* in the btree given by cur.
*/
-STATIC int /* error */
+int /* error */
xfs_alloc_lookup_le(
struct xfs_btree_cur *cur, /* btree cursor */
xfs_agblock_t bno, /* starting block of extent */
@@ -127,7 +123,7 @@ xfs_alloc_update(
/*
* Get the data from the pointed-to record.
*/
-STATIC int /* error */
+int /* error */
xfs_alloc_get_rec(
struct xfs_btree_cur *cur, /* btree cursor */
xfs_agblock_t *bno, /* output: starting block of extent */
@@ -2615,7 +2611,7 @@ restart:
* will require a synchronous transaction, but it can still be
* used to distinguish between a partial or exact match.
*/
-static int
+int
xfs_alloc_busy_search(
struct xfs_mount *mp,
xfs_agnumber_t agno,
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 895009a97271..0ab56b32c7eb 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -19,6 +19,7 @@
#define __XFS_ALLOC_H__
struct xfs_buf;
+struct xfs_btree_cur;
struct xfs_mount;
struct xfs_perag;
struct xfs_trans;
@@ -118,16 +119,16 @@ xfs_alloc_longest_free_extent(struct xfs_mount *mp,
struct xfs_perag *pag);
#ifdef __KERNEL__
-
void
-xfs_alloc_busy_insert(xfs_trans_t *tp,
- xfs_agnumber_t agno,
- xfs_agblock_t bno,
- xfs_extlen_t len);
+xfs_alloc_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno,
+ xfs_agblock_t bno, xfs_extlen_t len);
void
xfs_alloc_busy_clear(struct xfs_mount *mp, struct xfs_busy_extent *busyp);
+int
+xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agblock_t bno, xfs_extlen_t len);
#endif /* __KERNEL__ */
/*
@@ -205,4 +206,18 @@ xfs_free_extent(
xfs_fsblock_t bno, /* starting block number of extent */
xfs_extlen_t len); /* length of extent */
+int /* error */
+xfs_alloc_lookup_le(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_agblock_t bno, /* starting block of extent */
+ xfs_extlen_t len, /* length of extent */
+ int *stat); /* success/failure */
+
+int /* error */
+xfs_alloc_get_rec(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_agblock_t *bno, /* output: starting block of extent */
+ xfs_extlen_t *len, /* output: length of extent */
+ int *stat); /* output: success/failure */
+
#endif /* __XFS_ALLOC_H__ */
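
xfs_alloc_lookup_le() and xfs_alloc_get_rec() are made public so the new discard code can walk free-space btree records directly, skipping extents that xfs_alloc_busy_search() reports as busy. Roughly the loop shape this enables (illustrative sketch, not the actual xfs_discard.c code; cur is assumed to have been set up on the by-size btree via xfs_allocbt_init_cursor()):

	int			error, i;

	error = xfs_alloc_lookup_le(cur, 0, pag->pagf_longest, &i);
	while (!error && i) {
		xfs_agblock_t	fbno;
		xfs_extlen_t	flen;

		error = xfs_alloc_get_rec(cur, &fbno, &flen, &i);
		if (error || !i)
			break;
		if (!xfs_alloc_busy_search(mp, agno, fbno, flen)) {
			/* issue a discard for [fbno, fbno + flen) */
		}
		error = xfs_btree_decrement(cur, 0, &i);
	}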
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index ed2b65f3f8b9..98c6f73b6752 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -141,7 +141,6 @@ xfs_buf_item_log_check(
#define xfs_buf_item_log_check(x)
#endif
-STATIC void xfs_buf_error_relse(xfs_buf_t *bp);
STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp);
/*
@@ -959,128 +958,76 @@ xfs_buf_do_callbacks(
*/
void
xfs_buf_iodone_callbacks(
- xfs_buf_t *bp)
+ struct xfs_buf *bp)
{
- xfs_log_item_t *lip;
- static ulong lasttime;
- static xfs_buftarg_t *lasttarg;
- xfs_mount_t *mp;
+ struct xfs_log_item *lip = bp->b_fspriv;
+ struct xfs_mount *mp = lip->li_mountp;
+ static ulong lasttime;
+ static xfs_buftarg_t *lasttarg;
- ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
- lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
+ if (likely(!XFS_BUF_GETERROR(bp)))
+ goto do_callbacks;
- if (XFS_BUF_GETERROR(bp) != 0) {
- /*
- * If we've already decided to shutdown the filesystem
- * because of IO errors, there's no point in giving this
- * a retry.
- */
- mp = lip->li_mountp;
- if (XFS_FORCED_SHUTDOWN(mp)) {
- ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp);
- XFS_BUF_SUPER_STALE(bp);
- trace_xfs_buf_item_iodone(bp, _RET_IP_);
- xfs_buf_do_callbacks(bp);
- XFS_BUF_SET_FSPRIVATE(bp, NULL);
- XFS_BUF_CLR_IODONE_FUNC(bp);
- xfs_buf_ioend(bp, 0);
- return;
- }
+ /*
+ * If we've already decided to shut down the filesystem because of
+ * I/O errors, there's no point in giving this a retry.
+ */
+ if (XFS_FORCED_SHUTDOWN(mp)) {
+ XFS_BUF_SUPER_STALE(bp);
+ trace_xfs_buf_item_iodone(bp, _RET_IP_);
+ goto do_callbacks;
+ }
- if ((XFS_BUF_TARGET(bp) != lasttarg) ||
- (time_after(jiffies, (lasttime + 5*HZ)))) {
- lasttime = jiffies;
- cmn_err(CE_ALERT, "Device %s, XFS metadata write error"
- " block 0x%llx in %s",
- XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)),
- (__uint64_t)XFS_BUF_ADDR(bp), mp->m_fsname);
- }
- lasttarg = XFS_BUF_TARGET(bp);
+ if (XFS_BUF_TARGET(bp) != lasttarg ||
+ time_after(jiffies, (lasttime + 5*HZ))) {
+ lasttime = jiffies;
+ cmn_err(CE_ALERT, "Device %s, XFS metadata write error"
+ " block 0x%llx in %s",
+ XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)),
+ (__uint64_t)XFS_BUF_ADDR(bp), mp->m_fsname);
+ }
+ lasttarg = XFS_BUF_TARGET(bp);
- if (XFS_BUF_ISASYNC(bp)) {
- /*
- * If the write was asynchronous then noone will be
- * looking for the error. Clear the error state
- * and write the buffer out again delayed write.
- *
- * XXXsup This is OK, so long as we catch these
- * before we start the umount; we don't want these
- * DELWRI metadata bufs to be hanging around.
- */
- XFS_BUF_ERROR(bp,0); /* errno of 0 unsets the flag */
-
- if (!(XFS_BUF_ISSTALE(bp))) {
- XFS_BUF_DELAYWRITE(bp);
- XFS_BUF_DONE(bp);
- XFS_BUF_SET_START(bp);
- }
- ASSERT(XFS_BUF_IODONE_FUNC(bp));
- trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
- xfs_buf_relse(bp);
- } else {
- /*
- * If the write of the buffer was not asynchronous,
- * then we want to make sure to return the error
- * to the caller of bwrite(). Because of this we
- * cannot clear the B_ERROR state at this point.
- * Instead we install a callback function that
- * will be called when the buffer is released, and
- * that routine will clear the error state and
- * set the buffer to be written out again after
- * some delay.
- */
- /* We actually overwrite the existing b-relse
- function at times, but we're gonna be shutting down
- anyway. */
- XFS_BUF_SET_BRELSE_FUNC(bp,xfs_buf_error_relse);
+ /*
+ * If the write was asynchronous then no one will be looking for the
+ * error. Clear the error state and write the buffer out again.
+ *
+ * During sync or umount we'll write all pending buffers again
+ * synchronously, which will catch these errors if they keep hanging
+ * around.
+ */
+ if (XFS_BUF_ISASYNC(bp)) {
+ XFS_BUF_ERROR(bp, 0); /* errno of 0 unsets the flag */
+
+ if (!XFS_BUF_ISSTALE(bp)) {
+ XFS_BUF_DELAYWRITE(bp);
XFS_BUF_DONE(bp);
- XFS_BUF_FINISH_IOWAIT(bp);
+ XFS_BUF_SET_START(bp);
}
+ ASSERT(XFS_BUF_IODONE_FUNC(bp));
+ trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
+ xfs_buf_relse(bp);
return;
}
- xfs_buf_do_callbacks(bp);
- XFS_BUF_SET_FSPRIVATE(bp, NULL);
- XFS_BUF_CLR_IODONE_FUNC(bp);
- xfs_buf_ioend(bp, 0);
-}
-
-/*
- * This is a callback routine attached to a buffer which gets an error
- * when being written out synchronously.
- */
-STATIC void
-xfs_buf_error_relse(
- xfs_buf_t *bp)
-{
- xfs_log_item_t *lip;
- xfs_mount_t *mp;
-
- lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
- mp = (xfs_mount_t *)lip->li_mountp;
- ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp);
-
+ /*
+ * If the write of the buffer was synchronous, we want to make
+ * sure to return the error to the caller of xfs_bwrite().
+ */
XFS_BUF_STALE(bp);
XFS_BUF_DONE(bp);
XFS_BUF_UNDELAYWRITE(bp);
- XFS_BUF_ERROR(bp,0);
trace_xfs_buf_error_relse(bp, _RET_IP_);
+ xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
- if (! XFS_FORCED_SHUTDOWN(mp))
- xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
- /*
- * We have to unpin the pinned buffers so do the
- * callbacks.
- */
+do_callbacks:
xfs_buf_do_callbacks(bp);
XFS_BUF_SET_FSPRIVATE(bp, NULL);
XFS_BUF_CLR_IODONE_FUNC(bp);
- XFS_BUF_SET_BRELSE_FUNC(bp,NULL);
- xfs_buf_relse(bp);
+ xfs_buf_ioend(bp, 0);
}
-
/*
* This is the iodone() function for buffers which have been
* logged. It is called when they are eventually flushed out.
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index c78cc6a3d87c..4c7db74a05f7 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -152,37 +152,6 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud)
}
#endif /* DEBUG */
-
-void
-xfs_fs_cmn_err(int level, xfs_mount_t *mp, char *fmt, ...)
-{
- va_list ap;
-
- va_start(ap, fmt);
- xfs_fs_vcmn_err(level, mp, fmt, ap);
- va_end(ap);
-}
-
-void
-xfs_cmn_err(int panic_tag, int level, xfs_mount_t *mp, char *fmt, ...)
-{
- va_list ap;
-
-#ifdef DEBUG
- xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES);
-#endif
-
- if (xfs_panic_mask && (xfs_panic_mask & panic_tag)
- && (level & CE_ALERT)) {
- level &= ~CE_ALERT;
- level |= CE_PANIC;
- cmn_err(CE_ALERT, "XFS: Transforming an alert into a BUG.");
- }
- va_start(ap, fmt);
- xfs_fs_vcmn_err(level, mp, fmt, ap);
- va_end(ap);
-}
-
void
xfs_error_report(
const char *tag,
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index f338847f80b8..10dce5475f02 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -136,8 +136,8 @@ extern int xfs_error_test(int, int *, char *, int, char *, unsigned long);
xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \
(rf))))
-extern int xfs_errortag_add(int error_tag, xfs_mount_t *mp);
-extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud);
+extern int xfs_errortag_add(int error_tag, struct xfs_mount *mp);
+extern int xfs_errortag_clearall(struct xfs_mount *mp, int loud);
#else
#define XFS_TEST_ERROR(expr, mp, tag, rf) (expr)
#define xfs_errortag_add(tag, mp) (ENOSYS)
@@ -162,21 +162,15 @@ extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud);
struct xfs_mount;
-extern void xfs_fs_vcmn_err(int level, struct xfs_mount *mp,
- char *fmt, va_list ap)
- __attribute__ ((format (printf, 3, 0)));
-extern void xfs_cmn_err(int panic_tag, int level, struct xfs_mount *mp,
- char *fmt, ...)
- __attribute__ ((format (printf, 4, 5)));
-extern void xfs_fs_cmn_err(int level, struct xfs_mount *mp, char *fmt, ...)
- __attribute__ ((format (printf, 3, 4)));
-
extern void xfs_hex_dump(void *p, int length);
#define xfs_fs_repair_cmn_err(level, mp, fmt, args...) \
xfs_fs_cmn_err(level, mp, fmt " Unmount and run xfs_repair.", ## args)
#define xfs_fs_mount_cmn_err(f, fmt, args...) \
- ((f & XFS_MFSI_QUIET)? (void)0 : cmn_err(CE_WARN, "XFS: " fmt, ## args))
+ do { \
+ if (!(f & XFS_MFSI_QUIET)) \
+ cmn_err(CE_WARN, "XFS: " fmt, ## args); \
+ } while (0)
#endif /* __XFS_ERROR_H__ */
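
Turning the old ternary-expression macro into a statement macro requires the do { } while (0) wrapper: a bare if inside a macro body would capture a caller's else (the dangling-else problem), whereas the wrapper makes the expansion exactly one statement. For example (caller code is hypothetical):

	/* Safe with the do/while form; with a bare internal if, the
	 * else below would bind to the macro's if instead. */
	if (mount_failed)
		xfs_fs_mount_cmn_err(flags, "mount failed");
	else
		continue_mount();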
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index f56d30e8040c..cec89dd5d7d2 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -612,12 +612,13 @@ out:
*
* We cannot use an inode here for this - that will push dirty state back up
* into the VFS and then periodic inode flushing will prevent log covering from
- * making progress. Hence we log a field in the superblock instead.
+ * making progress. Hence we log a field in the superblock instead and use a
+ * synchronous transaction to ensure the superblock is immediately unpinned
+ * and can be written back.
*/
int
xfs_fs_log_dummy(
- xfs_mount_t *mp,
- int flags)
+ xfs_mount_t *mp)
{
xfs_trans_t *tp;
int error;
@@ -632,8 +633,7 @@ xfs_fs_log_dummy(
/* log the UUID because it is an unchanging field */
xfs_mod_sb(tp, XFS_SB_UUID);
- if (flags & SYNC_WAIT)
- xfs_trans_set_sync(tp);
+ xfs_trans_set_sync(tp);
return xfs_trans_commit(tp, 0);
}
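
The elided body follows the standard synchronous-transaction shape; xfs_trans_set_sync() makes xfs_trans_commit() wait for the log write, which is what unpins the superblock promptly. A minimal sketch of the pattern (the transaction type and reservation size here are assumptions for illustration, not quoted from the hunk):

	tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1, KM_SLEEP);
	error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
				  XFS_DEFAULT_LOG_COUNT);
	if (error) {
		xfs_trans_cancel(tp, 0);
		return error;
	}
	xfs_mod_sb(tp, XFS_SB_UUID);	/* log an unchanging field */
	xfs_trans_set_sync(tp);		/* commit waits for the log I/O */
	return xfs_trans_commit(tp, 0);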
diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h
index a786c5212c1e..1b6a98b66886 100644
--- a/fs/xfs/xfs_fsops.h
+++ b/fs/xfs/xfs_fsops.h
@@ -25,6 +25,6 @@ extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt);
extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval,
xfs_fsop_resblks_t *outval);
extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags);
-extern int xfs_fs_log_dummy(xfs_mount_t *mp, int flags);
+extern int xfs_fs_log_dummy(struct xfs_mount *mp);
#endif /* __XFS_FSOPS_H__ */
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 0bf24b11d0c4..ae6fef1ff563 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -377,7 +377,7 @@ xfs_log_mount(
cmn_err(CE_NOTE, "XFS mounting filesystem %s", mp->m_fsname);
else {
cmn_err(CE_NOTE,
- "!Mounting filesystem \"%s\" in no-recovery mode. Filesystem will be inconsistent.",
+ "Mounting filesystem \"%s\" in no-recovery mode. Filesystem will be inconsistent.",
mp->m_fsname);
ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
}
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 204d8e5fa7fa..aa0ebb776903 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -3800,7 +3800,7 @@ xlog_recover_finish(
log->l_flags &= ~XLOG_RECOVERY_NEEDED;
} else {
cmn_err(CE_DEBUG,
- "!Ending clean XFS mount for filesystem: %s\n",
+ "Ending clean XFS mount for filesystem: %s\n",
log->l_mp->m_fsname);
}
return 0;
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index f80a067a4658..33dbc4e0ad62 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -1137,7 +1137,7 @@ out_undo_fdblocks:
if (blkdelta)
xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, -blkdelta, rsvd);
out:
- ASSERT(error = 0);
+ ASSERT(error == 0);
return;
}